In [42]:
import json
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession, HTMLSession
from concurrent.futures import ThreadPoolExecutor, as_completed

In [43]:
with open('urls.json', 'r') as f:
    urls = json.load(f)

In [44]:
urls['urls'][0]

'https://www.leopalace21.com/en/search/chintai/area'

In [45]:
class scrapper():
    def __init__(self, url):
        self.domain = 'https://www.leopalace21.com'
        self.url = url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Language": "en-US,en;q=0.9"
        }
        self.session = HTMLSession()
        self.html = self.session.get(self.url, headers=self.headers) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
    
    def change_url(self, new_url): # change url
        self.url = new_url
        self.html = self.session.get(self.url, headers=self.headers) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
        
    def print_html(self): # print formatted html
        print(self.soup.prettify())
        
    def print_data(self): # print non html
        for x in self.soup:
            print(x)

In [46]:
def target_element(soup, tag, class_name): # first matching element
        if class_name is None:
            return soup.find(tag)
        return soup.find(tag, class_=class_name)
        
def target_element_all(soup, tag, class_name): # all matching elements
    if class_name is None:
        return soup.find_all(tag)
    elif class_name == 'href':  # special case for links
        return soup.find_all(tag, href=True)
    return soup.find_all(tag, class_=class_name)

def target_element_css(soup, css_selector): # css selector
    return soup.select_one(css_selector)
    
def target_element_text(soup): # search by text
    return [data.get_text() for data in soup]

def fetch_property_link(base_url, page_num, domain, headers):
    url = f"{base_url}{page_num}"
    try:
        r = scrapper(url)
        soup = target_element_all(r.soup, 'a', 'ApartmentItemDetails_apartment-item-details__pn2sc ApartmentItemDetails_-responsive__ZRuEF')
        return [domain + link['href'] for link in soup]
    except Exception as e:
        return []

In [47]:
crawler = scrapper(urls['urls'][0])
# crawler.print_html()

Getting Prefecture Names

In [48]:
crawler.soup = target_element_css(crawler.soup, 'div.FooterMiscInternalLinkContainer_content__M1tIW div.FooterMiscInternalLinkTextLink_text-link-container__2u2zd')
# crawler.print_html()

In [49]:
crawler.soup = target_element_all(crawler.soup, 'a', 'TextLink_text-link__Z6GQ4 TextLink_-white__uwMVe')

In [50]:
prefectures = target_element_text(crawler.soup)
# prefectures[:len(prefectures)//2]

In [51]:
prefecture_url = []
for i, prefecture in enumerate(prefectures):
    prefecture_url.append(urls['urls'][1].replace("PREFECTURE", prefecture.lower())) # attaching prefecture to base url
    # print(prefecture_url[i])


Getting Pages

In [52]:
properties_links_pages = []

with tqdm(prefecture_url) as pbar:
    for prefecture in pbar:
        crawler.change_url(prefecture) # change url to url with prefecture
        page_number = target_element_all(crawler.soup, 'a', 'Pager_link__rnYFP')[:-1] # excluding the last element which is "next page"
        page_number =target_element_text(page_number)[len(page_number)-1:]
        
        pbar.set_description(f"Processing {prefecture} with {page_number} pages")
        
        result = {"link": prefecture, "page_number": page_number[0]}
        properties_links_pages.append(result)

Processing https://www.leopalace21.com/en/properties/chintai/area/okinawa?page= with ['12'] pages: 100%|██████████| 47/47 [00:42<00:00,  1.11it/s]   


In [53]:
properties_links_pages[:len(properties_links_pages)//5]

[{'link': 'https://www.leopalace21.com/en/properties/chintai/area/hokkaido?page=',
  'page_number': '126'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/aomori?page=',
  'page_number': '47'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/iwate?page=',
  'page_number': '39'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/miyagi?page=',
  'page_number': '126'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/akita?page=',
  'page_number': '22'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/yamagata?page=',
  'page_number': '53'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/fukushima?page=',
  'page_number': '113'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/tokyo?page=',
  'page_number': '153'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/kanagawa?page=',
  'page_number': '250'}]

Getting Property Links

In [54]:
property_links = set()

for link in tqdm(properties_links_pages, desc="Processing prefectures"):
    total_pages = int(link['page_number'])
    page_range = range(1, total_pages + 1)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_property_link, link['link'], page, crawler.domain, crawler.headers) for page in page_range]
        for future in tqdm(as_completed(futures), total=total_pages, desc="Pages", leave=False):
            property_links.update(future.result())
    

Processing prefectures: 100%|██████████| 47/47 [09:35<00:00, 12.25s/it]


In [55]:
property_links

{'https://www.leopalace21.com/en/properties/common/nara/kashihara-shi-29205/masuga-54649',
 'https://www.leopalace21.com/en/properties/common/gunma/maebashi-shi-10201/ease-1-23119',
 'https://www.leopalace21.com/en/properties/common/hyogo/kobe-shi-chuo-ku-28110/shin-kobe-23282',
 'https://www.leopalace21.com/en/properties/common/kanagawa/yokohama-shi-seya-ku-14114/seya-dai1-16002',
 'https://www.leopalace21.com/en/properties/common/hyogo/nishinomiya-shi-28204/tsutsumi-mn-1-39219',
 'https://www.leopalace21.com/en/properties/common/shizuoka/shizuoka-shi-suruga-ku-22102/route-1-miki-25411',
 'https://www.leopalace21.com/en/properties/common/saitama/haniyu-shi-11216/gricine-54648',
 'https://www.leopalace21.com/en/properties/common/hiroshima/miyoshi-shi-34209/nishi-miyoshi-2-57899',
 'https://www.leopalace21.com/en/properties/common/shizuoka/mishima-shi-22206/wakaba-15822',
 'https://www.leopalace21.com/en/properties/common/kanagawa/yokohama-shi-tsurumi-ku-14101/nakayama-2-33672',
 'https

In [56]:
len(property_links)

25938

In [57]:
property_links = list(set(property_links)) # remove duplicates
len(property_links)

25938

In [None]:
# propertys_links = []
# for link in tqdm(properties_links_pages, desc="Processing pages"):
    
#     for page in tqdm(range(int(link['page_number']))):
        
#         crawler.change_url(link['link'] + str(page + 1)) # change url to url with page number
#         crawler.soup = target_element_all(crawler.soup, 'a', 'ApartmentItemDetails_apartment-item-details__pn2sc ApartmentItemDetails_-responsive__ZRuEF')
#         propertys_links.extend([crawler.domain + link['href'] for link in crawler.soup])

In [None]:
# propertys_links

['https://www.leopalace21.com/en/properties/common/hokkaido/asahikawa-shi-01204/happiness-m-18139',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/sun-inlet-v-34351',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/dear-court-2-37561',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/dear-court-2-37561',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/dear-court-30918',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/dear-court-30918',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/premier-2-31374',
 'https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/luna-selene-39391',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/eclair-19384',
 'https://www.leopalace21.com/en/properties/common/hokkaido/takikawa-shi-01225/amber-24256',
 'https://www.leopalace21.com/en/pro

In [None]:
# len(propertys_links)

29409