In [1]:
import re
import json
import requests
import numpy as np
import pandas as pd
import time, random
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession, HTMLSession
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
with open('urls.json', 'r') as f:
    urls = json.load(f)

In [3]:
urls['urls'][0]

'https://www.leopalace21.com/en/search/chintai/area'

In [4]:
session = HTMLSession()
class scrapper():
    def __init__(self, url):
        self.domain = 'https://www.leopalace21.com'
        self.url = url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Language": "en-US,en;q=0.9"
        }
        self.session = session
        self.html = self.session.get(self.url, headers=self.headers, timeout=10) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
    
    def change_url(self, new_url): # change url
        self.url = new_url
        self.html = self.session.get(self.url, headers=self.headers, timeout=10) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
        
    def print_html(self): # print formatted html
        print(self.soup.prettify())
        
    def print_data(self): # print non html
        for x in self.soup:
            print(x)

In [36]:
def target_element(soup, tag, class_name): # first matching element
        if class_name is None:
            return soup.find(tag)
        return soup.find(tag, class_=class_name)
        
def target_element_all(soup, tag, class_name): # all matching elements
    if class_name is None:
        return soup.find_all(tag)
    return soup.find_all(tag, class_=class_name)

def target_element_css(soup, css_selector): # css selector
    return soup.select_one(css_selector)
    
def target_element_text(soup): # search by text
    return [data.get_text() for data in soup]

def retry_with_backoff(attempt): # retry after failed connection
    wait_time = min(60, (2 ** attempt) + random.uniform(0, 1))
    time.sleep(wait_time)
    
def checkpoint_save(index, filename): # save data incrementally
    with open(filename, 'w') as f:
        json.dump({"index": index}, f)
        
def checkpoint_load(filename): # load data incrementally
    try:
        with open(filename, 'r') as f:
            return json.load(f).get("index", 0)
    except FileNotFoundError:
        return 0
    
def incremental_save(data, filename): # save data incrementally
    return 0

def fetch_property_link(base_url, page_num, domain): # scrape property links from listing page
    url = f"{base_url}{page_num}"
    try:
        r = scrapper(url)
        soup = target_element_all(r.soup, 'a', 'ApartmentItemDetails_apartment-item-details__pn2sc ApartmentItemDetails_-responsive__ZRuEF')
        time.sleep(random.uniform(0.5, 2))  # sleep between 0.5 to 2 seconds
        return [domain + link['href'] for link in soup]
    except Exception as e:
        retry_with_backoff(3)
        return []
    
def fetch_all_detail(url):
    print(f"Processing: {url}")
    r = scrapper(url)
    chunk = []
    try:
        chunk.append({"property name": target_element(r.soup, 'h1', 'page_heading__9lqpn').get_text()})
        chunk.append({"price": target_element(r.soup, 'span', 'Price_price__6qQfX').get_text().replace(',', '')})
        chunk.append({"maintanence": re.sub(r"[^0-9]", "", target_element(r.soup, 'span', 'Price_expenses__2meeQ').get_text().replace(',', ''))})
        chunk.append({"gmap": target_element(r.soup, 'a', 'page_map-link__v6STN')['href']})
        
        brokerage_fee = target_element_all(r.soup, 'span', 'RequiredRentCost_text__QVPh6')
        brokerage_fee = target_element_text(brokerage_fee)
        # in order: brokerage fee, scecurity deposit/deposit, non-refundable restoration fee, key money
        
        chunk.append({"brokerage fee":brokerage_fee[0]})
        chunk.append({"scecurity deposit/deposit":brokerage_fee[1]})
        chunk.append({"non-refundable restoration fee":brokerage_fee[2]})
        chunk.append({"key money":brokerage_fee[3]})
        
        match = re.search(r"/properties/chintai/([^/]+)/([^/]+)/", r.url)
        if match:
            prefecture = match.group(1)
            city = match.group(2)
            
            # remove post code
            city = re.sub(r'-\d+$', '', city)
        chunk.append({"prefecture": prefecture})
        chunk.append({"city": city})
        
        details = target_element_all(r.soup, 'div', 'TitleTextItem_title-text-item__3dJO_')
        for detail in details:
    
            name = target_element( detail,'p', 'TitleTextItem_title__kkVCx')
            value = target_element(detail, 'span', 'TitleTextItem_text__4vy_f')
            if value == None:
                continue
            chunk.append({name.get_text(): value.get_text()})
        
        time.sleep(random.uniform(0.5, 2))  # sleep between 0.5 to 2 seconds
        return chunk
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        retry_with_backoff(3)
        return []
    
def fetch_all_property_details_link(url): # scrape property details from property page
    r = scrapper(url)
    property_info_page = list(target_element_all(r.soup, 'a', 'RoomItem_link__RoXCn'))
    filtered_links = [item for item in property_info_page if not re.search(r"\bmonthly\b", item['href'], re.IGNORECASE)]
    
    if len(filtered_links) == 1:
        return fetch_all_detail(r.domain + filtered_links[0]['href'])
    else:
        chunk = []
        for link in filtered_links:
            chunk.extend(fetch_all_detail(r.domain + link['href']))
        return chunk

In [6]:
crawler = scrapper(urls['urls'][0])
# crawler.print_html()

Getting Prefecture Names

In [7]:
crawler.soup = target_element_css(crawler.soup, 'div.FooterMiscInternalLinkContainer_content__M1tIW div.FooterMiscInternalLinkTextLink_text-link-container__2u2zd')
# crawler.print_html()

In [8]:
crawler.soup = target_element_all(crawler.soup, 'a', 'TextLink_text-link__Z6GQ4 TextLink_-white__uwMVe')

In [9]:
prefectures = target_element_text(crawler.soup)
# prefectures[:len(prefectures)//2]

In [10]:
prefecture_url = []
for i, prefecture in enumerate(prefectures):
    prefecture_url.append(urls['urls'][1].replace("PREFECTURE", prefecture.lower())) # attaching prefecture to base url
    # print(prefecture_url[i])

Getting Pages

In [11]:
properties_links_pages = []

with tqdm(prefecture_url) as pbar:
    for prefecture in pbar:
        crawler.change_url(prefecture) # change url to url with prefecture
        page_number = target_element_all(crawler.soup, 'a', 'Pager_link__rnYFP')[:-1] # excluding the last element which is "next page"
        page_number =target_element_text(page_number)[len(page_number)-1:]
        
        pbar.set_description(f"Processing {prefecture} with {page_number} pages")
        
        result = {"link": prefecture, "page_number": page_number[0]}
        properties_links_pages.append(result)

Processing https://www.leopalace21.com/en/properties/chintai/area/okinawa?page= with ['12'] pages: 100%|██████████| 47/47 [00:52<00:00,  1.12s/it]   


In [12]:
properties_links_pages[:len(properties_links_pages)//5]

[{'link': 'https://www.leopalace21.com/en/properties/chintai/area/hokkaido?page=',
  'page_number': '128'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/aomori?page=',
  'page_number': '48'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/iwate?page=',
  'page_number': '37'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/miyagi?page=',
  'page_number': '124'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/akita?page=',
  'page_number': '21'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/yamagata?page=',
  'page_number': '53'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/fukushima?page=',
  'page_number': '113'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/tokyo?page=',
  'page_number': '153'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/kanagawa?page=',
  'page_number': '251'}]

Getting Property Links

In [13]:
property_links = set()

for link in tqdm(properties_links_pages, desc="Processing prefectures"):
    total_pages = int(link['page_number'])
    page_range = range(1, total_pages + 1)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_property_link, link['link'], page, crawler.domain) for page in page_range]
        for future in tqdm(as_completed(futures), total=total_pages, desc="Pages", leave=False):
            property_links.update(future.result())
    

Processing prefectures: 100%|██████████| 47/47 [20:09<00:00, 25.74s/it]


In [14]:
property_links

{'https://www.leopalace21.com/en/properties/common/osaka/yao-shi-27212/ran-20898',
 'https://www.leopalace21.com/en/properties/common/ibaraki/mito-shi-08201/sumire-29973',
 'https://www.leopalace21.com/en/properties/common/tokyo/adachi-ku-13121/phrase-2-38934',
 'https://www.leopalace21.com/en/properties/common/fukushima/koriyama-shi-07203/residence-sakura-56350',
 'https://www.leopalace21.com/en/properties/common/osaka/toyonaka-shi-27203/g-park-29022',
 'https://www.leopalace21.com/en/properties/common/fukuoka/fukutsu-shi-40224/suishokan-v-54886',
 'https://www.leopalace21.com/en/properties/common/shizuoka/iwata-shi-22211/tornado-toyooka-38007',
 'https://www.leopalace21.com/en/properties/common/tochigi/nasushiobara-shi-09213/bonheur-pastoral-36683',
 'https://www.leopalace21.com/en/properties/common/okayama/okayama-shi-kita-ku-33101/pal-kitagata-2-29798',
 'https://www.leopalace21.com/en/properties/common/yamagata/yamagata-shi-06201/ayumi-14975',
 'https://www.leopalace21.com/en/prop

In [15]:
len(property_links)

26006

hell

In [None]:
json_result = []
    
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(fetch_all_property_details_link, link): link for link in property_links}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Properties", leave=False):
        try:
            result = future.result()
            if result:  # Check if result is not empty
                json_result.extend(result)
        except Exception as e:
            print(f"Error processing {futures[future]}: {e}")

Properties:   0%|          | 0/5 [00:00<?, ?it/s]

Processing: https://www.leopalace21.com/en/properties/chintai/fukushima/koriyama-shi-07203/residence-sakura-56350/103
Processing: https://www.leopalace21.com/en/properties/chintai/osaka/toyonaka-shi-27203/g-park-29022/102
Processing: https://www.leopalace21.com/en/properties/chintai/ibaraki/mito-shi-08201/sumire-29973/101
Processing: https://www.leopalace21.com/en/properties/chintai/tokyo/adachi-ku-13121/phrase-2-38934/304
Processing: https://www.leopalace21.com/en/properties/chintai/osaka/yao-shi-27212/ran-20898/101


Properties:  80%|████████  | 4/5 [00:03<00:00,  1.43it/s]

Processing: https://www.leopalace21.com/en/properties/chintai/ibaraki/mito-shi-08201/sumire-29973/207


                                                         

In [38]:
json_result

[{'property name': 'leopalace G Park Unit 102'},
 {'price': '66000'},
 {'maintanence': '5000'},
 {'gmap': 'https://www.google.com/maps/search/?api=1&query=34.77185%2C135.49614'},
 {'brokerage fee': 'Not required'},
 {'scecurity deposit/deposit': 'Not required'},
 {'non-refundable restoration fee': 'Not required'},
 {'key money': 'Not required'},
 {'prefecture': 'osaka'},
 {'city': 'toyonaka-shi'},
 {'Bathroom · Toilet': 'Separated bath and toilet、Bathroom dryer、Washlet with warm water、Indoor washing machine place'},
 {'Security': 'Security Camera、TV door phone、Delivery box 、Smartlock'},
 {'Broadcasting・Communication': 'Internet Compatible、LEONET、CATV'},
 {'Others': 'Hot-water supply、Air conditioner、Loft'},
 {'Bathroom · Toilet': 'Separated bath and toilet、Bathroom dryer、Washlet with warm water、Indoor washing machine place'},
 {'Security': 'Security Camera、TV door phone、Delivery box 、Smartlock'},
 {'Broadcasting・Communication': 'Internet Compatible、LEONET、CATV'},
 {'Others': 'Hot-water 