In [53]:
import re
import os
import json
import requests
import numpy as np
import pandas as pd
import time, random
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession, HTMLSession
from concurrent.futures import ThreadPoolExecutor, as_completed

In [54]:
# Read the URL configuration; later cells index into urls['urls'].
with open('site_urls.json', 'r') as url_config:
    urls = json.load(url_config)

In [55]:
# Show the first configured URL; it is the page the `crawler` below is created from.
urls['urls'][0]

'https://www.leopalace21.com/en/search/chintai/area'

In [56]:
# One shared HTMLSession; every `scrapper` instance stores and reuses this object.
session = HTMLSession()


In [57]:
# Checkpoint file holding the index written by fetch_all_property_details_link.
property_detail_checkpoint = "checkpoint_property_details_link.json"

# This value used to be assigned to `properties_links_pages_filename` and was
# overwritten by the very next assignment, so it could never be used.
# It is kept under its own name instead of being silently discarded.
properties_links_pages_checkpoint = "checkpoint_properties_links_pages.json"

# Cache file for the per-prefecture listing URL / page-count records.
properties_links_pages_filename = "properties_links_pages.json"

In [58]:
class scrapper():
    """Fetch one page through the shared session and keep it as parsed HTML.

    Attributes:
        domain: Site root that callers prepend to relative hrefs.
        url: URL of the page currently loaded.
        headers: Request headers sent with every fetch.
        session: The notebook-level ``session`` object used for requests.
        html: Response returned by the latest ``session.get`` call.
        soup: ``BeautifulSoup`` tree built from ``html.content``.
    """

    def __init__(self, url):
        """Store the request settings and load ``url`` immediately."""
        self.domain = 'https://www.leopalace21.com'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Language": "en-US,en;q=0.9"
        }
        self.session = session
        # The initial load is the same fetch-and-parse step as a later URL change.
        self.change_url(url)

    def change_url(self, new_url):
        """Point the instance at ``new_url`` and refresh ``html`` and ``soup``."""
        self.url = new_url
        response = self.session.get(self.url, headers=self.headers, timeout=10)
        self.html = response
        self.soup = BeautifulSoup(response.content, 'html.parser')

    def print_html(self):
        """Print the parsed document in indented form."""
        pretty = self.soup.prettify()
        print(pretty)

    def print_data(self):
        """Print each item obtained by iterating over ``soup``, one per line."""
        for node in self.soup:
            print(node)

In [None]:
def target_element(soup, tag, class_name): # first matching element
    """Return ``soup.find`` for ``tag``, filtered by ``class_name`` unless it is None."""
    # Only pass the class filter when a class was actually requested.
    class_filter = {} if class_name is None else {"class_": class_name}
    return soup.find(tag, **class_filter)
        
def target_element_all(soup, tag, class_name): # all matching elements
    """Return ``soup.find_all`` for ``tag``, filtered by ``class_name`` unless it is None."""
    # Only pass the class filter when a class was actually requested.
    class_filter = {} if class_name is None else {"class_": class_name}
    return soup.find_all(tag, **class_filter)

def target_element_css(soup, css_selector): # css selector
    """Return the result of ``soup.select_one`` for ``css_selector``."""
    selected = soup.select_one(css_selector)
    return selected
    
def target_element_text(soup): # text of every element
    """Return a list with ``get_text()`` of each item yielded by iterating ``soup``."""
    texts = []
    for element in soup:
        texts.append(element.get_text())
    return texts

def retry_with_backoff(attempt): # pause after a failed connection
    """Sleep for ``2 ** attempt`` seconds plus up to one second of jitter, capped at 60.

    This function only waits; it does not repeat any request itself.
    """
    backoff = 2 ** attempt
    jitter = random.uniform(0, 1)
    delay = min(60, backoff + jitter)
    time.sleep(delay)
    
def save_json(urls, filename):
    """Write ``urls`` to ``filename`` as JSON under the top-level key ``'urls'``."""
    payload = {'urls': urls}
    with open(filename, 'w') as out_file:
        json.dump(payload, out_file, indent=4)
        
def load_json(filename):
    """Load the list stored under the ``"urls"`` key of a JSON file.

    Args:
        filename: Path of a JSON file whose top level is an object.

    Returns:
        The value stored under ``"urls"``, or an empty list when the key is
        missing. The previous default of ``0`` made callers that iterate
        over the result fail with ``TypeError``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(filename, "r") as f:
        return json.load(f).get("urls", [])
    
def checkpoint_save(index, filename): # persist the progress index
    """Overwrite ``filename`` with a JSON object ``{"index": index}``."""
    state = {"index": index}
    with open(filename, 'w') as checkpoint_file:
        json.dump(state, checkpoint_file)
        
def checkpoint_load(filename): # read the progress index
    """Return the ``"index"`` stored in ``filename``.

    Returns 0 when the file does not exist or has no ``"index"`` key.
    """
    try:
        checkpoint_file = open(filename, 'r')
    except FileNotFoundError:
        # No checkpoint yet: start from the beginning.
        return 0
    with checkpoint_file:
        state = json.load(checkpoint_file)
    return state.get("index", 0)
    
def save_to_excel(data):
    """Placeholder: ignores ``data`` and always returns 0.

    Nothing is written to disk by this function.
    """
    return 0

In [60]:
def fetch_all_property_details_link(url, index, total_pages): # scrape every room listed on a property page
    """Collect the details of every non-monthly room linked from a property page.

    Args:
        url: URL of the property page.
        index: Position of this property in the list being processed; used
            only to decide when to write a progress checkpoint.
        total_pages: Number of properties in the whole run.

    Returns:
        A flat list made of the lists returned by ``fetch_all_detail`` for
        each room link, in page order. Empty when no room link qualifies.

    Raises:
        Whatever ``scrapper`` raises while fetching ``url``; callers are
        expected to handle it.
    """
    # Checkpoint roughly every 20% of the run. An integer step of at least 1
    # avoids float modulo and the ZeroDivisionError that `index % (0 / 5)`
    # raised when total_pages was 0.
    step = max(1, total_pages // 5)
    if index != 0 and index % step == 0:
        checkpoint_save(index, property_detail_checkpoint)

    r = scrapper(url)
    room_anchors = target_element_all(r.soup, 'a', 'RoomItem_link__RoXCn')

    # Anchors without an href are skipped instead of raising KeyError.
    hrefs = [anchor.get('href') for anchor in room_anchors]
    room_hrefs = [
        href for href in hrefs
        if href and not re.search(r"\bmonthly\b", href, re.IGNORECASE)
    ]

    # One loop covers the single-link case as well as the multi-link case.
    chunk = []
    for href in room_hrefs:
        chunk.extend(fetch_all_detail(r.domain + href))
    return chunk
    
def fetch_property_link(base_url, page_num, domain, max_attempts=3): # scrape property links from listing page
    """Return the absolute property URLs found on one listing page.

    Previously a failure slept once and returned ``[]`` without retrying or
    logging, so the page's links were lost silently. The request is now
    retried with increasing back-off and every failure is reported.

    Args:
        base_url: Listing URL ending in the page query prefix; ``page_num``
            is appended to it.
        page_num: Page number to fetch.
        domain: Prefix added to each relative href.
        max_attempts: How many times to try the page before giving up.

    Returns:
        List of ``domain + href`` strings, or ``[]`` when every attempt failed.
    """
    url = f"{base_url}{page_num}"
    for attempt in range(1, max_attempts + 1):
        try:
            r = scrapper(url)
            anchors = target_element_all(r.soup, 'a', 'ApartmentItemDetails_apartment-item-details__pn2sc ApartmentItemDetails_-responsive__ZRuEF')
            time.sleep(random.uniform(0.5, 2))  # sleep between 0.5 to 2 seconds
            # Anchors without an href are skipped rather than failing the page.
            return [domain + anchor.get('href') for anchor in anchors if anchor.get('href')]
        except Exception as e:
            print(f"Error fetching {url} (attempt {attempt}/{max_attempts}): {e}")
            if attempt < max_attempts:
                retry_with_backoff(attempt)
    return []
    
def fetch_all_detail(url):
    """Scrape one room page into a list of single-key dicts.

    The list starts with ``{"property name": ...}`` followed by price,
    maintenance, map link, the four required-cost texts, prefecture, city and
    then one dict per title/text detail pair found on the page.

    Args:
        url: URL of the room detail page.

    Returns:
        The list described above, or ``[]`` when fetching or parsing fails
        (the error is printed and the function waits before returning).
    """
    chunk = []
    try:
        # Fetch inside the try so a network error is handled like a parse error.
        r = scrapper(url)

        chunk.append({"property name": target_element(r.soup, 'h1', 'page_heading__9lqpn').get_text()})
        chunk.append({"price": target_element(r.soup, 'span', 'Price_price__6qQfX').get_text().replace(',', '')})
        chunk.append({"maintanence": re.sub(r"[^0-9]", "", target_element(r.soup, 'span', 'Price_expenses__2meeQ').get_text().replace(',', ''))})
        chunk.append({"gmap": target_element(r.soup, 'a', 'page_map-link__v6STN')['href']})

        fee_elements = target_element_all(r.soup, 'span', 'RequiredRentCost_text__QVPh6')
        fee_texts = target_element_text(fee_elements)
        # in order: brokerage fee, security deposit/deposit, non-refundable restoration fee, key money
        chunk.append({"brokerage fee": fee_texts[0]})
        chunk.append({"scecurity deposit/deposit": fee_texts[1]})
        chunk.append({"non-refundable restoration fee": fee_texts[2]})
        chunk.append({"key money": fee_texts[3]})

        # Default to None so a URL that does not match the expected path no
        # longer raises UnboundLocalError and discards the whole record.
        prefecture = None
        city = None
        match = re.search(r"/properties/chintai/([^/]+)/([^/]+)/", r.url)
        if match:
            prefecture = match.group(1)
            # remove post code
            city = re.sub(r'-\d+$', '', match.group(2))
        chunk.append({"prefecture": prefecture})
        chunk.append({"city": city})

        details = target_element_all(r.soup, 'div', 'TitleTextItem_title-text-item__3dJO_')
        for detail in details:
            name = target_element(detail, 'p', 'TitleTextItem_title__kkVCx')
            value = target_element(detail, 'span', 'TitleTextItem_text__4vy_f')
            # Skip incomplete pairs; a missing title used to raise
            # AttributeError and throw away the entire record.
            if name is None or value is None:
                continue
            chunk.append({name.get_text(): value.get_text()})

        time.sleep(random.uniform(0.5, 2))  # sleep between 0.5 to 2 seconds
        return chunk
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        retry_with_backoff(3)
        return []

In [61]:
# Load the first configured URL; constructing `scrapper` fetches and parses it immediately.
crawler = scrapper(urls['urls'][0])

Getting Prefecture Names

In [62]:
# Narrow the page to the footer link container, then to its anchors.
# NOTE: crawler.soup ends up holding the list of anchors rather than the page,
# so re-running this cell without reloading the page will not work.
footer_container = target_element_css(crawler.soup, 'div.FooterMiscInternalLinkContainer_content__M1tIW div.FooterMiscInternalLinkTextLink_text-link-container__2u2zd')
crawler.soup = target_element_all(footer_container, 'a', 'TextLink_text-link__Z6GQ4 TextLink_-white__uwMVe')

In [63]:
# Text of each footer anchor collected in the previous cell; used below as the prefecture names.
prefectures = target_element_text(crawler.soup)

In [64]:
# One listing URL per prefecture: the lower-cased name replaces the
# PREFECTURE placeholder in the second configured URL.
prefecture_url = [
    urls['urls'][1].replace("PREFECTURE", name.lower())
    for name in prefectures
]

Getting Pages

In [65]:
properties_links_pages = []

# Offer the cached file only when it exists. Any answer other than ""/"y"
# now falls through to a fresh scrape; before, answering "n" left the list empty.
use_cached_pages = False
if os.path.exists(properties_links_pages_filename):
    choice = input(f"{properties_links_pages_filename} File Detected, Use the File? (Y/n): ").strip().lower()
    use_cached_pages = choice == "" or choice == "y"

if use_cached_pages:
    # The cache holds the page-count records, so it belongs in
    # properties_links_pages (it used to be loaded into property_links,
    # which the next section resets anyway).
    properties_links_pages = load_json(properties_links_pages_filename)
else:
    with tqdm(prefecture_url) as pbar:
        for prefecture in pbar:
            crawler.change_url(prefecture) # change url to url with prefecture
            pager_links = target_element_all(crawler.soup, 'a', 'Pager_link__rnYFP')[:-1] # excluding the last element which is "next page"
            pager_labels = target_element_text(pager_links)
            # The last remaining pager label is the highest page number.
            # With no pager links, assume a single page instead of raising
            # IndexError (assumption: such listings fit on one page - confirm).
            page_number = pager_labels[-1] if pager_labels else '1'

            pbar.set_description(f"Processing {prefecture} with {page_number} pages")

            properties_links_pages.append({"link": prefecture, "page_number": page_number})

    save_json(properties_links_pages, properties_links_pages_filename)

Processing https://www.leopalace21.com/en/properties/chintai/area/okinawa?page= with ['12'] pages: 100%|██████████| 47/47 [00:48<00:00,  1.03s/it]   


In [66]:
# Preview the first fifth of the collected link / page-count records.
properties_links_pages[:len(properties_links_pages)//5]

[{'link': 'https://www.leopalace21.com/en/properties/chintai/area/hokkaido?page=',
  'page_number': '129'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/aomori?page=',
  'page_number': '48'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/iwate?page=',
  'page_number': '38'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/miyagi?page=',
  'page_number': '126'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/akita?page=',
  'page_number': '21'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/yamagata?page=',
  'page_number': '53'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/fukushima?page=',
  'page_number': '113'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/tokyo?page=',
  'page_number': '154'},
 {'link': 'https://www.leopalace21.com/en/properties/chintai/area/kanagawa?page=',
  'page_number': '250'}]

Getting Property Links

In [67]:
property_links = []

# For each prefecture, fetch all of its listing pages concurrently and
# gather the property URLs as the requests complete.
for page_info in tqdm(properties_links_pages, desc="Processing prefectures"):
    total_pages = int(page_info['page_number'])

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(fetch_property_link, page_info['link'], page, crawler.domain)
            for page in range(1, total_pages + 1)
        ]
        for finished in tqdm(as_completed(futures), total=total_pages, desc="Pages", leave=False):
            property_links.extend(finished.result())

# Drop duplicate URLs while keeping first-seen order.
property_links = list(dict.fromkeys(property_links))

save_json(property_links, "property_urls.json")

Processing prefectures: 100%|██████████| 47/47 [18:13<00:00, 23.27s/it]


Property Details Hell

In [68]:
# Number of unique property URLs collected above.
len(property_links)

25966

In [69]:
json_result = []
property_links = property_links[:1000]  # limit this run to the first 1000 properties

# Resume position within property_links. The list itself is no longer
# re-sliced, so re-running the cell does not shrink it further and the
# indices passed to the workers stay absolute across a resume.
start_index = 0
if os.path.exists(property_detail_checkpoint):
    choice = input("Checkpoint File Detected, Use the File? (Y/n): ").strip().lower()
    if choice == "" or choice == "y":
        start_index = checkpoint_load(property_detail_checkpoint)

# NOTE(review): json_result always starts empty, so results scraped before the
# checkpoint are not restored on resume. Checkpoints are also written from
# worker threads, so a saved index does not prove that all earlier items finished.
total_links = len(property_links)

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(fetch_all_property_details_link, link, idx, total_links): (idx, link)
        for idx, link in enumerate(property_links[start_index:], start=start_index)
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Properties", leave=False):
        try:
            result = future.result()
            if result:  # Check if result is not empty
                json_result.extend(result)
        except Exception as e:
            print(f"Error processing {futures[future]}: {e}")

Properties:   0%|          | 0/1000 [00:00<?, ?it/s]

                                                               

In [75]:
# Number of single-key dicts collected; several of them together make up one property row.
len(json_result)

218146

Outputting as DataFrame

In [90]:
# Regroup the flat stream of single-key dicts into one dict per property.
# A 'property name' key arriving while the current row already has one marks
# the start of the next property.
rows = []
current_row = {}
flattened_pairs = (pair for item in json_result for pair in item.items())
for key, value in flattened_pairs:
    starts_new_row = key == 'property name' and 'property name' in current_row
    if starts_new_row:
        rows.append(current_row)
        current_row = {}
    current_row[key] = value

# Flush the last property, which has no following 'property name' to close it.
if current_row:
    rows.append(current_row)

df = pd.DataFrame(rows)

In [91]:
# Summarise columns, non-null counts and dtypes of the assembled table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4042 entries, 0 to 4041
Data columns (total 42 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   property name                                       4042 non-null   object
 1   price                                               4042 non-null   object
 2   maintanence                                         4042 non-null   object
 3   gmap                                                4042 non-null   object
 4   brokerage fee                                       4042 non-null   object
 5   scecurity deposit/deposit                           4042 non-null   object
 6   non-refundable restoration fee                      4042 non-null   object
 7   key money                                           4042 non-null   object
 8   prefecture                                          4042 non-null   object
 9   city    

In [94]:
# Write the table to df.xlsx without the index column.
# (The save_to_excel helper defined earlier is a stub and is not used here.)
df.to_excel("df.xlsx", index=False)