In [243]:
import re
import json
import requests
import numpy as np
import pandas as pd
import time, random
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession, HTMLSession
from concurrent.futures import ThreadPoolExecutor, as_completed

In [244]:
class scrapper():
    def __init__(self, url):
        self.domain = 'https://www.leopalace21.com'
        self.url = url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Language": "en-US,en;q=0.9"
        }
        self.session = HTMLSession()
        self.html = self.session.get(self.url, headers=self.headers, timeout=10) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
    
    def change_url(self, new_url): # change url
        self.url = new_url
        self.html = self.session.get(self.url, headers=self.headers, timeout=10) # take html from the site
        self.soup = BeautifulSoup(self.html.content, 'html.parser') # parse html with BeautifulSoup
        
    def print_html(self): # print formatted html
        print(self.soup.prettify())
        
    def print_data(self): # print non html
        for x in self.soup:
            print(x)

In [245]:
def target_element(soup, tag, class_name): # first matching element
        if class_name is None:
            return soup.find(tag)
        return soup.find(tag, class_=class_name)
        
def target_element_all(soup, tag, class_name): # all matching elements
    if class_name is None:
        return soup.find_all(tag)
    return soup.find_all(tag, class_=class_name)

def target_element_css(soup, css_selector): # css selector
    return soup.select_one(css_selector)
    
def target_element_text(soup): # search by text
    return [data.get_text() for data in soup]

def retry_with_backoff(attempt): # retry after failed connection
    wait_time = min(60, (2 ** attempt) + random.uniform(0, 1))
    time.sleep(wait_time)

def fetch_property_link(base_url, page_num, domain, headers):
    url = f"{base_url}{page_num}"
    try:
        r = scrapper(url)
        soup = target_element_all(r.soup, 'a', 'ApartmentItemDetails_apartment-item-details__pn2sc ApartmentItemDetails_-responsive__ZRuEF')
        time.sleep(random.uniform(0.5, 2))  # sleep between 0.5 to 2 seconds
        return [domain + link['href'] for link in soup]
    except Exception as e:
        retry_with_backoff(3)
        return []

In [246]:
crawler = scrapper('https://www.leopalace21.com/en/properties/common/osaka/osaka-shi-minato-ku-27107/linrow-56334')

In [247]:
property_info_page = target_element(crawler.soup, 'a', 'RoomItem_link__RoXCn')['href']

print("Price:", property_info_page)

Price: /en/properties/chintai/osaka/osaka-shi-minato-ku-27107/linrow-56334/101


In [248]:
crawler.change_url(crawler.domain + property_info_page)

In [249]:
json_results = []
chunk = []

In [250]:
chunk.append({"property name": target_element(crawler.soup, 'h1', 'page_heading__9lqpn').get_text()})
chunk.append({"price": target_element(crawler.soup, 'span', 'Price_price__6qQfX').get_text().replace(',', '')})
chunk.append({"maintanence": re.sub(r"[^0-9]", "", target_element(crawler.soup, 'span', 'Price_expenses__2meeQ').get_text().replace(',', ''))})
chunk.append({"gmap": target_element(crawler.soup, 'a', 'page_map-link__v6STN')['href']})

In [251]:
brokerage_fee = target_element_all(crawler.soup, 'span', 'RequiredRentCost_text__QVPh6')
brokerage_fee = target_element_text(brokerage_fee)
# in order: brokerage fee, scecurity deposit/deposit, non-refundable restoration fee, key money

In [252]:
chunk.append({"brokerage fee":brokerage_fee[0]})
chunk.append({"scecurity deposit/deposit":brokerage_fee[1]})
chunk.append({"non-refundable restoration fee":brokerage_fee[2]})
chunk.append({"key money":brokerage_fee[3]})

In [253]:
match = re.search(r"/properties/chintai/([^/]+)/([^/]+)/", crawler.url)
if match:
    prefecture = match.group(1)
    city = match.group(2)
    
    # remove post code
    city = re.sub(r'-\d+$', '', city)
print(f"Prefecture: {prefecture}, City: {city}")

Prefecture: osaka, City: osaka-shi-minato-ku


In [255]:
details = target_element_all(crawler.soup, 'div', 'TitleTextItem_title-text-item__3dJO_')
details[0]


<div class="TitleTextItem_title-text-item__3dJO_ TitleTextItem_-vertical__rNWt_"><p class="TitleTextItem_title__kkVCx">Bathroom · Toilet</p><div class="TitleTextItem_pair__9wp5G"><span class="TitleTextItem_text__4vy_f">Separated bath and toilet<!-- -->、<!-- -->Bathroom dryer<!-- -->、<!-- -->Washlet with warm water<!-- -->、<!-- -->Independent washbasin<!-- -->、<!-- -->Indoor washing machine place</span></div></div>

In [256]:
for detail in details:
    
    name = target_element( detail,'p', 'TitleTextItem_title__kkVCx')
    value = target_element(detail, 'span', 'TitleTextItem_text__4vy_f')
    if value == None:
        continue
    chunk.append({name.get_text(): value.get_text()})
json_results.append(chunk)

In [257]:
json_results

[[{'property name': 'CLEINO Linrow Unit 101'},
  {'price': '57000'},
  {'maintanence': '5500'},
  {'gmap': 'https://www.google.com/maps/search/?api=1&query=34.659863%2C135.46265'},
  {'brokerage fee': 'Not required'},
  {'scecurity deposit/deposit': 'Not required'},
  {'non-refundable restoration fee': 'Not required'},
  {'key money': 'Not required'},
  {'Bathroom · Toilet': 'Separated bath and toilet、Bathroom dryer、Washlet with warm water、Independent washbasin、Indoor washing machine place'},
  {'Security': 'Security Camera、TV door phone、Smartlock'},
  {'Broadcasting・Communication': 'Internet Compatible、LEONET(Life Stick)'},
  {'Others': 'Walk-in closet、Hot-water supply、Air conditioner、Balcony、Bicycle parking area'},
  {'Bathroom · Toilet': 'Separated bath and toilet、Bathroom dryer、Washlet with warm water、Independent washbasin、Indoor washing machine place'},
  {'Security': 'Security Camera、TV door phone、Smartlock'},
  {'Broadcasting・Communication': 'Internet Compatible、LEONET(Life Stic