In [2]:
import os
import json
import asyncio
import nest_asyncio
import numpy as np
import pandas as pd
import time, random
import aiohttp
from tqdm import tqdm
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
# Request headers passed to every `instance.get(...)` call in this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "en-US,en;q=0.9",
}
# Shared by the scraping coroutines: at most 10 of them can be inside an
# `async with semaphore:` section at the same time.
semaphore = asyncio.Semaphore(10)  # Limit concurrent requests

In [4]:
# Both URL lists are JSON documents. JSON text is UTF-8, so the encoding is
# stated explicitly instead of relying on the platform default (which is not
# UTF-8 on every operating system).
with open('site_urls.json', 'r', encoding='utf-8') as f:
    site_urls = json.load(f)

with open('property_urls.json', 'r', encoding='utf-8') as f:
    property_urls = json.load(f)

In [5]:
# Inspect the URL list loaded from site_urls.json.
site_urls['urls']

['https://www.leopalace21.com',
 'https://www.leopalace21.com/en/search/chintai/area',
 'https://www.leopalace21.com/en/properties/chintai/area/PREFECTURE?page=']

In [6]:
# Preview the first five property URLs loaded from property_urls.json.
property_urls['urls'][:5]

['https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/bonita-dois-38530',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/shiawasenoie-17791',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/hakodate-22098',
 'https://www.leopalace21.com/en/properties/common/hokkaido/asahikawa-shi-01204/nishimachi-dori-22990',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hokuto-shi-01236/dolce-k-y-20774']

ASYNC FUNCTIONS

In [145]:
def query(tree):
    """Extract the details of one room listing from a parsed page.

    Parameters
    ----------
    tree
        Parsed document exposing an ``xpath(expression)`` method. The callers
        in this notebook pass the result of ``lxml.html.fromstring``.

    Returns
    -------
    dict
        Keys ``Label``, ``Name``, ``Unit``, ``Currency``, ``Cost``,
        ``Maintanence``, ``Map``, ``Facilities``, ``Room Details`` and
        ``Miscellaneous Costs``. ``Name``, ``Unit``, ``Currency``, ``Cost``,
        ``Maintanence`` and ``Map`` are ``None`` when the page does not
        provide them; the remaining values are lists of strings.

    Raises
    ------
    ValueError
        If one of the section headings ``Address``, ``Room Details`` or
        ``Miscellaneous costs`` is missing, i.e. the page does not have the
        layout this function slices on.
    """
    def non_blank(expression):
        # Text nodes matched by `expression`, minus whitespace-only ones.
        return [text for text in tree.xpath(expression) if text.strip() != '']

    # dict.fromkeys drops repeated labels while keeping first-seen order.
    label = list(dict.fromkeys(tree.xpath('//ul[@class="LabelList_label-list__JZVeD"]//text()')))

    heading = non_blank('//h1[@class="page_heading__9lqpn"]//text()')
    name, unit = (heading + [None, None])[:2]

    price = non_blank('//div[@class="Price_text-price__OJin9 Price_-normal__yaKpe Price_-responsive__iSQmE Price_-vertical__jGQ9Y"]//text()')
    # Three targets need three placeholders: padding with only two made an
    # empty price block fail to unpack.
    currency, cost, maintenance = (price + [None, None, None])[:3]

    misc = [
        text
        for text in tree.xpath('//h2[@class="Heading_text__ihjGf Heading_-h2__ZjScF Heading_-black__oGzwX"]/text() | //span[@class="TitleTextItem_text__4vy_f"]//text()')
        if text.strip() not in ('、', '-')
    ]
    # This heading is unwanted text; a page without it is still a valid
    # listing, so its absence must not abort the extraction.
    if 'Highly recommended points' in misc:
        misc.remove('Highly recommended points')
    misc = misc[:misc.index('Address')]  # keep only the items before 'Address'

    # The two headings below split the remaining items into three sections.
    room_at = misc.index('Room Details')
    costs_at = misc.index('Miscellaneous costs')
    facilities = misc[:room_at]
    room_details = misc[room_at + 1:costs_at]
    misc_costs = misc[costs_at + 1:]

    map_links = tree.xpath('//span[@class="Caption_caption__ex6T_"]//@href')

    details = {
        'Label' : label,
        'Name' : name,
        'Unit' : unit,
        'Currency' : currency,
        'Cost' : cost,
        'Maintanence' : maintenance,
        'Map' : map_links[0] if map_links else None,
        'Facilities' : facilities,
        'Room Details' : room_details,
        'Miscellaneous Costs' : misc_costs
    }
    return details

async def scrape_data(instance, url):
    """Fetch one room page and parse it with ``query``.

    Parameters
    ----------
    instance
        Session object whose ``get`` method performs the request (``main``
        passes an ``aiohttp.ClientSession``).
    url
        Link to the room page; it is appended to the first entry of
        ``site_urls['urls']`` to form the address that is requested.

    Returns
    -------
    list
        A one-element list holding the dict produced by ``query``, or an
        empty list when the request or the parsing failed. Failures are
        printed, not raised.
    """
    url = site_urls['urls'][0] + url
    print(f"Scraping details from {url}")
    details = []
    async with semaphore:
        # Random 1.2-3.3 s pause before each request (presumably to avoid
        # hammering the site).
        await asyncio.sleep(random.uniform(1.2, 3.3))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                # Treat 4xx/5xx answers as failures instead of parsing an
                # error page as if it were a listing.
                response.raise_for_status()
                content = await response.text()
            tree = html.fromstring(content)
            details.append(query(tree))

        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            # Stay a list on failure so the return type does not depend on
            # whether the scrape succeeded.
            details = []

    return details

async def fetch_property(instance, url):
    """Fetch a property page and scrape every room it links to.

    Parameters
    ----------
    instance
        Session object whose ``get`` method performs the request (``main``
        passes an ``aiohttp.ClientSession``).
    url
        Address of the property page.

    Returns
    -------
    list
        One dict (as built by ``query``) per room that was scraped
        successfully, in link order. Empty when the property page could not
        be fetched, lists no rooms, or every room scrape failed. Failures are
        printed, not raised.
    """
    async with semaphore:
        await asyncio.sleep(random.uniform(1.3, 2.7))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                # Treat 4xx/5xx answers as failures instead of parsing them.
                response.raise_for_status()
                content = await response.text()
            links = html.fromstring(content).xpath('//a[@class="RoomItem_link__RoXCn"]/@href')

        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            return []

    # The permit is released before the room pages are scraped: scrape_data
    # acquires the same semaphore, so holding a permit while waiting for it
    # would deadlock as soon as every permit is held by a property fetch.
    outcomes = await asyncio.gather(
        *(scrape_data(instance, link) for link in links),
        return_exceptions=True,
    )

    rooms = []
    for link, outcome in zip(links, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Error occurred while fetching {link}: {outcome}")
        elif isinstance(outcome, list):
            # scrape_data yields a list of detail dicts; anything else
            # (e.g. an empty failure marker) contributes nothing.
            rooms.extend(outcome)
    return rooms
    
async def main():
    """Scrape the first entry of ``property_urls['urls']`` and print each result.

    One ``aiohttp.ClientSession`` is opened for the whole run and handed to
    ``fetch_property``; whatever each call returns is printed as-is.
    """
    async with aiohttp.ClientSession() as instance:
        # Only the first property is processed (note the [:1] slice).
        pending = [
            fetch_property(instance, property_url)
            for property_url in property_urls['urls'][:1]
        ]
        results = await asyncio.gather(*pending)

        for entry in results:
            print(entry)
            

In [146]:
# Run the scraper from the notebook. nest_asyncio.apply() is called first so
# that asyncio.run() can be used although the notebook already has an event
# loop running.
nest_asyncio.apply()
asyncio.run(main())

Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/101
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/105
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/109
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/201
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/202
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/203
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/204
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/205
Scraping details from https://www.leopal

In [144]:
async with aiohttp.ClientSession() as instance:
    async with instance.get('https://www.leopalace21.com/en/properties/monthly/hokkaido/ebetsu-shi-01217/bonita-dois-38530/202', headers=HEADERS) as response:
        content = await response.text()
        tree = html.fromstring(content)
        
        # query
        label = tree.xpath('//ul[@class="LabelList_label-list__JZVeD"]//text()')
        label = list(dict.fromkeys(label))
        
        name, unit = [x for x in tree.xpath('//h1[@class="page_heading__9lqpn"]//text()') if x.strip() != '']
        data = [x for x in tree.xpath('//div[@class="Price_text-price__OJin9 Price_-normal__yaKpe Price_-responsive__iSQmE Price_-vertical__jGQ9Y"]//text()') if x.strip() != '']
        currency, cost, maintenance = (data + [None, None])[:3]
        misc = [x for x in tree.xpath('//h2[@class="Heading_text__ihjGf Heading_-h2__ZjScF Heading_-black__oGzwX"]/text() | //span[@class="TitleTextItem_text__4vy_f"]//text()') if x.strip() != '、' and x.strip() != '-']
        misc.remove('Highly recommended points') # remove unwanted text
        misc = misc[:misc.index('Address')] # slice list to only include facilities before 'Address'
        
        facilities = [x for x in misc[: misc.index('Room Details')]]
        room_details = [x for x in misc[misc.index('Room Details')+1 : misc.index('Miscellaneous costs')]]
        misc_costs = [x for x in misc[misc.index('Miscellaneous costs')+1 : ]]
        
        details = {
            'Label' : label,
            'Name' : name,
            'Unit' : unit,
            'Currency' : currency,
            'Cost' : cost,
            'Maintanence' : maintenance,
            'Map' : tree.xpath('//span[@class="Caption_caption__ex6T_"]//@href')[0],
            'Facilities' : facilities,
            'Room Details' : room_details,
            'Miscellaneous Costs' : misc_costs
        }

In [131]:
# Display the dict built by the scratch cell.
# NOTE(review): the output saved with this cell carries an older execution
# count than the cell that builds `details` and shows a different unit than
# the URL requested there - re-run to refresh it.
details

{'Label': ['Online Consultation・Contract', 'Double Discount'],
 'Name': 'leopalace Bonita Dois',
 'Unit': 'Unit 101',
 'Currency': '¥',
 'Cost': '37,000',
 'Maintanence': '（Maintenance Fee ¥4,000）',
 'Map': 'https://www.google.com/maps/search/?api=1&query=43.061775%2C141.48653',
 'Facilities': ['Facilities',
  'Separated bath and toilet',
  'Bathroom dryer',
  'Washlet with warm water',
  'Indoor washing machine place',
  'Smartlock',
  'Internet Compatible',
  'LEONET',
  'Hot-water supply',
  'Flooring ',
  'Bicycle parking area',
  'Separated bath and toilet',
  'Bathroom dryer',
  'Washlet with warm water',
  'Indoor washing machine place',
  'Smartlock',
  'Internet Compatible',
  'LEONET',
  'Hot-water supply',
  'Flooring ',
  'Bicycle parking area'],
 'Room Details': ['Bunkyodai 51-5,Ebetsu-shi,Hokkai-do',
  'Hakodate main line 「Shinrinkoenstation」walk 17 minutes\nHakodate main line 「Oasastation」walk 21 minutes',
  '1K/23.18㎡',
  'Available ',
  'Wooden',
  '1/2',
  '18',
  'No