In [107]:
import os
import re
import json
import asyncio
import nest_asyncio
import numpy as np
import pandas as pd
import time, random
import aiohttp
from tqdm import tqdm
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed

In [108]:
# Request headers passed to every `instance.get(...)` call below.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "en-US,en;q=0.9",
}
# One semaphore acquired by both fetch_property and scrape_data, so the limit
# of 10 applies to property pages and room pages combined.
semaphore = asyncio.Semaphore(10)  # Limit concurrent requests
# Path that csv_export reads from and writes to.
file = 'result.csv' # csv filename

In [109]:
# Site-level URLs; scrape_data prefixes room links with the first entry of
# site_urls['urls'].
with open('site_urls.json', 'r') as f:
    site_urls = json.load(f)

# Property page URLs; main() iterates over property_urls['urls'].
with open('property_urls.json', 'r') as f:
    property_urls = json.load(f)

In [110]:
site_urls['urls']

['https://www.leopalace21.com',
 'https://www.leopalace21.com/en/search/chintai/area',
 'https://www.leopalace21.com/en/properties/chintai/area/PREFECTURE?page=']

In [111]:
property_urls['urls'][:5]

['https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/bonita-dois-38530',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/shiawasenoie-17791',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/hakodate-22098',
 'https://www.leopalace21.com/en/properties/common/hokkaido/asahikawa-shi-01204/nishimachi-dori-22990',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hokuto-shi-01236/dolce-k-y-20774']

In [None]:
def csv_export(result, path=None):
    """Merge freshly scraped records into the results CSV.

    Parameters
    ----------
    result : list of dict
        Records to store. Empty dicts (what ``scrape_data`` returns when a
        page fails) are skipped so they do not end up as blank rows.
    path : str, optional
        Destination CSV. Defaults to the module-level ``file`` setting.

    Notes
    -----
    The CSV is rewritten in full, not appended to: rows already on disk are
    read back, combined with the new ones and de-duplicated on ``Link``.
    New rows come first in the concat, so a re-scraped listing replaces its
    older row. Nothing is written when there are no usable records, which
    avoids leaving an empty file that ``pd.read_csv`` cannot parse later.
    """
    target = file if path is None else path

    records = [row for row in result if row]
    if not records:
        return

    df = pd.DataFrame(records)

    if os.path.exists(target):
        try:
            previous = pd.read_csv(target)
        except pd.errors.EmptyDataError:
            # A zero-byte or blank file left by an earlier run: treat as no data.
            previous = None
        if previous is not None:
            df = pd.concat([df, previous], ignore_index=True)

    # De-duplicate on every run so the output does not depend on whether the
    # file already existed.
    if 'Link' in df.columns:
        df = df.drop_duplicates(subset=['Link'])

    df.to_csv(target, mode='w', index=False)

def misc_cost_cleanup(x):
    """Pull a single integer amount out of a cost string.

    If the text contains a ``~`` that is followed by a number (optionally
    after whitespace and non-digit characters such as a currency sign), that
    number is used. Otherwise the last number anywhere in the text is used.
    Commas inside a number are treated as thousands separators.

    Parameters
    ----------
    x : str
        Raw cost text.

    Returns
    -------
    int or None
        The parsed amount, or ``None`` when the text contains no digits.
    """
    # A number has to start with a digit. Allowing a bare "," to count as a
    # number would make int('') blow up on ordinary punctuation.
    after_tilde = re.search(r'~\s*[^0-9]*([0-9][0-9,]*)', x)
    if after_tilde:
        raw = after_tilde.group(1)
    else:
        numbers = re.findall(r'[0-9][0-9,]*', x)
        if not numbers:
            return None
        raw = numbers[-1]

    return int(raw.replace(',', ''))
        
def query(tree, url):
    """Extract one listing's details from its parsed detail page.

    Parameters
    ----------
    tree : object with an ``xpath(expr)`` method
        The parsed page (``scrape_data`` passes the result of
        ``html.fromstring``).
    url : str
        Address of the page; stored under ``'Link'``.

    Returns
    -------
    dict
        Keys: Label, Name, Unit, Currency, Cost, Maintanence, Facilities,
        Room Details, Miscellaneous Costs, Link, Map.

    Raises
    ------
    ValueError
        If the page heading does not yield exactly two non-blank strings, or
        if any of the 'Address', 'Room Details' or 'Miscellaneous costs'
        headings is missing.
    """
    # query + data cleaning
    # dict.fromkeys drops repeated labels while keeping first-seen order.
    label = list(dict.fromkeys(
        tree.xpath('//ul[@class="LabelList_label-list__JZVeD"]//text()')
    ))

    name, unit = [x for x in tree.xpath('//h1[@class="page_heading__9lqpn"]//text()') if x.strip() != '']

    data = [x for x in tree.xpath('//div[@class="Price_text-price__OJin9 Price_-normal__yaKpe Price_-responsive__iSQmE Price_-vertical__jGQ9Y"]//text()') if x.strip() != '']
    # Pad with three Nones so the unpack works even when no price text is found.
    currency, cost, maintenance = (data + [None, None, None])[:3]

    misc = [x for x in tree.xpath('//h2[@class="Heading_text__ihjGf Heading_-h2__ZjScF Heading_-black__oGzwX"]/text() | //span[@class="TitleTextItem_text__4vy_f"]//text()') if x.strip() != '、' and x.strip() != '-']
    # Remove the unwanted heading when present; its absence is not an error.
    if 'Highly recommended points' in misc:
        misc.remove('Highly recommended points')
    misc = misc[:misc.index('Address')]  # keep only what precedes 'Address'

    room_start = misc.index('Room Details')
    costs_start = misc.index('Miscellaneous costs')
    facilities = misc[:room_start]
    room_details = misc[room_start + 1:costs_start]
    misc_costs_tmp = misc[costs_start + 1:]

    # Only the first half of the cost entries is parsed.
    # NOTE(review): presumably the second half holds non-amount text for the
    # same items; confirm against the page markup.
    parsed_costs = [misc_cost_cleanup(x) for x in misc_costs_tmp[:len(misc_costs_tmp) // 2]]
    # Entries without any number come back as None and are left out of the total.
    misc_total = sum(amount for amount in parsed_costs if amount is not None)

    map_links = tree.xpath('//span[@class="Caption_caption__ex6T_"]//@href')

    details = {
        'Label' : label,
        'Name' : name,
        'Unit' : unit,
        'Currency' : currency,
        'Cost' : cost,
        # Key spelling kept as-is: it becomes a column name in the CSV.
        'Maintanence' : maintenance,
        'Facilities' : facilities,
        'Room Details' : room_details,
        'Miscellaneous Costs' : misc_total,
        'Link' : url,
        'Map' : map_links[0] if map_links else None
    }
    return details

async def scrape_data(instance, url):
    """Download one room's detail page and return its parsed record.

    Parameters
    ----------
    instance : aiohttp.ClientSession
        Session used for the request (``main`` creates it).
    url : str
        Room link as found on the property page; it is prefixed with
        ``site_urls['urls'][0]`` to form the address that is fetched.

    Returns
    -------
    dict
        The record built by ``query``, or ``{}`` if the request or the
        parsing failed (the error is printed, not raised).
    """
    url = site_urls['urls'][0] + url
    print(f"Scraping details from {url}")

    async with semaphore:
        # Random pause before each request to space the fetches out.
        await asyncio.sleep(random.uniform(1.2, 3.3))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                # Report 4xx/5xx as such instead of trying to parse an error
                # page and failing later with an unrelated-looking message.
                response.raise_for_status()
                content = await response.text()

            return query(html.fromstring(content), url)

        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            return {}

async def fetch_property(instance, url):
    """Fetch a property page and scrape every room linked from it.

    Parameters
    ----------
    instance : aiohttp.ClientSession
        Session used for the requests.
    url : str
        Address of the property page.

    Returns
    -------
    list of dict
        One entry per room link, as returned by ``scrape_data``. An empty
        list if the property page itself could not be fetched or parsed
        (the error is printed, not raised).
    """
    async with semaphore:
        # Random pause before each request to space the fetches out.
        await asyncio.sleep(random.uniform(1.3, 2.7))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                response.raise_for_status()
                content = await response.text()

            links = html.fromstring(content).xpath('//a[@class="RoomItem_link__RoXCn"]/@href')

        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            # Always hand back a list so callers can flatten the results.
            return []

    # The permit is released before the room pages are awaited: scrape_data
    # takes the same semaphore, so holding a permit here while waiting on it
    # would stall everything once enough property pages are in flight.
    return await asyncio.gather(*(scrape_data(instance, link) for link in links))
    
async def main(limit=2):
    """Scrape the configured property pages and store the results as CSV.

    Parameters
    ----------
    limit : int or None, optional
        How many entries of ``property_urls['urls']`` to process, counted
        from the start. Defaults to 2; pass ``None`` to process all of them.

    Returns
    -------
    None
        Records are handed to ``csv_export``; nothing is returned.
    """
    urls = property_urls['urls']
    if limit is not None:
        urls = urls[:limit]

    async with aiohttp.ClientSession() as instance:
        results = await asyncio.gather(*(fetch_property(instance, page) for page in urls))

    # Flatten the per-property lists. Skip properties that produced nothing
    # and the empty dicts that stand for rooms whose scrape failed.
    records = [
        item
        for sublist in results if sublist
        for item in sublist if item
    ]

    if records:
        csv_export(records)
    else:
        print("No records scraped; nothing written.")

In [113]:
# Must run before the asyncio.run(...) cell below.
# NOTE(review): presumably needed because the notebook kernel already has an
# event loop running, which plain asyncio.run() refuses to nest inside.
nest_asyncio.apply()

In [144]:
# Run the scraper. Records go to the CSV via csv_export; the lines printed
# below are progress/error messages from the fetch functions.
asyncio.run(main())

Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/101
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/105
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/109
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/201
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/202
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/203
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/204
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/205
Scraping details from https://www.leopal