In [65]:
!playwright install chromium

In [66]:
import asyncio
import json
import os
import time, random
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import aiohttp
import nest_asyncio
import numpy as np
import pandas as pd
from lxml import html
from tqdm import tqdm

In [67]:
# Request headers sent with every page fetch: a desktop-browser User-Agent
# and an English Accept-Language preference.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "en-US,en;q=0.9",
}

# Upper bound on the number of coroutines allowed inside the semaphore at once.
MAX_CONCURRENT_REQUESTS = 10

# Shared by scrape_data and fetch_property to throttle their requests.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

In [68]:
# Load the two URL lists produced earlier. Each file is a JSON object whose
# 'urls' key holds a list of strings (see how they are indexed below).
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('site_urls.json', 'r', encoding='utf-8') as f:
    site_urls = json.load(f)

with open('property_urls.json', 'r', encoding='utf-8') as f:
    property_urls = json.load(f)

In [69]:
# Site-level URLs: the site root, the area-search page, and a listing template
# containing a PREFECTURE placeholder and ending in '?page=' (presumably the
# page number is appended by the caller -- confirm where it is used).
site_urls['urls']

['https://www.leopalace21.com',
 'https://www.leopalace21.com/en/search/chintai/area',
 'https://www.leopalace21.com/en/properties/chintai/area/PREFECTURE?page=']

In [70]:
# Peek at the first five property URLs rather than dumping the whole list.
property_urls['urls'][:5]

['https://www.leopalace21.com/en/properties/common/hokkaido/ebetsu-shi-01217/bonita-dois-38530',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/shiawasenoie-17791',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hakodate-shi-01202/hakodate-22098',
 'https://www.leopalace21.com/en/properties/common/hokkaido/asahikawa-shi-01204/nishimachi-dori-22990',
 'https://www.leopalace21.com/en/properties/common/hokkaido/hokuto-shi-01236/dolce-k-y-20774']

ASYNC FUNCTIONS

In [93]:
async def scrape_data(instance, url):
    """Fetch one room page and return the text nodes of its header bar.

    Args:
        instance: Open ``aiohttp.ClientSession`` used to issue the request.
        url: Room link as found on a property page. It is resolved against
            the site root (``site_urls['urls'][0]``), so site-relative and
            absolute hrefs are both handled.

    Returns:
        list: Text nodes found inside the page's head-bar ``div``. An empty
        list is returned when the request fails, the server answers with an
        HTTP error status, or the page cannot be parsed; the error is
        printed rather than raised.
    """
    # urljoin instead of plain concatenation: an absolute href or one without
    # a leading slash would otherwise produce a malformed URL.
    url = urljoin(site_urls['urls'][0], url)
    print(f"Scraping details from {url}")
    async with semaphore:
        # Randomised pause so requests are not fired in lock-step.
        await asyncio.sleep(random.uniform(1.2, 3.3))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                # Treat 4xx/5xx answers as failures instead of silently
                # parsing an error page.
                response.raise_for_status()
                content = await response.text()

            # The body is fully read, so the connection is released before
            # the (synchronous) parsing starts.
            tree = html.fromstring(content)
            details = tree.xpath('//section[@class="Section_section-block__mDQ87"]/div[@class="page_head-bar__LmpS8"]//text()')

        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            details = []

        return details

async def fetch_property(instance, url):
    """Fetch a property page and scrape every room linked from it.

    Args:
        instance: Open ``aiohttp.ClientSession`` used to issue the requests.
        url: Absolute URL of the property page.

    Returns:
        list: One entry per room link found on the page, each being the list
        returned by ``scrape_data`` for that room. An empty list is returned
        when the property page cannot be fetched or parsed; the error is
        printed rather than raised.
    """
    # The semaphore is held only for this page's own request. The room
    # scrapes acquire the same semaphore themselves, so awaiting them while
    # still holding a permit would deadlock as soon as every permit is owned
    # by a property page waiting on its children.
    async with semaphore:
        # Randomised pause so requests are not fired in lock-step.
        await asyncio.sleep(random.uniform(1.3, 2.7))

        try:
            async with instance.get(url, headers=HEADERS, timeout=20) as response:
                # Treat 4xx/5xx answers as failures instead of silently
                # parsing an error page.
                response.raise_for_status()
                content = await response.text()

            tree = html.fromstring(content)
            room_links = tree.xpath('//a[@class="RoomItem_link__RoXCn"]/@href')

        except Exception as e:
            # `url` is still the property URL here: the room links use their
            # own name, so the message reports the page that actually failed.
            print(f"Error occurred while fetching {url}: {e}")
            return []

    # scrape_data handles its own errors and returns [] on failure, so the
    # gather needs no extra exception handling.
    tasks = [scrape_data(instance, room_link) for room_link in room_links]
    return await asyncio.gather(*tasks)
    
async def main():
    """Scrape the first property URL and print what was collected.

    Opens a single ``aiohttp.ClientSession``, runs ``fetch_property`` for the
    first entry of ``property_urls['urls']`` and prints one line per
    property: the list of per-room results returned for it. Nothing is
    returned.
    """
    async with aiohttp.ClientSession() as instance:
        # Only the first property is processed; widen the slice to scrape more.
        property_jobs = [
            fetch_property(instance, property_url)
            for property_url in property_urls['urls'][:1]
        ]

        # Each gathered item is the (nested) list of room results for one
        # property page.
        for room_results in await asyncio.gather(*property_jobs):
            print(room_results)
            

In [94]:
# nest_asyncio.apply() patches asyncio so an event loop can be re-entered.
# Presumably needed because the notebook kernel already runs a loop, in which
# case a plain asyncio.run() would refuse to start -- confirm for this kernel.
nest_asyncio.apply()
# Run the scrape; main() prints its results and returns nothing.
asyncio.run(main())

Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/101
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/105
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/109
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/201
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/202
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/203
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/204
Scraping details from https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/205
Scraping details from https://www.leopal

In [None]:
async with aiohttp.ClientSession() as instance:
    async with instance.get('https://www.leopalace21.com/en/properties/chintai/hokkaido/ebetsu-shi-01217/bonita-dois-38530/101', headers=HEADERS) as response:
        content = await response.text()
        tree = html.fromstring(content)
        details = {
            'Label' : tree.xpath('//section[@class="Section_section-block__mDQ87"]/div[@class="page_head-bar__LmpS8"]//ul[@class="LabelList_label-list__JZVeD"]//text()'),
            'Name' : [x for x in tree.xpath('//section[@class="Section_section-block__mDQ87"]/div[@class="page_head-bar__LmpS8"]//h1[@class="page_heading__9lqpn"]//text()') if x.strip() != ''][0],
        }

In [123]:
# Display the fields extracted from the sample room page.
details

{'Label': ['Online Consultation・Contract', 'Double Discount'],
 'Name': 'leopalace Bonita Dois'}