In [1]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm  # Progress bar
import nest_asyncio  # For environments like Jupyter notebooks
import random  # For random delays

# Patch asyncio so that nested event loops are allowed (only needed where a loop
# is already running, e.g. Jupyter notebooks).
# NOTE: this call sits at module top level, so it runs unconditionally every time
# the file is executed or imported.
nest_asyncio.apply()

# Asynchronous Scraper class
class AsyncScraper:
    """Download listing pages concurrently and extract name, area and price.

    Typical use is ``await AsyncScraper().scrape_links(urls)``; afterwards
    ``failed_links`` holds every URL that could not be downloaded.
    """

    # Total attempts per URL (first try plus one retry); kept low for speed.
    MAX_ATTEMPTS = 2
    # Overall time budget for a single request, in seconds.
    REQUEST_TIMEOUT_SECONDS = 5

    def __init__(self):
        # Not used by any method of this class; kept so that existing code
        # reading the attribute keeps working.
        self.session = None
        # URLs whose download failed after all attempts.
        self.failed_links = []

    @staticmethod
    def _clean_price(raw_price):
        """Return *raw_price* without surrounding whitespace, '฿' sign or commas."""
        # Whitespace is stripped first: str.strip('฿') alone leaves the sign
        # in place whenever the text is padded with spaces or newlines.
        return raw_price.strip().strip('฿').replace(',', "").strip()

    @staticmethod
    def _parse_listing(html):
        """Extract ``(name, sqm, price)`` from the HTML of one listing page.

        Each field falls back to a placeholder string ("No name",
        "No sqm data", "No price") when its element is missing.
        """
        soup = BeautifulSoup(html, 'lxml')

        # The area is the text of the last <span> inside the sqm container.
        # An existing container without any <span> must not raise IndexError.
        sqm_container = soup.find('div', class_="sc-ejnaz6-21 dWMjEs")
        spans = sqm_container.find_all('span') if sqm_container is not None else []
        sqm = spans[-1].get_text().strip() if spans else "No sqm data"

        name_elem = soup.select_one(".sc-ejnaz6-2 .sc-ejnaz6-3")
        name = name_elem.get_text() if name_elem is not None else "No name"

        price_elem = soup.select_one(".sc-ejnaz6-5 .sale-price")
        if price_elem is not None:
            price = AsyncScraper._clean_price(price_elem.get_text())
        else:
            price = "No price"

        return (name, sqm, price)

    async def fetch(self, session, url):
        """Download *url* with *session* and return its parsed fields.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Address of the listing page.

        Returns:
            A ``(name, sqm, price)`` tuple, or ``None`` when the page could not
            be downloaded; in that case *url* is appended to ``failed_links``.
        """
        # An explicit ClientTimeout replaces the deprecated bare-number form.
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)

        for attempt in range(self.MAX_ATTEMPTS):
            try:
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()
                    html = await response.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if attempt < self.MAX_ATTEMPTS - 1:
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
                    continue
                self.failed_links.append(url)
                return None
            except UnicodeDecodeError:
                # An undecodable body will not improve on retry; record it
                # instead of letting it abort the whole batch.
                self.failed_links.append(url)
                return None

            # Parse outside the try/async-with: the connection is already
            # released and parser errors are not mistaken for network errors.
            return self._parse_listing(html)

        return None  # Only reachable when MAX_ATTEMPTS is 0.

    async def scrape_links(self, urls, max_concurrency=100):
        """Fetch every URL in *urls* and return the successfully parsed rows.

        Args:
            urls: Iterable of page addresses.
            max_concurrency: Maximum number of fetches in flight at once.

        Returns:
            List of ``(name, sqm, price)`` tuples in completion order (not the
            order of *urls*). URLs that failed are left out of the list and
            recorded in ``failed_links``.
        """
        async with aiohttp.ClientSession(headers={'User-Agent': 'Mozilla/5.0'}) as session:
            semaphore = asyncio.Semaphore(max_concurrency)  # Limit concurrency

            async def bound_fetch(url):
                async with semaphore:
                    # Random delay to reduce the chance of being rate limited.
                    await asyncio.sleep(random.uniform(0.1, 0.5))
                    return await self.fetch(session, url)

            tasks = [bound_fetch(url) for url in urls]

            # tqdm shows a progress bar as the fetches complete.
            results = []
            for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing"):
                result = await future
                if result is not None:
                    results.append(result)

            return results

# Helper function to read links from file
def read_links_from_file(file_path):
    """Return the non-blank lines of *file_path*, stripped of whitespace.

    Args:
        file_path: Path of a text file holding one link per line.

    Returns:
        List of links in file order; blank and whitespace-only lines are
        skipped.
    """
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
    # mark, which str.strip() would otherwise leave glued to the first link.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [link for link in stripped_lines if link]

# Main function to execute the scraping
async def run_scraper():
    """Scrape every link listed in 'condo_links_all.txt' and write the results.

    Parsed rows are written to "propertynewprice.csv"; URLs that could not be
    fetched are written to "Error_fetching.csv" (only when there are any).
    A count of the saved rows and the elapsed time are printed.
    """
    column_names = ['Name', 'SQM', 'Price']

    links = read_links_from_file('condo_links_all.txt')
    scraper = AsyncScraper()

    # Timing covers the scrape and the CSV writes, not reading the link file.
    started_at = datetime.now()

    # scrape_links displays its own progress bar while it runs.
    rows = await scraper.scrape_links(links)

    # Successful rows go to the main CSV.
    listings = pd.DataFrame(rows, columns=column_names)
    listings.to_csv(
        "propertynewprice.csv",
        header=column_names,
        index=False,
        encoding='utf-8-sig',
    )
    print(f"Data saved, total valid entries: {len(rows)}")

    # Failed URLs get their own CSV, written only when the list is non-empty.
    failures = scraper.failed_links
    if failures:
        failed_frame = pd.DataFrame(failures, columns=["Failed Links"])
        failed_frame.to_csv("Error_fetching.csv", index=False)

    elapsed = datetime.now() - started_at
    print(f'Time elapsed (hh:mm:ss.ms): {elapsed}')

# Function to check if the event loop is already running and run the scraper
def main():
    """Run the scraper, whether or not an event loop is already running.

    Returns:
        The scheduled ``asyncio.Task`` when called from inside a running event
        loop (e.g. a Jupyter notebook); the task is not awaited here. Otherwise
        the scraper runs to completion and its result (``None``) is returned.
    """
    # Only the loop detection is guarded. RuntimeError is how
    # get_running_loop() reports "no loop running"; a RuntimeError raised
    # while scraping must propagate rather than start a second full run.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is not None:
        # Already inside an event loop: schedule the scraper on it.
        return running_loop.create_task(run_scraper())

    # No loop is running: asyncio.run() creates one and closes it afterwards.
    return asyncio.run(run_scraper())

# Run the scraper only when this file is executed directly (not when imported).
# The value returned by main() is discarded here.
if __name__ == "__main__":
    main()


Processing: 100%|██████████| 111314/111314 [3:12:13<00:00,  9.65it/s] 


Data saved, total valid entries: 29548
Time elapsed (hh:mm:ss.ms): 3:12:13.763106
