In [1]:
from datetime import datetime
import time
import pandas as pd
import pickle as pk
from bs4 import BeautifulSoup
import requests

In [2]:
# Load the saved listing URLs (one per line) and report how many were found.
with open('living_insider_condo_links.txt') as f:
    raw_links_text = f.read()
# splitlines() drops the line terminators, so no trailing "\n" stays on a URL.
condo_links_all = raw_links_text.splitlines()
print(f"Total condo links: {len(condo_links_all)}")

Total condo links: 485


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import concurrent.futures

# Scraper class to manage the session and retrieval
class Scraper:
    """Download listing pages over one shared HTTP session and extract fields.

    A single ``requests.Session`` is created per instance so that connections
    are reused between requests.

    NOTE(review): ``process_links`` calls ``retrieve`` on one instance from
    many worker threads. requests does not document ``Session`` as
    thread-safe - confirm this is acceptable or give each worker its own
    session.
    """

    # Default per-request timeout in seconds. requests waits indefinitely
    # unless a timeout is passed, so one stalled server would otherwise pin a
    # worker thread forever.
    timeout = 30

    def __init__(self, timeout=None):
        """Create the shared session.

        Args:
            timeout: Optional per-request timeout in seconds (anything
                ``requests`` accepts for ``timeout``). ``None`` keeps the
                class-level default.
        """
        self.session = requests.Session()  # Reuse connections for efficiency
        if timeout is not None:
            self.timeout = timeout

    @staticmethod
    def _parse(soup):
        """Pull ``(name, sqm, price)`` out of a parsed listing page.

        Every field falls back to its placeholder string ("No name",
        "No sqm data", "No price") when either the outer container or the
        inner element is missing, so a partially rendered page still yields
        a row instead of raising.
        """
        # Extracting the sqm container: the value is the text of its last <span>.
        sqm_container = soup.find('div', class_="sc-ejnaz6-21 dWMjEs")
        spans = sqm_container.find_all('span') if sqm_container is not None else []
        last_span = spans[-1].get_text().strip() if spans else "No sqm data"

        # Extract other information
        name = "No name"
        name_elem = soup.find(class_="sc-ejnaz6-2 fuLHNZ")
        if name_elem is not None:
            name_text = name_elem.find(class_="sc-ejnaz6-3 gSIBgi")
            if name_text is not None:
                name = name_text.get_text()

        price = "No price"
        price_elem = soup.find(class_="sc-ejnaz6-5 hgrkiv")
        if price_elem is not None:
            sale_price = price_elem.find(class_="sale-price")
            if sale_price is not None:
                # Drop the currency sign and thousands separators: "฿1,000" -> "1000".
                price = sale_price.get_text().strip('฿').replace(',', "")

        return (name, last_span, price)

    def retrieve(self, link):
        """Fetch ``link`` and return ``(name, sqm, price)``, or ``None`` on failure.

        Args:
            link: URL of a listing page.

        Returns:
            A 3-tuple of strings, or ``None`` when the request fails, times
            out, returns an HTTP error status, or the page cannot be parsed.
            The error is printed rather than raised so that one bad link does
            not abort a whole batch.
        """
        try:
            page = self.session.get(link, timeout=self.timeout)
            # An error page (404/5xx) would otherwise parse "successfully"
            # into a row made only of placeholders and be counted as valid.
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')
            return self._parse(soup)
        except Exception as e:
            # Deliberately broad: this is a best-effort scraper and callers
            # treat None as "skip this link".
            print(f"Error processing {link}: {e}")
            return None

# Function to process links using threading
def process_links(links, max_workers=20):
    """Scrape every link concurrently and collect the successful results.

    Args:
        links: Sized collection of listing URLs (``len(links)`` is used for
            the progress messages).
        max_workers: Number of worker threads in the pool.

    Returns:
        A list of the non-``None`` values returned by ``Scraper.retrieve``.
        Results are appended as requests finish, so the order follows
        completion order, not the order of ``links``.
    """
    scraper = Scraper()
    condo_list = []
    total_links = len(links)

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(scraper.retrieve, link) for link in links]
            for i, future in enumerate(concurrent.futures.as_completed(futures), 1):
                data = future.result()
                if data is not None:
                    condo_list.append(data)
                if i % 1000 == 0:  # Print progress every 1000 links
                    # '\r' rewinds to the start of the line so each update
                    # overwrites the previous one instead of scrolling.
                    print(f"Processed {i}/{total_links} links", end='\r', flush=True)
    finally:
        # Release the session's pooled connections even if a worker raised.
        scraper.session.close()

    print(f"Processing complete. Total links processed: {total_links}")
    return condo_list

# Function to read links from a text file
def read_links_from_file(file_path):
    """Read one URL per line from ``file_path``.

    Args:
        file_path: Path of a text file containing one link per line.

    Returns:
        A list of links with surrounding whitespace removed. Blank and
        whitespace-only lines are skipped; otherwise each one would become an
        empty-string "link" and cost a failed request later on.
    """
    with open(file_path, 'r') as file:
        # Iterate the file object directly rather than materialising every
        # line with readlines() first.
        stripped_lines = (line.strip() for line in file)
        links = [link for link in stripped_lines if link]
    return links

# Main function to execute the scraping and save the results
def main(file_path='condo_links_all_faster.txt', output_path="NewPrice.csv"):
    """Scrape every link listed in ``file_path`` and write the rows to a CSV.

    Args:
        file_path: Text file with one listing URL per line. Defaults to the
            file this notebook has always used.
        output_path: Destination CSV. Defaults to ``"NewPrice.csv"``.

    The CSV has the columns Name, SQM and Price and is written with the
    ``utf-8-sig`` encoding (UTF-8 with a leading byte-order mark).

    NOTE(review): an earlier cell loads 'living_insider_condo_links.txt' into
    a global that is also called ``condo_links_all``; this function reads its
    own file and never uses that global - confirm which file is intended.
    """
    condo_links_all = read_links_from_file(file_path)

    # Timing starts after the link file is read, so it covers scraping and
    # saving only.
    start_time = datetime.now()

    # Process the links and get the condo details
    condo_list = process_links(condo_links_all)

    # Create a DataFrame and save to CSV. The frame's own column names become
    # the header row, so they do not need to be passed again to to_csv.
    columns = ['Name', 'SQM', 'Price']
    df = pd.DataFrame(condo_list, columns=columns)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"Data saved, total valid entries: {len(condo_list)}")
    print(f'Time elapsed (hh:mm:ss.ms): {datetime.now() - start_time}')

# Entry point: start the scrape only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing complete. Total links processed: 84000
Data saved, total valid entries: 84000
Time elapsed (hh:mm:ss.ms): 2:12:45.870510
