In [14]:
import os
import pandas as pd
import asyncio
import json
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Function to load progress from a file
def load_progress():
    if os.path.exists("progress.json"):
        with open("progress.json", "r") as file:
            return json.load(file)
    else:
        return {"current_batch": None, "links": []}

# Function to save progress to a file
def save_progress(current_batch, links):
    progress = {"current_batch": current_batch, "links": links}
    with open("progress.json", "w") as file:
        json.dump(progress, file)

# Function to save company data to a file
def save_company_data(company_data):
    file_path = os.path.join(os.getcwd(), "company_data.json")
    with open(file_path, "w") as file:
        json.dump(company_data, file, indent=4)
    print(f"Company data has been saved to '{file_path}'.")

# Function to scrape all batch numbers
async def scrape_all_batch_numbers():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://www.ycombinator.com/companies")
        
        # Wait for the main content to load
        await page.wait_for_selector('div._facet_99gj3_85')
        
        # Click on the "More options" link to load additional content
        await page.click('a._showMoreLess_99gj3_241')
        
        # Wait for additional content to load
        await page.wait_for_selector('div._facet_99gj3_85')
        
        # Get the HTML content after all content is loaded
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    batch_elements = soup.select('div._facet_99gj3_85:has(h4:-soup-contains("Batch")) div:has(label)')

    batch_numbers = []
    for batch in batch_elements:
        span = batch.find('span', class_='_label_99gj3_225')  # Find the span element
        if span:  # Check if span element exists
            batch_numbers.append(span.text)
    
    return batch_numbers

async def scrape_company_links(batch_num):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(f"https://www.ycombinator.com/companies?batch={batch_num}")
        await page.wait_for_selector('a._company_99gj3_339')
        # Scroll down the page until there are no more new companies
        last_height = await page.evaluate("document.body.scrollHeight")
        while True:
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(6000)  # Let the page load
                new_height = await page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            except Exception as e:
                print(f"An error occurred: {str(e)}")
                print("Saving links scraped so far and retrying...")
                break
        
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    company_cards = soup.find_all("a", class_="_company_99gj3_339")

    companies_links = []
    for card in company_cards:
        company_url = f"https://www.ycombinator.com{card['href']}"
        companies_links.append(company_url)

    return companies_links

# Function to scrape company information
async def scrape_company_info(company_url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(company_url)
        
        # Wait for the main content to load
        await page.wait_for_selector('h1.font-extralight')
        
        # Get the HTML content after all content is loaded
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    
    # Extract company information
    company_info = {}
    company_info['Name'] = soup.select_one('h1.font-extralight').text.strip()
    company_info['Tagline'] = soup.select('div.prosemax-w-full div h3').get_text(strip=True) if soup.find('div.prosemax-w-full h3') else None
    company_info['Description'] = soup.select_one('p.whitespace-pre-line').text.strip()
    company_info['Batch'] = soup.select_one('div.flex.flex-row.items-center.gap-\[6px\] span').text.strip()
    company_info['Company Type'] = soup.select_one('div.flex.flex-row.items-center.justify-between').text.strip()
    company_info['Industry Tags'] = [tag.text.strip() for tag in soup.select('div.align-center flex flex-row flex-wrap gap-x-2 gap-y-2 a:has(div.yc-tw-Pill.rounded-sm.bg-\[#E6E4DC\].uppercase.tracking-widest.px-3py-\[3px\].text-\[12px\].font-thin)')]
    company_info['Location'] = soup.select_one('div.flex.flex-row.justify-between:contains("Location")').text.strip()
    company_info['Website URL'] = soup.select_one('a[href].mb-2.whitespace-nowrap')['href']
    company_info['Year Founded'] = soup.select_one('div.flex.flex-row.justify-between:contains("Founded")').text.strip()
    company_info['Team Size'] = soup.select_one('div.flex.flex-row.justify-between:contains("Team Size")').text.strip()
    
    # Extract social media links
    social_links = soup.select('div.space-x-2 a')
    for link in social_links:
        if 'linkedin.com' in link['href']:
            company_info['LinkedIn Profile'] = link['href']
        elif 'twitter.com' in link['href']:
            company_info['Twitter Handle'] = link['href']
        elif 'facebook.com' in link['href']:
            company_info['Facebook Page'] = link['href']
    
    # Extract founder information
    founders = []
    founder_elements = soup.select('div.space-y-5')
    for founder_element in founder_elements:
        founder = {}
        founder['Name'] = founder_element.select_one('div.flex-grow h3.text-lg.font-bold').text.strip()
        founder['Role'] =  founder_element.get_text().split(',')[1].strip() if (founder_element := soup.select_one('div.flex-grow h3.text-lg.font-bold')) and ',' in founder_element.get_text() else None
        founder['Biography'] = founder_element.select_one('p.prose.max-w-full.whitespace-pre-line').text.strip() if founder_element.select_one('p.prose.max-w-full.whitespace-pre-line') else None
        founder_links = founder_element.select('a')
        for link in founder_links:
            if 'linkedin.com' in link['href']:
                founder['LinkedIn Profile'] = link['href']
            elif 'twitter.com' in link['href']:
                founder['Twitter Profile'] = link['href']
        founders.append(founder)
    
    company_info['Founders'] = founders
    return company_info

async def main():
    # Load progress
    progress = load_progress()
    current_batch = progress["current_batch"]
    links = progress["links"]

    if current_batch is None:
        # Start from the beginning if no progress is saved
        batch_numbers = await scrape_all_batch_numbers()
        current_batch_index = 0
    else:
        # Continue from the last batch if progress is saved
        batch_numbers = await scrape_all_batch_numbers()
        current_batch_index = batch_numbers.index(current_batch)

    for batch_num in batch_numbers[current_batch_index:]:
        print(f"Scraping batch {batch_num}...")
        batch_links = await scrape_company_links(batch_num)
        links.extend(batch_links)
        save_progress(batch_num, links)

    # Scrape company information
    company_data = []
    for link in links:
        print(f"Scraping company information from {link}...")
        try:
            company_info = await scrape_company_info(link)
            company_data.append(company_info)
        except Exception as e:
            print(f"An error occurred while scraping {link}: {str(e)}")
            print("Saving scraped data so far and retrying...")
            break
    
    # Save company data to JSON file
    save_company_data(company_data)

# Run the main function
await main()



Scraping batch S05...
Scraping company information from https://www.ycombinator.com/companies/alacrity...
Scraping company information from https://www.ycombinator.com/companies/parcelbio...
Scraping company information from https://www.ycombinator.com/companies/k-scale-labs...
An error occurred while scraping https://www.ycombinator.com/companies/k-scale-labs: Page.goto: Timeout 30000ms exceeded.
Call log:
navigating to "https://www.ycombinator.com/companies/k-scale-labs", waiting until "load"

Saving scraped data so far and retrying...
Company data has been saved to 'C:\Users\user\company_data.json'.
