In [8]:
import os
import pandas as pd
import asyncio
import json
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Function to load progress from a file
def load_progress():
    """Return the saved scraping checkpoint, or a blank one if none exists.

    The checkpoint is read from ``progress.json`` in the current working
    directory.

    Returns:
        A dict with the keys ``"current_batch"`` and ``"links"``. When no
        checkpoint file exists, ``current_batch`` is ``None`` and ``links``
        is a new empty list.
    """
    # Guard clause: nothing saved yet, so hand back a fresh empty state.
    if not os.path.exists("progress.json"):
        return {"current_batch": None, "links": []}

    with open("progress.json", "r") as checkpoint_file:
        saved_state = json.load(checkpoint_file)
    return saved_state

# Function to save progress to a file
def save_progress(current_batch, links):
    """Persist the scraping checkpoint to ``progress.json``.

    The data is first written to a temporary file next to the target and
    then moved into place with ``os.replace``. If the process is interrupted
    mid-write, the previous checkpoint stays intact instead of being left
    truncated (which would make the next ``json.load`` fail).

    Args:
        current_batch: Label of the batch the checkpoint refers to.
        links: List of company URLs collected so far.
    """
    progress = {"current_batch": current_batch, "links": links}
    tmp_path = "progress.json.tmp"
    with open(tmp_path, "w") as file:
        json.dump(progress, file)
    # Swap the finished file in as a single rename rather than rewriting
    # the live checkpoint in place.
    os.replace(tmp_path, "progress.json")

# Function to scrape all batch numbers
async def scrape_all_batch_numbers():
    """Return the batch labels listed in the "Batch" filter of the YC directory.

    Opens the companies page in headless Chromium, expands the filter list
    via its show-more link, and parses the rendered HTML with BeautifulSoup.

    Returns:
        A list of label strings, in the order they appear on the page.
    """
    facet_selector = 'div._facet_99gj3_85'

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()
        await tab.goto("https://www.ycombinator.com/companies")

        # The filter facets must be present before they can be expanded.
        await tab.wait_for_selector(facet_selector)

        # Expand the collapsed filter list so that all options are rendered.
        await tab.click('a._showMoreLess_99gj3_241')

        # NOTE(review): this waits on the same selector that already matched
        # above, so it presumably returns immediately and may not guarantee
        # the expanded options are present yet - confirm.
        await tab.wait_for_selector(facet_selector)

        # Snapshot the DOM while the browser is still alive.
        rendered_html = await tab.content()

    parsed = BeautifulSoup(rendered_html, "html.parser")
    option_rows = parsed.select('div._facet_99gj3_85:has(h4:-soup-contains("Batch")) div:has(label)')

    # Rows without a label span are skipped.
    label_spans = (
        row.find('span', class_='_label_99gj3_225') for row in option_rows
    )
    return [label.text for label in label_spans if label]

# Function to scrape company links
async def scrape_company_links(batch_num, scroll_pause_ms=6000):
    """Collect the profile URLs of every company listed for one YC batch.

    The directory page loads companies as the user scrolls, so the page is
    scrolled to the bottom repeatedly until its height stops growing.

    Args:
        batch_num: Batch label placed in the ``batch`` query parameter
            (for example ``"W14"``).
        scroll_pause_ms: Milliseconds to wait after each scroll so newly
            requested companies can render. Defaults to 6000.

    Returns:
        A list of absolute company URLs in page order. Anchors without an
        ``href`` attribute are skipped.
    """
    from urllib.parse import urljoin

    base_url = "https://www.ycombinator.com"

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(f"https://www.ycombinator.com/companies?batch={batch_num}")
        await page.wait_for_selector('a._company_99gj3_339')

        # Scroll down until the document height stops changing, i.e. no
        # further companies were appended by the last scroll.
        last_height = await page.evaluate("document.body.scrollHeight")
        while True:
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(scroll_pause_ms)  # Let the page load
                new_height = await page.evaluate("document.body.scrollHeight")
            except Exception as e:
                # Best effort: keep whatever has been loaded so far rather
                # than losing the whole batch.
                print(f"An error occurred: {str(e)}")
                print("Stopping scroll; keeping the links loaded so far.")
                break
            if new_height == last_height:
                break
            last_height = new_height

        html_content = await page.content()
        await browser.close()

    soup = BeautifulSoup(html_content, "html.parser")
    company_cards = soup.find_all("a", class_="_company_99gj3_339")

    # urljoin copes with both site-relative and already-absolute hrefs;
    # cards lacking an href are ignored instead of raising KeyError.
    companies_links = [
        urljoin(base_url, card["href"])
        for card in company_cards
        if card.get("href")
    ]

    return companies_links

async def main():
    """Scrape company links for every YC batch and write them to a CSV file.

    Progress is checkpointed after each batch, so an interrupted run can be
    restarted and will continue with the first batch that has not been
    completed yet. The result is written to ``ycombinator_companies.csv``
    in the current working directory.

    Raises:
        ValueError: If the batch recorded in the checkpoint is no longer
            present in the batch list scraped from the site.
    """
    # Load progress
    progress = load_progress()
    current_batch = progress["current_batch"]
    links = progress["links"]

    # The batch list is needed whether starting fresh or resuming.
    batch_numbers = await scrape_all_batch_numbers()

    if current_batch is None:
        # Start from the beginning if no progress is saved
        current_batch_index = 0
    else:
        # The checkpoint is written only after a batch has been scraped and
        # its links stored, so resume with the batch *after* the saved one.
        # Restarting at the saved batch would append its links a second time.
        try:
            current_batch_index = batch_numbers.index(current_batch) + 1
        except ValueError:
            raise ValueError(
                f"Saved batch {current_batch!r} was not found in the scraped "
                "batch list; delete progress.json to start over."
            ) from None

    for batch_num in batch_numbers[current_batch_index:]:
        print(f"Scraping batch {batch_num}...")
        batch_links = await scrape_company_links(batch_num)
        links.extend(batch_links)
        save_progress(batch_num, links)

    # Convert scraped data to DataFrame
    df = pd.DataFrame(links, columns=['links'])

    # Save data to CSV file
    file_path = os.path.join(os.getcwd(), "ycombinator_companies.csv")
    df.to_csv(file_path, index=False)
    print(f"Data has been scraped and saved to '{file_path}'.")

# Run the main function.
# NOTE: a bare top-level `await` is only accepted by an async-aware
# interactive session (e.g. IPython/Jupyter); a plain script would need
# asyncio.run(main()) instead.
await main()


Scraping batch W14...
Scraping batch S13...
Scraping batch W13...
Scraping batch S12...
Scraping batch W12...
Scraping batch S11...
Scraping batch W11...
Scraping batch S10...
Scraping batch W10...
Scraping batch S09...
Scraping batch W09...
Scraping batch S08...
Scraping batch W08...
Scraping batch S07...
Scraping batch W07...
Scraping batch S06...
Scraping batch W06...
Scraping batch S05...
Data has been scraped and saved to 'C:\Users\user\ycombinator_companies.csv'.
