In [116]:
# Import libraries
import os
import pandas as pd
import asyncio
import json
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Function to load progress from a file
def load_progress():
    """Return the saved link-scraping checkpoint, or an empty one.

    Reads ``progress.json`` relative to the current working directory.

    Returns:
        The decoded JSON content of the file when it exists; otherwise a
        fresh checkpoint ``{"current_batch": None, "links": []}``.
    """
    checkpoint_path = "progress.json"

    # Guard clause: nothing saved yet, so start from a blank checkpoint.
    if not os.path.exists(checkpoint_path):
        return {"current_batch": None, "links": []}

    with open(checkpoint_path, "r") as handle:
        return json.loads(handle.read())

# Function to save progress to a file
def save_progress(current_batch, links):
    """Write the link-scraping checkpoint to ``progress.json``.

    Args:
        current_batch: Label of the batch that was just processed.
        links: Company links collected so far.

    The file is created in (or overwritten in) the current working
    directory and holds a JSON object with the keys ``current_batch`` and
    ``links``.
    """
    checkpoint = dict(current_batch=current_batch, links=links)
    with open("progress.json", "w") as checkpoint_file:
        json.dump(checkpoint, checkpoint_file)

# Function to save company data to a file
def save_company_data(company_data):
    """Dump the scraped company records to ``company_data.json``.

    Args:
        company_data: JSON-serialisable collection of company records.

    The file is written (overwriting any existing one) in the current
    working directory with 4-space indentation, and the absolute
    destination is printed afterwards.
    """
    destination = os.path.join(os.getcwd(), "company_data.json")

    with open(destination, "w") as output:
        json.dump(company_data, output, indent=4)

    print(f"Company data has been saved to '{destination}'.")

# Function to save company data progress to a file
def save_company_progress(company_data):
    """Persist the company-scraping progress to ``company_data_progress.json``.

    Args:
        company_data: JSON-serialisable progress payload to store.

    The file is written (overwriting any existing one) in the current
    working directory with 4-space indentation, and the absolute
    destination is printed afterwards.
    """
    destination = os.path.join(os.getcwd(), "company_data_progress.json")

    with open(destination, "w") as output:
        json.dump(company_data, output, indent=4)

    print(f"Company data progress has been saved to '{destination}'.")

# Function to load company data progress from a file
def load_company_progress():
    """Return the saved company-scraping progress, or an empty list.

    Reads ``company_data_progress.json`` relative to the current working
    directory.

    Returns:
        The decoded JSON content of the file when it exists; otherwise
        ``[]``.
    """
    progress_file = "company_data_progress.json"

    # Guard clause: no progress recorded yet.
    if not os.path.exists(progress_file):
        return []

    with open(progress_file, "r") as handle:
        return json.loads(handle.read())

# Function to scrape all batch numbers
async def scrape_all_batch_numbers():
    """Collect the batch labels shown in the YC company directory filters.

    Opens https://www.ycombinator.com/companies in a Chromium browser,
    clicks the show-more/less link of the filter panel, and parses the
    resulting page HTML with BeautifulSoup.

    Returns:
        A list with the text of every batch label span found, in document
        order. Rows without such a span are skipped.
    """
    facet_selector = 'div._facet_99gj3_85'

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()
        await tab.goto("https://www.ycombinator.com/companies")

        # The filter panel has to be rendered before its link can be clicked.
        await tab.wait_for_selector(facet_selector)
        await tab.click('a._showMoreLess_99gj3_241')

        # NOTE(review): this waits on the same selector that was already
        # present before the click, so it likely returns immediately --
        # confirm whether a more specific post-expansion selector is needed.
        await tab.wait_for_selector(facet_selector)

        rendered_html = await tab.content()

    document = BeautifulSoup(rendered_html, "html.parser")
    batch_rows = document.select('div._facet_99gj3_85:has(h4:-soup-contains("Batch")) div:has(label)')

    # Look up the label span of each row lazily and keep only the hits.
    label_spans = (
        row.find('span', class_='_label_99gj3_225') for row in batch_rows
    )
    return [label.text for label in label_spans if label]
    
# Function to scrape company links
async def scrape_company_links(batch_num):
    """Collect the company page URLs listed for one batch.

    Opens the directory page filtered by ``batch_num`` in a Chromium
    browser and keeps scrolling to the bottom until the document height
    stops growing, then parses the page HTML.

    Args:
        batch_num: Batch label interpolated into the ``batch`` query
            parameter of the directory URL.

    Returns:
        A list of absolute ``https://www.ycombinator.com...`` URLs, one per
        company card anchor found on the page.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()
        await tab.goto(f"https://www.ycombinator.com/companies?batch={batch_num}")
        await tab.wait_for_selector('a._company_99gj3_339')

        # Keep scrolling until a scroll no longer increases the page height.
        previous_height = await tab.evaluate("document.body.scrollHeight")
        reached_bottom = False
        while not reached_bottom:
            try:
                await tab.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await tab.wait_for_timeout(6000)  # Give new cards time to load
                current_height = await tab.evaluate("document.body.scrollHeight")
                reached_bottom = current_height == previous_height
                previous_height = current_height
            except Exception as e:
                # Any failure while scrolling ends the loop; whatever is on
                # the page at that point is still parsed below.
                print(f"An error occurred: {str(e)}")
                print("Saving links scraped so far and retrying...")
                break

        rendered_html = await tab.content()

    document = BeautifulSoup(rendered_html, "html.parser")
    return [
        f"https://www.ycombinator.com{anchor['href']}"
        for anchor in document.find_all("a", class_="_company_99gj3_339")
    ]
    
# Function to scrape detailed company information
def _select_text(node, selector):
    """Return the stripped text of the first match for *selector*, or None."""
    match = node.select_one(selector)
    return match.text.strip() if match is not None else None


def _parse_founder(founder_element):
    """Build one founder record (name, role, biography, links) from a card."""
    founder = {}

    heading = founder_element.select_one('div.flex-grow h3.text-lg.font-bold')
    if heading is not None:
        founder['Name'] = heading.text.strip()
        # The role is read from the same heading: the text after the first comma.
        heading_text = heading.get_text()
        founder['Role'] = (
            heading_text.split(',')[1].strip() if ',' in heading_text else None
        )

    biography = founder_element.select_one('div.flex-grow p')
    if biography is not None:
        founder['Biography'] = biography.text.strip()

    for anchor in founder_element.select('a'):
        href = anchor.get('href')
        if not href:
            continue
        if 'linkedin.com' in href:
            founder['LinkedIn Profile'] = href
        elif 'twitter.com' in href:
            founder['Twitter Profile'] = href

    return founder


def _parse_company_html(html_content):
    """Extract the company record from the HTML of a company page."""
    soup = BeautifulSoup(html_content, "html.parser")

    # Label/value rows share these classes; the label text tells them apart.
    detail_row = 'div.flex.flex-row.justify-between'

    tagline = soup.find('h3')
    website = soup.select_one('a[href].mb-2.whitespace-nowrap')

    # Raw string so the CSS escapes reach the selector engine untouched.
    # '#' must be escaped (otherwise it starts an ID selector) and 'px-3' /
    # 'py-[3px]' are two separate classes.
    # NOTE(review): class list taken from the original selector -- confirm
    # against the live markup.
    industry_tags = [
        pill.text.strip()
        for pill in soup.select(
            r'a div.yc-tw-Pill.rounded-sm.bg-\[\#E6E4DC\].uppercase'
            r'.tracking-widest.px-3.py-\[3px\].text-\[12px\].font-thin'
        )
    ]

    company_info = {
        'Name': _select_text(soup, 'h1.font-extralight'),
        'Tagline': tagline.get_text() if tagline is not None else None,
        'Description': _select_text(soup, 'p.whitespace-pre-line'),
        'Batch': _select_text(
            soup, r'div.flex.flex-row.items-center.gap-\[6px\] span'
        ),
        'Company Type': _select_text(
            soup, 'div.flex.flex-row.items-center.justify-between'
        ),
        'Industry Tags': industry_tags or None,
        'Location': _select_text(
            soup, f'{detail_row}:-soup-contains("Location")'
        ),
        'Website URL': website['href'] if website is not None else None,
        'Year Founded': _select_text(
            soup, f'{detail_row}:-soup-contains("Founded")'
        ),
        'Team Size': _select_text(
            soup, f'{detail_row}:-soup-contains("Team Size")'
        ),
    }

    # Social media links: only the networks that are present get a key.
    for anchor in soup.select('div.space-x-2 a'):
        href = anchor.get('href', '')
        if 'linkedin.com' in href:
            company_info['LinkedIn Profile'] = href
        elif 'twitter.com' in href:
            company_info['Twitter Handle'] = href
        elif 'facebook.com' in href:
            company_info['Facebook Page'] = href

    company_info['Founders'] = [
        _parse_founder(card)
        for card in soup.select(
            r'div.flex.flex-row.flex-col.items-start.gap-3.md\:flex-row'
        )
    ]

    return company_info


async def scrape_company_info(company_url):
    """Scrape the detail page of a single company.

    Navigates to ``company_url`` in a Chromium browser (up to three
    attempts, 30 s timeout each), waits for the company heading, and parses
    the page HTML.

    Args:
        company_url: Absolute URL of the company page.

    Returns:
        A dict with the keys ``Name``, ``Tagline``, ``Description``,
        ``Batch``, ``Company Type``, ``Industry Tags``, ``Location``,
        ``Website URL``, ``Year Founded``, ``Team Size`` and ``Founders``,
        plus ``LinkedIn Profile`` / ``Twitter Handle`` / ``Facebook Page``
        when such links are found. A field whose element is missing from
        the page is ``None`` instead of raising. Returns ``None`` when all
        navigation attempts fail or the browser ends up on a different URL.

    Raises:
        Whatever Playwright raises when the heading selector does not
        appear in time (not caught here).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()

            # Retry logic
            for attempt in range(3):  # Retry up to 3 times
                try:
                    await page.goto(company_url, timeout=30000)  # 30 seconds
                    break  # Navigation succeeded
                except Exception as e:
                    print(f"Attempt {attempt + 1} failed: {str(e)}")
                    if attempt < 2:
                        print("Retrying...")
                    else:
                        print("Maximum attempts reached. Skipping this URL.")
                        return None

            # Exact comparison: a redirect to any other URL counts as failure.
            if page.url != company_url:
                print("Page navigation failed. Skipping this URL.")
                return None

            # Wait for the main content to load
            await page.wait_for_selector('h1.font-extralight')

            # Get the HTML content after all content is loaded
            html_content = await page.content()
        finally:
            # Shut the browser down on every exit path, including early returns.
            await browser.close()

    return _parse_company_html(html_content)

async def main(max_companies=4665):
    """Run the full scrape: collect company links per batch, then details.

    Both phases are resumable: batch/link progress is read from and written
    to ``progress.json`` via ``load_progress``/``save_progress``, and the
    list of already-scraped company links is kept via
    ``load_company_progress``/``save_company_progress``.

    Args:
        max_companies: Upper bound on the number of collected links and on
            the number of company records. Defaults to 4665.

    Side effects:
        Writes ``progress.json``, ``company_data.json`` and
        ``company_data_progress.json`` in the current working directory.
        The two company files are written even if the detail loop is
        interrupted.
    """
    # Load progress
    progress = load_progress()
    current_batch = progress["current_batch"]
    links = progress["links"]

    batch_numbers = await scrape_all_batch_numbers()
    if current_batch is None:
        # Start from the beginning if no progress is saved
        current_batch_index = 0
    else:
        try:
            # Continue with the batch after the last completed one
            current_batch_index = batch_numbers.index(current_batch) + 1
        except ValueError:
            # The checkpointed batch is no longer listed; start over and
            # rely on the de-duplication below to avoid repeated links.
            print(f"Saved batch {current_batch!r} was not found; restarting from the first batch.")
            current_batch_index = 0

    known_links = set(links)
    # Links restored from the checkpoint count towards the cap as well.
    pending_batches = (
        batch_numbers[current_batch_index:] if len(links) < max_companies else []
    )
    for batch_num in pending_batches:
        print(f"Scraping batch {batch_num}...")
        batch_links = await scrape_company_links(batch_num)
        for url in batch_links:
            if url not in known_links:
                known_links.add(url)
                links.append(url)
        save_progress(batch_num, links)

        if len(links) >= max_companies:
            print(f"Total {max_companies} companies' information has been scraped.")
            break

    # Resume scraping company information
    company_data_progress = load_company_progress()  # Links already scraped
    done_links = set(company_data_progress)  # O(1) membership checks

    # When resuming, keep the records of earlier runs instead of overwriting
    # them with only this run's results.
    company_data = []
    data_path = os.path.join(os.getcwd(), "company_data.json")
    if company_data_progress and os.path.exists(data_path):
        with open(data_path, "r") as file:
            company_data = [
                record for record in json.load(file) if record is not None
            ]

    try:
        for link in links:
            if link in done_links:  # Company information already scraped
                continue

            print(f"Scraping company information from {link}...")
            try:
                company_info = await scrape_company_info(link)
            except Exception as e:
                print(f"An error occurred while scraping {link}: {str(e)}")
                print("Saving scraped data so far and stopping. Re-run to resume.")
                break

            if company_info is None:
                # Navigation failed: leave the link unmarked so that a later
                # run tries it again, and keep None out of the data file.
                continue

            company_data.append(company_info)
            company_data_progress.append(link)  # Add link to progress
            done_links.add(link)
            if len(company_data) >= max_companies:
                print(f"Total {max_companies} companies' information has been scraped.")
                break
    finally:
        # Runs on interruption too, so finished work is never lost.
        # Save company data to JSON file
        save_company_data(company_data)

        # Save company data progress to JSON file
        save_company_progress(company_data_progress)

# Run the main function.
# NOTE: a top-level `await` is only valid where an event loop is already
# running for the cell (this file looks like an IPython/Jupyter cell export).
# In a plain Python script this line is a syntax error; there the usual
# entry point would be `asyncio.run(main())`.
await main()

 

Scraping company information from https://www.ycombinator.com/companies/trueclaim...
Attempt 1 failed: Page.goto: Timeout 30000ms exceeded.
Call log:
navigating to "https://www.ycombinator.com/companies/trueclaim", waiting until "load"

Retrying...
Scraping company information from https://www.ycombinator.com/companies/cocrafter...
Scraping company information from https://www.ycombinator.com/companies/reform...
Scraping company information from https://www.ycombinator.com/companies/piramidal...
Scraping company information from https://www.ycombinator.com/companies/junction-bioscience...
Scraping company information from https://www.ycombinator.com/companies/oddsview...
Scraping company information from https://www.ycombinator.com/companies/silogy...
Scraping company information from https://www.ycombinator.com/companies/senso...
Scraping company information from https://www.ycombinator.com/companies/eris-biotech...
Scraping company information from https://www.ycombinator.com/compani