In [9]:
#importing libraries
import os
import pandas as pd
import asyncio
import json
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Function to load progress from a file
def load_progress():
    if os.path.exists("progress.json"):
        with open("progress.json", "r") as file:
            return json.load(file)
    else:
        return {"current_batch": None, "links": []}

# Function to save progress to a file
def save_progress(current_batch, links):
    progress = {"current_batch": current_batch, "links": links}
    with open("progress.json", "w") as file:
        json.dump(progress, file)

# Function to save company data to a file
def save_company_data(company_data):
    file_path = os.path.join(os.getcwd(), "company_data.json")
    with open(file_path, "w") as file:
        json.dump(company_data, file, indent=4)
    print(f"Company data has been saved to '{file_path}'.")

# Function to save company data progress to a file
def save_company_progress(company_data):
    progress_path = os.path.join(os.getcwd(), "company_data_progress.json")
    with open(progress_path, "w") as file:
        json.dump(company_data, file, indent=4)
    print(f"Company data progress has been saved to '{progress_path}'.")

# Function to load company data progress from a file
def load_company_progress():
    if os.path.exists("company_data_progress.json"):
        with open("company_data_progress.json", "r") as file:
            return json.load(file)
    else:
        return []

# Function to scrape all batch numbers
async def scrape_all_batch_numbers():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://www.ycombinator.com/companies")
        
        # Wait for the main content to load
        await page.wait_for_selector('div._facet_99gj3_85')
        
        # Click on the "More options" link to load additional content
        await page.click('a._showMoreLess_99gj3_241')
        
        # Wait for additional content to load
        await page.wait_for_selector('div._facet_99gj3_85')
        
        # Get the HTML content after all content is loaded
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    batch_elements = soup.select('div._facet_99gj3_85:has(h4:-soup-contains("Batch")) div:has(label)')

    batch_numbers = []
    for batch in batch_elements:
        span = batch.find('span', class_='_label_99gj3_225')  # Find the span element
        if span:  # Check if span element exists
            batch_numbers.append(span.text)
    
    return batch_numbers
    
#function to scrape company links
async def scrape_company_links(batch_num):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(f"https://www.ycombinator.com/companies?batch={batch_num}")
        await page.wait_for_selector('a._company_99gj3_339')
        # Scroll down the page until there are no more new companies
        last_height = await page.evaluate("document.body.scrollHeight")
        while True:
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(6000)  # Let the page load
                new_height = await page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            except Exception as e:
                print(f"An error occurred: {str(e)}")
                print("Saving links scraped so far and retrying...")
                break
        
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    company_cards = soup.find_all("a", class_="_company_99gj3_339")

    companies_links = []
    for card in company_cards:
        company_url = f"https://www.ycombinator.com{card['href']}"
        companies_links.append(company_url)

    return companies_links
    
#function to scrape detailed company information
async def scrape_company_info(company_url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        # Retry logic
        for attempt in range(3):  # Retry up to 3 times
            try:
                await page.goto(company_url, timeout=30000)  # Set timeout to 30 seconds
                break  # Break the loop if successful
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:
                    print("Retrying...")
                else:
                    print("Maximum attempts reached. Skipping this URL.")
                    return None  # Return None if maximum attempts reached
        
        if page.url != company_url:
            print("Page navigation failed. Skipping this URL.")
            return None

        # Wait for the main content to load
        await page.wait_for_selector('h1.font-extralight')

        # Get the HTML content after all content is loaded
        html_content = await page.content()

    soup = BeautifulSoup(html_content, "html.parser")
    
    # Extract company information
    company_info = {}
    company_info['Name'] = soup.select_one('h1.font-extralight').text.strip()
    company_info['Tagline'] = soup.find('h3').get_text()
    company_info['Description'] = soup.select_one('p.whitespace-pre-line').text.strip()
    company_info['Batch'] = soup.select_one('div.flex.flex-row.items-center.gap-\[6px\] span').text.strip()
    company_info['Company Type'] = soup.select_one('div.flex.flex-row.items-center.justify-between').text.strip()
    company_info['Industry Tags'] = [tag.text.strip() for tag in soup.select('a div.yc-tw-Pill.rounded-sm.bg-\[#E6E4DC\].uppercase.tracking-widest.px-3py-\[3px\].text-\[12px\].font-thin')] if soup.select('a div.yc-tw-Pill.rounded-sm.bg-\[#E6E4DC\].uppercase.tracking-widest.px-3py-\[3px\].text-\[12px\].font-thin') else None
    company_info['Location'] = soup.select_one('div.flex.flex-row.justify-between:contains("Location")').text.strip()
    company_info['Website URL'] = soup.select_one('a[href].mb-2.whitespace-nowrap')['href']
    company_info['Year Founded'] = soup.select_one('div.flex.flex-row.justify-between:contains("Founded")').text.strip()
    company_info['Team Size'] = soup.select_one('div.flex.flex-row.justify-between:contains("Team Size")').text.strip()
    
    # Extract social media links
    social_links = soup.select('div.space-x-2 a')
    for link in social_links:
        if 'linkedin.com' in link['href']:
            company_info['LinkedIn Profile'] = link['href']
        elif 'twitter.com' in link['href']:
            company_info['Twitter Handle'] = link['href']
        elif 'facebook.com' in link['href']:
            company_info['Facebook Page'] = link['href']
    
    # Extract founder information
    founders = []
    founder_elements = soup.select('div.flex.flex-row.flex-col.items-start.gap-3.md\:flex-row')

    for founder_element in founder_elements:
        founder = {}
        founder_name_element = founder_element.select_one('div.flex-grow h3.text-lg.font-bold')
        if founder_name_element:
            founder['Name'] = founder_name_element.text.strip()
        
        # Extracting Role
        role_element = founder_element.select_one('div.flex-grow h3.text-lg.font-bold')
        if role_element:
            founder['Role'] = role_element.get_text().split(',')[1].strip() if (role_element and ',' in role_element.get_text()) else None
        
        # Extracting Biography
        biography_element = founder_element.select_one('div.flex-grow p')
        if biography_element:
            founder['Biography'] = biography_element.text.strip()
        
        # Extracting Links
        links = founder_element.select('a')
        for link in links:
            href = link.get('href')
            if href:
                if 'linkedin.com' in href:
                    founder['LinkedIn Profile'] = href
                elif 'twitter.com' in href:
                    founder['Twitter Profile'] = href

        founders.append(founder)

    company_info['Founders'] = founders

    return company_info

async def main():
    # Load progress
    progress = load_progress()
    current_batch = progress["current_batch"]
    links = progress["links"]

    if current_batch is None:
        # Start from the beginning if no progress is saved
        batch_numbers = await scrape_all_batch_numbers()
        current_batch_index = 0
    else:
        # Continue from the last batch if progress is saved
        batch_numbers = await scrape_all_batch_numbers()
        current_batch_index = batch_numbers.index(current_batch) + 1  # Start from the next batch

    total_companies = 0
    for batch_num in batch_numbers[current_batch_index:]:
        print(f"Scraping batch {batch_num}...")
        batch_links = await scrape_company_links(batch_num)
        links.extend(batch_links)
        save_progress(batch_num, links)
        
        total_companies += len(batch_links)
        if total_companies >= 4665:
            print("Total 4665 companies' information has been scraped.")
            break

    # Resume scraping company information
    company_data_progress = load_company_progress()  # Load company data progress
    company_data = []  # List to store scraped company information

    for link in links:
        if link not in company_data_progress:  # Check if company information already scraped
            print(f"Scraping company information from {link}...")
            try:
                company_info = await scrape_company_info(link)
                company_data.append(company_info)
                company_data_progress.append(link)  # Add link to progress
                if len(company_data) >= 4665:
                    print("Total 4665 companies' information has been scraped.")
                    break
            except Exception as e:
                print(f"An error occurred while scraping {link}: {str(e)}")
                print("Saving scraped data so far and retrying...")
                break

    # Save company data to JSON file
    save_company_data(company_data)

    # Save company data progress to JSON file
    save_company_progress(company_data_progress)

# Run the main function
await main()

 

Scraping company information from https://www.ycombinator.com/companies/bitsample...
An error occurred while scraping https://www.ycombinator.com/companies/bitsample: 'NoneType' object is not subscriptable
Saving scraped data so far and retrying...
Company data has been saved to 'C:\Users\user\company_data.json'.
Company data progress has been saved to 'C:\Users\user\company_data_progress.json'.


## Adding company urls to company_data.json file

In [70]:
import json
import re

# Load company data from company_data.json
with open("company_data.json", "r") as json_file:
    company_data = json.load(json_file)

# Load company URLs from company_data_progress.json
with open("progress.json", "r") as urls_file:
    progress_data = json.load(urls_file)
# Extracting URLs from the second dictionary
url_list = []
links_data = progress_data.get('links', [])
if isinstance(links_data, list):
    url_list = links_data

# Create a dictionary to map company names to URLs
company_url_map = {url.split("/")[-1]: url for url in url_list}
# List to store companies for which URLs are not updated
company_not_updated=[]

# Function to remove words in parentheses from a string

def remove_words_in_parentheses(text):
    return re.sub(r'\([^)]*\)', '', text)
# Iterate through the company data
for idx, company in enumerate(company_data):
    # Check if company is not None
    if company is not None:
        # Get the name of the company
        company_name =remove_words_in_parentheses(company.get("Name").lower().replace(" ", "-").replace(".","-").replace("&","and"))
        
        # Print company name and index for debugging
        print(f"Processing company {company_name} at index {idx}")
        
        # Check if the company name exists and is not None
        if company_name:
           
            # Check if the company name exists in the URL map
            if company_name in company_url_map:
                # Add URL to the company's dictionary
                
                company["url"] = company_url_map[company_name]
                print(company["url"])
            else:
                company_not_updated.append(company_name)
# Save updated company data to company_data
with open("company_data.json", "w") as json_file:
        json.dump(company_data, json_file, indent=4)
print("Company data successfully updated.")
print(len(company_not_updated))
#print(f"the company which is not updated with url :{company_not_updated}")



Processing company trueclaim at index 0
https://www.ycombinator.com/companies/trueclaim
Processing company cocrafter at index 1
https://www.ycombinator.com/companies/cocrafter
Processing company reform at index 2
https://www.ycombinator.com/companies/reform
Processing company piramidal at index 3
https://www.ycombinator.com/companies/piramidal
Processing company junction-bioscience at index 4
https://www.ycombinator.com/companies/junction-bioscience
Processing company oddsview at index 5
https://www.ycombinator.com/companies/oddsview
Processing company silogy at index 6
https://www.ycombinator.com/companies/silogy
Processing company senso at index 7
https://www.ycombinator.com/companies/senso
Processing company eris-biotech at index 8
https://www.ycombinator.com/companies/eris-biotech
Processing company powder at index 9
https://www.ycombinator.com/companies/powder
Processing company openfoundry at index 10
https://www.ycombinator.com/companies/openfoundry
Processing company goldenbasi


## As i see there is 179 company data is not updated with url 
## To handle the situation where the company name in the URL might not exactly match the company name in the company_data
## So, implemented a fuzzy matching algorithm or a partial string matching algorithm. One popular library for this task is fuzzywuzzy.

In [69]:
from fuzzywuzzy import fuzz

# Set a threshold for similarity score
SIMILARITY_THRESHOLD = 50  # Adjust as needed

# List to store company names for which URLs are not updated
unmatched_company_names = []


# Update the URLs for each company
for company in company_not_updated:
    company_name = company.lower().replace(" ","-") # Convert company name to lowercase
    # Flag to check if URL is updated
    url_updated = False
    # Find the URL corresponding to the company name
    for key, value in company_url_map.items():
        if fuzz.partial_ratio(key.lower(), company_name) >= SIMILARITY_THRESHOLD:
            # Update the URL for the company
            for company in company_data:
                if company.get("Name", "").lower().replace(" ","-") == company_name:
                    company["url"] = value
                    print(value)
                    url_updated = True
                    break
    if url_updated==True:
        break
    # If URL is not updated, add company name to the list
    else:
        unmatched_company_names.append(company_name)

# Save updated company data to company_data.json
with open("company_data.json", "w") as json_file:
    json.dump(company_data, json_file, indent=4)

print("Company URLs added to company data and saved successfully.")

# Print unmatched company names
print("Company names for which URLs are not updated:", unmatched_company_names)

https://www.ycombinator.com/companies/mathgptpro
https://www.ycombinator.com/companies/cocrafter
https://www.ycombinator.com/companies/eris-biotech
https://www.ycombinator.com/companies/arcane
https://www.ycombinator.com/companies/vista-space
https://www.ycombinator.com/companies/hatchet-2
https://www.ycombinator.com/companies/sparechange
https://www.ycombinator.com/companies/resonance
https://www.ycombinator.com/companies/aether-energy
https://www.ycombinator.com/companies/intercept
https://www.ycombinator.com/companies/creo
https://www.ycombinator.com/companies/celest
https://www.ycombinator.com/companies/meticulate
https://www.ycombinator.com/companies/patchwork
https://www.ycombinator.com/companies/basalt-tech
https://www.ycombinator.com/companies/tile
https://www.ycombinator.com/companies/maihem
https://www.ycombinator.com/companies/happenstance
https://www.ycombinator.com/companies/stitch-technologies
https://www.ycombinator.com/companies/healthtech-1
https://www.ycombinator.com/

update the structure of the JSON file so that the "url" is placed directly above the "Name" in each company's entry

In [12]:
import json

# Load the JSON data
with open('company_data.json', 'r') as file:
    data = json.load(file)

if data is not None:  # Check if data is not None
    # Rename the "url" key to "yc-company url" if it exists and move it to the first position
    for company in data:
        # Check if the company dictionary is not None
        if company is not None:
            if company.get('url') is not None:
                company['yc-company url'] = company.pop('url')

    # Update the structure
    for company in data:
        # Check if the company dictionary is not None
        if company is not None:
            # Get the "yc-company url" value
            url = company.pop('yc-company url', None)  # Handle the case if 'yc-company url' doesn't exist
            # Reconstruct the dictionary with reordered keys
            updated_company = {}
            if url is not None:
                updated_company['yc-company url'] = url
            for key, value in company.items():
                updated_company[key] = value
            # Update the original dictionary with the reordered keys
            company.clear()
            company.update(updated_company)

    # Save the updated JSON data
    with open('updated_company_data.json', 'w') as file:
        json.dump(data, file, indent=4)

    # Print the structure of the first few dictionaries after updating
    for i, company in enumerate(data[:5], start=1):
        print(f"Company {i}:")
        print(json.dumps(company, indent=4))
        print()
else:
    print("No data loaded from the JSON file.")


Company 1:
{
    "yc-company url": "https://www.ycombinator.com/companies/trueclaim",
    "Name": "TrueClaim",
    "Tagline": "Automatically saving self-insured companies 7% on healthcare",
    "Description": "TrueClaim processes all payments between healthcare providers and companies that self-fund their insurance. Unlike incumbents, TrueClaim uses newly available data and AI to save companies 7% of their healthcare costs.",
    "Batch": "W24",
    "Company Type": "Active",
    "Industry Tags": null,
    "Location": "Location:San Francisco",
    "Website URL": "https://www.trytrueclaim.com",
    "Year Founded": "Founded:2023",
    "Team Size": "Team Size:2",
    "LinkedIn Profile": "https://linkedin.com/in/rwbayer",
    "Founders": [
        {
            "Name": "Barbora Howell",
            "Role": null,
            "Biography": "Barbora got her MBA at Stanford. She joined Hinge Health as employee #30 where she built and ran a number of functions, including HR, benefits and customer

In [15]:
import json
from unidecode import unidecode

# Function to fix Unicode errors and remove escape sequences in JSON data
def fix_unicode_and_remove_escapes(data):
    if isinstance(data, str):
        # Transliterate Unicode characters into ASCII
        data = unidecode(data)
        # Remove escape sequences
        data = data.replace('\n', '').replace('\t', '').replace('\\u', '')
        return data
    elif isinstance(data, dict):
        return {fix_unicode_and_remove_escapes(key): fix_unicode_and_remove_escapes(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [fix_unicode_and_remove_escapes(item) for item in data]
    else:
        return data

# Read the original JSON file
with open('updated_company_data.json', 'r', encoding='utf-8') as file:
    try:
        # Load JSON data
        original_data = json.load(file)
    except UnicodeDecodeError:
        # If Unicode error occurs, read the file as bytes and decode it
        file.seek(0)
        original_data = json.loads(file.read().decode('utf-8'))

# Fix Unicode errors and remove escape sequences in original JSON data
fixed_data = fix_unicode_and_remove_escapes(original_data)

# Write the fixed JSON data back to the file
with open('updated_company_data.json', 'w', encoding='utf-8') as file:
    json.dump(fixed_data, file, ensure_ascii=False, indent=4)

# Function to print the first few elements of JSON data
def print_first_few_elements(data, num_elements=5):
    print(json.dumps(data[:num_elements], indent=4))


# Print the first few elements of fixed JSON data
print("\nFixed JSON data (First 5 elements):")
print_first_few_elements(fixed_data)



Fixed JSON data (First 5 elements):
[
    {
        "yc-company url": "https://www.ycombinator.com/companies/trueclaim",
        "Name": "TrueClaim",
        "Tagline": "Automatically saving self-insured companies 7% on healthcare",
        "Description": "TrueClaim processes all payments between healthcare providers and companies that self-fund their insurance. Unlike incumbents, TrueClaim uses newly available data and AI to save companies 7% of their healthcare costs.",
        "Batch": "W24",
        "Company Type": "Active",
        "Industry Tags": null,
        "Location": "Location:San Francisco",
        "Website URL": "https://www.trytrueclaim.com",
        "Year Founded": "Founded:2023",
        "Team Size": "Team Size:2",
        "LinkedIn Profile": "https://linkedin.com/in/rwbayer",
        "Founders": [
            {
                "Name": "Barbora Howell",
                "Role": null,
                "Biography": "Barbora got her MBA at Stanford. She joined Hinge Healt

In [18]:
import json
from unidecode import unidecode

# Function to remove \r and \n from strings
def remove_newlines(data):
    if isinstance(data, str):
        return data.replace('\r', '').replace('\n', '')
    elif isinstance(data, dict):
        return {key: remove_newlines(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [remove_newlines(item) for item in data]
    else:
        return data

# Read the JSON file
with open('updated_company_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Remove \r and \n from the data
updated_data = remove_newlines(data)

# Write the updated data back to the file
with open('updated_company_data.json', 'w', encoding='utf-8') as file:
    json.dump(updated_data, file, ensure_ascii=False, indent=4)

# Print the first 5 elements of the updated data
print("First 5 elements of updated JSON data (after removing \\r and \\n):")
print(json.dumps(updated_data[:5], indent=4))


First 5 elements of updated JSON data (after removing \r and \n):
[
    {
        "yc-company url": "https://www.ycombinator.com/companies/trueclaim",
        "Name": "TrueClaim",
        "Tagline": "Automatically saving self-insured companies 7% on healthcare",
        "Description": "TrueClaim processes all payments between healthcare providers and companies that self-fund their insurance. Unlike incumbents, TrueClaim uses newly available data and AI to save companies 7% of their healthcare costs.",
        "Batch": "W24",
        "Company Type": "Active",
        "Industry Tags": null,
        "Location": "Location:San Francisco",
        "Website URL": "https://www.trytrueclaim.com",
        "Year Founded": "Founded:2023",
        "Team Size": "Team Size:2",
        "LinkedIn Profile": "https://linkedin.com/in/rwbayer",
        "Founders": [
            {
                "Name": "Barbora Howell",
                "Role": null,
                "Biography": "Barbora got her MBA at Sta