In [None]:
!pip install selenium
from selenium.webdriver.common.by import By
from datetime import datetime
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import re
import numpy as np
import pandas as pd
from selenium.webdriver.edge.options import Options
import time 
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time 
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [None]:
#Handles all driver related operations
class WebDriverManager:
    """Own the Selenium Edge driver shared by the scraper classes.

    The driver is created eagerly in ``__init__`` and exposed as
    ``self.driver``; call ``close_driver`` once scraping is finished.
    """

    def __init__(self):
        self.driver = self._init_driver()

    def _init_driver(self):
        """Create a headless Edge driver with a 5-second implicit wait.

        Returns:
            The configured ``webdriver.Edge`` instance.
        """
        options = Options()
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--disable-dev-shm-usage")
        # Chromium expects "width,height" for this switch; the previous
        # "1920x1080" spelling is not a valid size and was not applied.
        options.add_argument("--window-size=1920,1080")
        # NOTE(review): kept from the original configuration. The scraper
        # itself calls execute_script, so confirm this switch is really wanted.
        options.add_argument("--disable-javascript")
        driver = webdriver.Edge(options=options)
        driver.implicitly_wait(5)
        driver.delete_all_cookies()
        driver.maximize_window()
        return driver

    def close_driver(self):
        """End the browser session and shut the driver down."""
        # quit() closes every window and ends the session on its own. Calling
        # close() first is redundant and may invalidate the session before
        # quit() runs when it closes the last window.
        self.driver.quit()


In [None]:
#scraping the links to the company pages from the main listing page.
class CompanyLinkScraper:
    """Collect company-page URLs from the YourStory company listing page.

    Links accumulate in ``self.company_links``, a set, so a URL that appears
    in several cells is stored only once.
    """

    def __init__(self,driver):
        self.driver = driver
        self.company_url = "https://yourstory.com/companies/search?page=1"
        self.company_links = set()

    def get_company_links(self):
        """Load the listing page and add each cell's link to ``company_links``.

        If the company table does not appear within 10 seconds, a message is
        printed and ``company_links`` is left unchanged.
        """
        self.driver.get(self.company_url)
        self.driver.execute_script("window.scrollBy(0,500);")

        try:
            company_table = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='sc-68e2f78-2 eYOrlk'] table"))
            )
        except TimeoutException:
            print("Company table not found on the page.")
            return

        company_rows = company_table.find_elements(By.CSS_SELECTOR, 'tr')

        # The first row is skipped, as in the original (presumably the header).
        for row in company_rows[1:]:
            for cell in row.find_elements(By.CSS_SELECTOR, "td"):
                # find_elements returns an empty list for a cell without an
                # anchor, whereas find_element raises NoSuchElementException
                # and would abort the whole scrape on the first plain cell.
                anchors = cell.find_elements(By.TAG_NAME, 'a')
                if not anchors:
                    continue
                link = anchors[0].get_attribute('href')
                if link:  # skip anchors that carry no href
                    self.company_links.add(link)

In [None]:
#Extracting data from each company page
class CompanyDataScraper:
    """Extract profile fields and team members from individual company pages."""

    def __init__(self, driver):
        self.driver = driver

    def get_text(self,parent, xpath):
        """Return the text of the first element matching ``xpath`` under ``parent``.

        Args:
            parent: A driver or element exposing ``find_element``. Any other
                value (``None``, or the legacy ``"null"`` placeholder) is
                treated as "container missing".
            xpath: XPath expression evaluated relative to ``parent``.

        Returns:
            The element's text, or the string ``"null"`` when the container
            or the element is not available.
        """
        # Without this guard a missing container raised AttributeError,
        # which the except clause below does not cover.
        if not hasattr(parent, "find_element"):
            return "null"
        try:
            return parent.find_element(By.XPATH, xpath).text
        except (NoSuchElementException, TimeoutException):
            return "null"

    def extract_company_data(self,company_links):
        """Visit each company page and collect its profile fields.

        Args:
            company_links: Iterable of company page URLs.

        Returns:
            A list with one dict per URL. Fields that cannot be found hold
            the string ``"null"``.
        """
        companies_data=[]
        for company in company_links:
            self.driver.get(company)
            self.driver.implicitly_wait(5)
            self.driver.execute_script("window.scrollBy(0,500);")

            try:
                parent_element = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'sc-d93b8f6d-0'))
                )
            except TimeoutException:
                # Details container never appeared: get_text turns a None
                # parent into "null" for every field that depends on it.
                parent_element = None

            company_data = {
                # The name is searched from the page root rather than the
                # details container, as in the original.
                "Company Name": self.get_text(self.driver, ".//div[contains(@class, 'sc-68e2f78-2') and .//span[text()='Legal Name']]//span[@class='sc-711bcf56-13 jlAcqN']"),
                "Headquarters": self.get_text(parent_element, ".//div[contains(@class, 'sc-68e2f78-2') and .//span[text()='Headquarters']]//following-sibling::div//span[@class='sc-711bcf56-13 jlAcqN']"),
                "Business Model": self.get_text(parent_element, ".//div[contains(@class, 'sc-68e2f78-2') and .//span[text()='Business Model']]//following-sibling::div//span[@class='sc-711bcf56-13 dppYfQ']"),
                "Founding Date": self.get_text(parent_element, ".//div[contains(@class, 'sc-68e2f78-2') and .//span[text()='Founding Date']]//following-sibling::div//span[@class='sc-711bcf56-13 jlAcqN']"),
                "Number of Employees": self.get_text(parent_element, ".//div[contains(@class, 'sc-68e2f78-2') and .//span[text()='No. of Employees']]//following-sibling::div//span[@class='sc-711bcf56-13 jlAcqN']"),
                "Core Team / Founders": self.extract_team_info(),
            }
            companies_data.append(company_data)

        return companies_data

    def extract_team_info(self):
        """Return the current page's team as ``"name:linkedin_url, ..."``.

        Name and LinkedIn URL are looked up independently; whichever is
        missing is reported as ``"null"``. Returns an empty string when the
        page has no team member elements.
        """
        team_data=[]
        team_members = self.driver.find_elements(By.XPATH, "//div[contains(@class, 'sc-5bfa486-0')]")
        for member in team_members:
            name = self.get_text(member, ".//div[contains(@class, 'sc-5bfa486-2')]//span[contains(@class, 'sc-711bcf56-13')]")
            # Resolved separately from the name: previously a missing name
            # left linkedin_url unbound, or stale from the previous member.
            try:
                linkedin_url = member.find_element(By.XPATH, ".//a[contains(@href, 'linkedin.com/in/')]").get_attribute('href') or "null"
            except (NoSuchElementException, TimeoutException):
                linkedin_url = "null"
            team_data.append(f'{name}:{linkedin_url}')
        return ', '.join(team_data)

In [None]:
#Manages the storage of the scraped data and saves it to a csv file
class DataStore:
    """Accumulate scraped company records in a DataFrame and write them to CSV."""

    def __init__(self):
        self.companies_df = pd.DataFrame(columns=["Company Name", "Headquarters", "Business Model", "Founding Date", "Number of Employees", "Core Team / Founders"])

    def store_data(self, company_data):
        """Append a batch of records, drop exact duplicates, blank out "".

        Args:
            company_data: Iterable of dicts, one per company, keyed by
                column name.
        """
        records = list(company_data)
        if records:
            # One concat for the whole batch. Concatenating row by row copies
            # the accumulated frame every time, which is quadratic overall.
            self.companies_df = pd.concat(
                [self.companies_df, pd.DataFrame(records)], ignore_index=True
            )
        self.companies_df.drop_duplicates(keep='first', inplace=True)
        # NaN is used instead of None because replace("", None) is
        # version-dependent: before pandas 1.4 the explicit None was ignored
        # and the call padded from the previous row instead.
        self.companies_df.replace("", np.nan, inplace=True)

    def save_to_csv(self, filename):
        """Write the stored records to ``filename``; missing values become "null"."""
        self.companies_df.to_csv(filename, index=False, na_rep='null')
        print(f"Data saved to {filename}.")

In [None]:
#Orchestrates the entire scraping process 
class CompanyScraperController:
    """Wire the driver, link scraper, data scraper and store into one run."""

    def __init__(self):
        self.driver_manager = WebDriverManager()
        self.link_scraper = CompanyLinkScraper(self.driver_manager.driver)
        self.data_scraper = CompanyDataScraper(self.driver_manager.driver)
        self.data_store = DataStore()

    def start_scraping(self):
        """Collect links, scrape each company, save the CSV, close the driver.

        Results are written to ``company_scraper.csv``. Any exception from a
        scraping step propagates to the caller after the driver is closed.
        """
        try:
            self.link_scraper.get_company_links()
            company_data = self.data_scraper.extract_company_data(self.link_scraper.company_links)
            self.data_store.store_data(company_data)
            self.data_store.save_to_csv("company_scraper.csv")
        finally:
            # Always release the browser; previously a failure in any step
            # above skipped close_driver and left the driver running.
            self.driver_manager.close_driver()
        
# Entry point: constructing the controller creates the WebDriverManager (and
# with it the Edge driver); start_scraping then runs the scrape and writes
# company_scraper.csv. `scraper` stays bound at module level afterwards.
if __name__ == "__main__":
    scraper = CompanyScraperController()
    scraper.start_scraping()