In [1]:
#import the Necessary Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

In [2]:
#Create separate functions, one for each column to be filled

def company_description(company_name,response):
    """Extract the company description from an already-fetched profile page.

    Args:
        company_name: Company name; used only to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``
            (the caller in this notebook passes a ``requests`` response).

    Returns:
        str: One of
            * the stripped description text,
            * "No company description available" when the description element
              exists but holds no visible text,
            * "No Description found for <company_name>" when the element is absent,
            * "Failed to retrieve information. Status code: <code>" for any
              non-200 response.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # The description is looked up by its exact class string.
    about_section = soup.find('span', {'class': 'description ng-star-inserted'})
    if not about_section:
        return f"No Description found for {company_name}"

    # Strip BEFORE testing for emptiness: a span that contains only whitespace
    # must yield the fallback text, not an empty cell in the output sheet.
    description = about_section.text.strip()
    return description or "No company description available"


def find_sector(company_name,response):
    """Extract the sector field from an already-fetched profile page.

    Args:
        company_name: Company name; used only to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        str: One of
            * the stripped text of the first matching field element,
            * "No sector information available" when that element exists but
              holds no visible text,
            * "No information found for <company_name>" when the element is absent,
            * "Failed to retrieve information. Status code: <code>" for any
              non-200 response.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): this takes the FIRST span carrying the generic enum-field
    # class and treats it as the sector - confirm against the live page markup.
    sector = soup.find('span', {'class': 'component--field-formatter field-type-enum ng-star-inserted'})
    if not sector:
        return f"No information found for {company_name}"

    # Strip before the emptiness test so whitespace-only content falls back,
    # and use a sector-specific fallback (previously a copy-pasted
    # "company description" message ended up in the Sector column).
    sector_text = sector.text.strip()
    return sector_text or "No sector information available"


def find_industry(company_name,response):
    """Collect the industry chips from an already-fetched profile page.

    Args:
        company_name: Company name; used only to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        str: The stripped text of every ``div.chip-text`` element joined with
        ", "; "No Information found for <company_name>" when there are none;
        or a failure message carrying the status code for a non-200 response.
    """
    # Bail out early on anything other than a successful fetch.
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')

    # Every 'chip-text' div is treated as one industry label.
    chips = page.find_all('div', {'class': 'chip-text'})
    if not chips:
        return f"No Information found for {company_name}"

    return ', '.join(chip.text.strip() for chip in chips)


def last_fund(company_name,response):
    """Read the funding field from an already-fetched profile page.

    Args:
        company_name: Company name; used only to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        str: The stripped text of the SECOND matching link;
        "No Funding found for <company_name>" when fewer than two links match;
        or a failure message carrying the status code for a non-200 response.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')

    link_class = 'component--field-formatter field-type-enum accent highlight-color-contrast-light ng-star-inserted'
    links = page.find_all('a', {'class': link_class})

    # NOTE(review): the second match is assumed to be the funding entry;
    # verify against the page layout.
    if len(links) < 2:
        return f"No Funding found for {company_name}"

    return links[1].text.strip()


def find_competitors(company_name, timeout=10):
    """Fetch the similarity-overview page for a company and extract competitor names.

    Competitor information lives on a separate page, so this function issues
    its own HTTP request instead of reusing the profile response.

    Args:
        company_name: URL slug of the company; it is lower-cased and inserted
            into the ``org_similarity_overview`` URL as-is.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        str: On success, the captured competitor names joined with ", "
        (an empty string when no "<profile name> and <word>" phrase is found).
        list[str]: A one-element list holding a message when the request
        fails, returns a non-200 status, or the expected elements are missing.
        NOTE(review): the str/list split is kept for backward compatibility;
        callers must handle both.
    """
    url = f'https://www.crunchbase.com/organization/{company_name.lower()}/org_similarity_overview'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Competitors are a secondary column: report the failure in the same
        # list-wrapped shape as the other error paths rather than crashing.
        return [f"Failed to retrieve information. Error: {exc}"]

    print(response.status_code)
    if response.status_code != 200:
        return [f"Failed to retrieve information. Status code: {response.status_code}"]

    soup = BeautifulSoup(response.text, 'html.parser')
    reason_blocks = soup.find_all('div', {'class': 'reasons-container ng-star-inserted'})
    profile_heading = soup.find('h1', {'class': 'profile-name'})
    if not (reason_blocks and profile_heading):
        return [f"No Competitors found for {company_name}"]

    profile_name = profile_heading.text.strip()
    # The profile name is scraped text, not a regex: escape it so names that
    # contain metacharacters ("C++", "Yahoo (JP)", "A.B") match literally.
    # Compiled once, outside the loop, because it does not vary per block.
    # Only the first word after "and" is captured (\w+).
    pattern = re.compile(re.escape(profile_name) + r'\s+and\s+(\w+)')

    competitors = []
    for block in reason_blocks:
        competitors.extend(pattern.findall(block.text.strip()))
    return ', '.join(competitors)



def products_services(company_name,response):
    """Extract the products/services description from an already-fetched profile page.

    Args:
        company_name: Company name; used only to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        str: One of
            * the stripped text of the overflow-description element,
            * "No products/services description available" when that element
              exists but holds no visible text,
            * "No Products/Services found for <company_name>" when it is absent,
            * "Failed to retrieve information. Status code: <code>" for any
              non-200 response.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # Looked up by its exact class string.
    p_s = soup.find('span', {'class': 'description has-overflow ng-star-inserted'})
    if not p_s:
        return f"No Products/Services found for {company_name}"

    # Strip before the emptiness test so whitespace-only content falls back,
    # and use a products/services-specific fallback (previously a copy-pasted
    # "company description" message ended up in this column).
    details = p_s.text.strip()
    return details or "No products/services description available"


In [3]:
def test(name):
  #Processing the Name of company before we pass it to the url
  temp_name=name.lower().replace(' ', '-')
  url = f'https://www.crunchbase.com/organization/{temp_name}'

  headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }

  response = requests.get(url, headers=headers)
  print(response.status_code)
  #check the Response code from the website
  if response.status_code==200:
    inf1=company_description(name,response)
    inf2=find_sector(name,response)
    inf3=find_industry(name,response)
    inf4=last_fund(name,response)
    inf5=find_competitors(temp_name)
    inf6=products_services(name,response)
    # print(f"{inf1} {inf2} {inf3} {inf4} {inf6}")
    # print(inf5)
    return {
            'Company Name' : name,
            'Company Description': inf1,
            'Sector': inf2,
            'Industry': inf3,
            'Funding': inf4,
            'Products/Services Description': inf6,
            'Competitors': inf5
        }
  else:
      print(f"Failed to retrieve information for {name}")
      return None


In [4]:
#Generating Excel File for List of Companies
def generate_excel_file(company_names, excel_filename='output_file.xlsx', delay_seconds=4):
    """Scrape every company in ``company_names`` and write the results to an Excel file.

    Args:
        company_names: Iterable of company names to look up.
        excel_filename: Path of the workbook to write. A bare file name lands
            in the current working directory; pass a full path to write
            elsewhere. The file does not need to exist beforehand.
        delay_seconds: Pause inserted between consecutive companies so the
            site is not hit with back-to-back requests.

    Returns:
        None. The workbook is written to ``excel_filename`` and a confirmation
        line is printed.
    """
    data = []
    for position, company_name in enumerate(company_names):
        # Throttle only BETWEEN companies; waiting after the last one is wasted time.
        if position:
            time.sleep(delay_seconds)
        company_info = test(company_name)
        # Companies whose lookup failed return None and are left out of the sheet.
        if company_info:
            data.append(company_info)

    # Build the frame once from the list of row dicts, then write it out
    # without the index column.
    df = pd.DataFrame(data)
    df.to_excel(excel_filename, index=False)

    print(f'Excel file "{excel_filename}" generated successfully.')

if __name__ == '__main__':
    # Example usage: look up a few companies and write them to the default workbook.
    company_names = [
        'Amazon',
        'Flipkart',
        'Google',
        'ZS Associates',
    ]
    generate_excel_file(company_names)

200
200
200
200
200
200
200
200
Excel file "output_file.xlsx" generated successfully.
