## Crawl data from CBInsights.com

After crawed data from here, We have data for more than 1100 unicorns around the world, as of July 2022.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
url = "https://www.cbinsights.com/research-unicorn-companies"
response = requests.get(url) #Using Requests to Access a Web Content

In [4]:
class HTMLTableParser:
        def parse_url(self, url):
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            return [(table['class'],self.parse_html_table(table))\
                    for table in soup.find_all('table')]  
    
        def parse_html_table(self, table):
            n_columns = 0
            n_rows=0
            column_names = []
    
            # Find number of rows and columns
            # we also find the column titles if we can
            for row in table.find_all('tr'):
                
                # Determine the number of rows in the table
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows+=1
                    if n_columns == 0:
                        # Set the number of columns for our table
                        n_columns = len(td_tags)
                        
                # Handle column names if we find them
                th_tags = row.find_all('th') 
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(th.get_text())
    
            # Safeguard on Column Titles
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")
    
            columns = column_names if len(column_names) > 0 else range(0,n_columns)
            df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    df.iat[row_marker,column_marker] = column.get_text()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
                    
            # Convert to float if possible
            for col in df:
                try:
                    df[col] = df[col].astype(float)
                except ValueError:
                    pass
            
            return df

Run 3 commands to make sure we have all relevant packages installed:

pip install bs4

pip install html5lib

pip install lxml

In [17]:
hp = HTMLTableParser()
table = hp.parse_url(url)[0][1] # Grabbing the table from the tuple
table.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,ByteDance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."


## Preprocessing

In [11]:
table.sort_values(by='Valuation ($B)', ascending=False).head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
57,ServiceTitan,$9.5,11/14/2018,United States,Glendale,Internet software & services,"Bessemer Venture Partners, ICONIQ Capital, Bat..."
56,OutSystems,$9.5,6/5/2018,United States,Boston,Internet software & services,"KKR, ES Ventures, North Bridge Growth Equity"
58,HEYTEA,$9.28,7/1/2019,China,Shenzhen,Other,"Sequoia Capital China, Tencent Investment, BA ..."
59,N26,$9.23,1/10/2019,Germany,Berlin,Fintech,"Redalpine Venture Partners, Earlybird Venture ..."


In [15]:
table.loc[table["Company"].str.contains("Space", na=False)]

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
183,Relativity Space,$4.2,11/23/2020,United States,Inglewood,Other,"Playground Global, Bond, Tribe Capital"
382,ABL Space Systems,$2.4,3/25/2021,United States,El Segundo,Other,"T. Rowe Price, Lockheed Martin Ventures, Fidel..."
697,Astranis Space Technologies,$1.4,4/14/2021,United States,San Francisco,Mobile & telecommunications,"Refactor Capital, Andreessen Horowitz, Fifty Y..."
770,GalaxySpace,$1.22,11/17/2020,China,Beijing,Mobile & telecommunications,"Shunwei Capital Partners, 5Y Capital, Legend C..."
809,LivSpace,$1.2,2/8/2022,India,Bengaluru,E-commerce & direct-to-consumer,"Jungle Ventures, Helion Venture Partners, INGK..."
985,Axiom Space,$1,2/16/2021,United States,Houston,Other,"C5 Capital, Hemisphere Ventures, The Venture C..."


In [16]:
table.shape

(1178, 7)

In [20]:
table = table.replace('\n', '', regex=True)
table = table.replace('\t', '', regex=True)
table.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,ByteDance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."


Export csv file

In [21]:
table.to_csv("CB-Insights_Global-Unicorns_2022.csv")