# Unicorn Companies Data Analysis

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
unicorn_master = pd.read_csv("Unicorn_Companies.csv")
unicorn_master.head(3)

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,$7.44B,IPO,28,8,5.0
1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,$6.874B,,29,12,
2,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,$2.901B,Asset,39,12,1.0


In [3]:
print(f"Shape of the dataset is {unicorn_master.shape}")
print(f"The dataset has the following datatypes for the corresponding columns\n {unicorn_master.dtypes}")
unicorn_master.describe()

Shape of the dataset is (1037, 13)
The dataset has the following datatypes for the corresponding columns
 Company              object
Valuation ($B)       object
Date Joined          object
Country              object
City                 object
Industry             object
Select Inverstors    object
Founded Year         object
Total Raised         object
Financial Stage      object
Investors Count      object
Deal Terms           object
Portfolio Exits      object
dtype: object


Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
count,1037,1037,1037,1037,1037,1037,1037.0,1037,1037.0,1037.0,1037,1037,1037.0
unique,1035,200,623,46,256,33,1006.0,37,914.0,10.0,54,16,5.0
top,Bolt,$1,7/13/2021,United States,San Francisco,Fintech,,2015,,,10,1,
freq,2,244,9,536,145,205,17.0,144,24.0,988.0,69,280,988.0


In [4]:
# Converting Valuation
unicorn_master["Valuation ($B)"] = unicorn_master["Valuation ($B)"].replace({"\$": ""}, regex=True)
unicorn_master["Valuation ($B)"] = unicorn_master["Valuation ($B)"].astype(float)

In [5]:
# Basic Overview of Data before data cleaning.
# Doing it here because I added a nonetype later

fig = px.treemap(unicorn_master,path= ["Country","Industry", "Company"],
            values="Valuation ($B)", color_discrete_sequence=px.colors.qualitative.Pastel)

fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))

  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)


In [6]:
# Converting Total Raised

# new column to separate billions, mil and thousands.
unicorn_master["Total Raised Unit"] = unicorn_master["Total Raised"].str[-1]
unicorn_master["Total Raised"] = unicorn_master["Total Raised"].replace({"\$": "", "B$": "", "M$": "", 
                "None": np.nan, "K$": ""}, regex=True)
unicorn_master["Total Raised"] = unicorn_master["Total Raised"].astype(float)

# used a loop here (might be a better way to do it)
for raised, row in unicorn_master.iterrows():
    if row["Total Raised Unit"] == "B":
        unicorn_master.loc[raised, "Total Raised"] = row["Total Raised"] * 1000000000
    elif row["Total Raised Unit"] == "M":
        unicorn_master.loc[raised, "Total Raised"] = row["Total Raised"] * 1000000
    elif row["Total Raised Unit"] == "K":
        unicorn_master.loc[raised, "Total Raised"] = row["Total Raised"] * 1000

# remove added column, add total raised column
# divide by 1 bil to match it to valuation column
unicorn_master = unicorn_master.drop("Total Raised Unit", axis=1)
unicorn_master["Total Raised"] = unicorn_master["Total Raised"].values/1000000000
unicorn_master.head()


Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.44,IPO,28,8,5.0
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.874,,29,12,
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.901,Asset,39,12,1.0
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.472,Acquired,56,13,1.0
4,Epic Games,42.0,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.377,Acquired,25,5,2.0


In [7]:
# Convert Dates for "Joining" and "Date Founded"

unicorn_master["Date Joined"] = pd.to_datetime(unicorn_master["Date Joined"])
unicorn_master[unicorn_master["Founded Year"] == "None"] = None
unicorn_master["Founded Year"] = pd.to_datetime(unicorn_master["Founded Year"])

In [8]:
# Converting Number of Investors from Object to float
unicorn_master['Investors Count'] = unicorn_master['Investors Count'].replace({'None': '0'}, regex=True)
unicorn_master['Investors Count'] = unicorn_master['Investors Count'].astype(float)

In [9]:
# Duplicates and NAN values check
print(unicorn_master.isna().values.any())
print(unicorn_master.duplicated().values.any())

# couldn't find duplicates so didn't drop them yet.


True
True


In [10]:
# Financial Stage comparison wrt Valuation,
# Dropped "none" values here, so only companies with valid financial stage are shown.
Financial_St = unicorn_master[unicorn_master["Financial Stage"] != "None"]
Financial_St["Financial Stage"] = Financial_St['Financial Stage'].replace({"Acq": "Acquired"})
Financial_St = Financial_St.dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
fig = px.bar(data_frame=Financial_St, x="Financial Stage", y="Valuation ($B)", color="Country", color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()

In [16]:
fig = px.bar(data_frame=Financial_St, x="Financial Stage", y="Valuation ($B)", color="Country", facet_col="Industry", facet_col_wrap=3,
color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_yaxes(matches=None)
fig.for_each_yaxis(lambda y: y.update(title = ''))
fig.add_annotation(x=-2,y=0.5,
                   text="Valuation (in $B)", textangle=-90,
                    xref="paper", yref="paper")
fig.show()


In [13]:
#biggest companies
top_10_companies = unicorn_master.sort_values("Valuation ($B)", ascending=False)[:10]
top_10_companies

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012-01-01,7.44,IPO,28.0,8,5.0
1,SpaceX,100.3,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002-01-01,6.874,,29.0,12,
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010-01-01,2.901,Asset,39.0,12,1.0
3,Klarna,45.6,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005-01-01,3.472,Acquired,56.0,13,1.0
4,Epic Games,42.0,2018-10-26,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991-01-01,4.377,Acquired,25.0,5,2.0
5,Canva,40.0,2018-01-08,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat...",2012-01-01,0.57126,,26.0,8,
6,Checkout.com,40.0,2019-05-02,United Kingdom,London,Fintech,"Tiger Global Management, Insight Partners, DST...",2012-01-01,1.83,,15.0,4,
7,Instacart,39.0,2014-12-30,United States,San Francisco,"Supply chain, logistics, & delivery","Khosla Ventures, Kleiner Perkins Caufield & By...",2012-01-01,2.686,,29.0,12,
8,Databricks,38.0,2019-02-05,United States,San Francisco,Data management & analytics,"Andreessen Horowitz, New Enterprise Associates...",2013-01-01,3.497,,29.0,8,
9,Revolut,33.0,2018-04-26,United Kingdom,London,Fintech,"index Ventures, DST Global, Ribbit Capital",2015-01-01,1.716,,31.0,6,


In [17]:
px.bar(top_10_companies,x="Company", y=["Total Raised", "Valuation ($B)"],opacity = 0.5,
orientation = "v", barmode="group",color_discrete_sequence=px.colors.qualitative.Bold)

In [75]:
# Top 5 countries
country_unicorns = unicorn_master.groupby("Country")
top5Countries = country_unicorns['Valuation ($B)'].sum().sort_values(ascending=False)[:5]
top5Countries

Country
United States     1793.64
China              542.10
India              175.54
United Kingdom     168.93
Germany             66.70
Name: Valuation ($B), dtype: float64

In [76]:
px.bar(top5Countries)

In [104]:
industry_total = unicorn_master['Industry'].value_counts()
industry_top_5 = industry_total.head()

In [107]:
px.bar(industry_top_5, color_discrete_sequence=px.colors.qualitative.Pastel)

In [111]:
# Total Investors
fig = px.scatter(unicorn_master, x=unicorn_master["Company"][:100], y=unicorn_master["Valuation ($B)"][:100], color=unicorn_master["Investors Count"][:100],
color_continuous_scale=px.colors.cyclical.mrybm)

fig.show()