# Unicorn dataset EDA

# dataset: https://www.kaggle.com/datasets/shivavashishtha/the-complete-list-of-unicorn-companies

original website: https://www.cbinsights.com/research-unicorn-companies

This is a simple database about Unicorn companies. We will check some basic distribution, and use mainly plotly since it provides basic interactivity with the charts! 

In [29]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(rc = {'figure.figsize':(15,5)})

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go #for a few interactive plot
import plotly.express as px

In [28]:
pip install plotly

Collecting plotlyNote: you may need to restart the kernel to use updated packages.
  Downloading plotly-5.9.0-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.9.0 tenacity-8.0.1



In [10]:
df=pd.read_html("https://www.cbinsights.com/research-unicorn-companies")

In [11]:
df = df[0]

In [12]:
df.tail()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
1160,Glean,$1,5/18/2022,United States,Palo Alto,Internet software & services,"General Catalyst, Kleiner Perkins Caufield & B..."
1161,CareBridge,$1,6/8/2022,United States,Nashville,Health,"Bain Capital Ventures, Splunk Ventures, Cisco ..."
1162,Immuta,$1,6/8/2022,United States,Boston,Data management & analytics,"DFJ Growth Fund, Dell Technologies Capital, Te..."
1163,JupiterOne,$1,6/2/2022,United States,Morrisville,Cybersecurity,"Bain Capital Ventures, Splunk Ventures, Cisco ..."
1164,LeadSquared,$1,6/21/2022,India,Bengaluru,Internet software & services,"Gaja Capital Partners, Stakeboat Capital, West..."


In [13]:
df[['Investor1','Investor2','Investor3','Investor4']] = df['Select Investors'].str.split(',', expand=True)

In [14]:
df.tail()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4
1160,Glean,$1,5/18/2022,United States,Palo Alto,Internet software & services,"General Catalyst, Kleiner Perkins Caufield & B...",General Catalyst,Kleiner Perkins Caufield & Byers,Lightspeed Venture Partners,
1161,CareBridge,$1,6/8/2022,United States,Nashville,Health,"Bain Capital Ventures, Splunk Ventures, Cisco ...",Bain Capital Ventures,Splunk Ventures,Cisco Investments,
1162,Immuta,$1,6/8/2022,United States,Boston,Data management & analytics,"DFJ Growth Fund, Dell Technologies Capital, Te...",DFJ Growth Fund,Dell Technologies Capital,Ten Eleven Ventures,
1163,JupiterOne,$1,6/2/2022,United States,Morrisville,Cybersecurity,"Bain Capital Ventures, Splunk Ventures, Cisco ...",Bain Capital Ventures,Splunk Ventures,Cisco Investments,
1164,LeadSquared,$1,6/21/2022,India,Bengaluru,Internet software & services,"Gaja Capital Partners, Stakeboat Capital, West...",Gaja Capital Partners,Stakeboat Capital,WestBridge Capital,


In [15]:
df.describe()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4
count,1165,1165,1165,1165,1148,1165,1164,1164,1114,1024,9
unique,1162,221,682,48,276,17,1148,593,642,616,9
top,Branch,$1,7/13/2021,United States,San Francisco,Fintech,Sequoia Capital,Andreessen Horowitz,Tiger Global Management,Tiger Global Management,Softbank Group
freq,2,271,9,626,164,245,3,32,22,20,1


In [16]:
df.rename(columns = {'Valuation ($B)':'Value', 'Date Joined':'Date'}, inplace = True) #for easier usage we rename 2 columns

In [17]:
df.head()

Unnamed: 0,Company,Value,Date,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",Sequoia Capital China,SIG Asia Investments,Sina Weibo,Softbank Group
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",Founders Fund,Draper Fisher Jurvetson,Rothenberg Ventures,
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China...",Tiger Global Management,Sequoia Capital China,Shunwei Capital Partners,
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",Khosla Ventures,LowercaseCapital,capitalG,
4,Klarna,$45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",Institutional Venture Partners,Sequoia Capital,General Atlantic,


In [19]:
df[['Value1', 'Value2']] = df['Value'].str.split('$', expand=True)


In [20]:
df.head()

Unnamed: 0,Company,Value,Date,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4,Value1,Value2
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",Sequoia Capital China,SIG Asia Investments,Sina Weibo,Softbank Group,,140.0
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",Founders Fund,Draper Fisher Jurvetson,Rothenberg Ventures,,,127.0
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China...",Tiger Global Management,Sequoia Capital China,Shunwei Capital Partners,,,100.0
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",Khosla Ventures,LowercaseCapital,capitalG,,,95.0
4,Klarna,$45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",Institutional Venture Partners,Sequoia Capital,General Atlantic,,,45.6


In [21]:
df.rename(columns = {'Value2':'USD'}, inplace = True)
df.drop(['Value', 'Value1'], axis=1, inplace=True)

In [22]:
df.head()
df["USD"] = df.USD.astype(float)

In [23]:
df.head()

Unnamed: 0,Company,Date,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4,USD
0,Bytedance,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",Sequoia Capital China,SIG Asia Investments,Sina Weibo,Softbank Group,140.0
1,SpaceX,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",Founders Fund,Draper Fisher Jurvetson,Rothenberg Ventures,,127.0
2,SHEIN,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China...",Tiger Global Management,Sequoia Capital China,Shunwei Capital Partners,,100.0
3,Stripe,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",Khosla Ventures,LowercaseCapital,capitalG,,95.0
4,Klarna,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",Institutional Venture Partners,Sequoia Capital,General Atlantic,,45.6


In [24]:
df.dtypes

Company              object
Date                 object
Country              object
City                 object
Industry             object
Select Investors     object
Investor1            object
Investor2            object
Investor3            object
Investor4            object
USD                 float64
dtype: object

In [25]:
from datetime import datetime
df['Date'] = list([pd.to_datetime(x, format='%m/%d/%Y') for x in df['Date'].to_list()])
df.dtypes

Company                     object
Date                datetime64[ns]
Country                     object
City                        object
Industry                    object
Select Investors            object
Investor1                   object
Investor2                   object
Investor3                   object
Investor4                   object
USD                        float64
dtype: object

In [26]:
df.head()

Unnamed: 0,Company,Date,Country,City,Industry,Select Investors,Investor1,Investor2,Investor3,Investor4,USD
0,Bytedance,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",Sequoia Capital China,SIG Asia Investments,Sina Weibo,Softbank Group,140.0
1,SpaceX,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",Founders Fund,Draper Fisher Jurvetson,Rothenberg Ventures,,127.0
2,SHEIN,2018-07-03,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China...",Tiger Global Management,Sequoia Capital China,Shunwei Capital Partners,,100.0
3,Stripe,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",Khosla Ventures,LowercaseCapital,capitalG,,95.0
4,Klarna,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",Institutional Venture Partners,Sequoia Capital,General Atlantic,,45.6


In [30]:
#Let's see a histogram for the USD column
fig = px.histogram(df, x="USD",title = 'USD (B) Value of Unicorn Companies')
fig.show()

In [31]:
fig.update_xaxes(range=[1, 16]) #here we fix x-axis values, since histogram flattened out already
fig.show()

In [32]:
fig = px.histogram(df, x="USD",title = 'USD (B) Value of Unicorn Companies', cumulative = True )
fig.show()
print ("The USD Value is less than 20 Billion $ for 1140 company out of 1152! ")

The USD Value is less than 20 Billion $ for 1140 company out of 1152! 


In [33]:
fig.update_xaxes(range=[1, 16])
fig.show()

In [34]:
pboxlabel = str('Company')
fig = px.box(df, y="USD", hover_name=pboxlabel, log_y = True) #simple box-whisker plot, but Y-axis is displayed on logarithmic scale!!!
fig.show()

In [35]:

fig = px.histogram(df, x="Date",title='Join Date of Unicorn Companies to the "1B Club" by Industry', color = 'Industry',animation_frame='Industry') # we could convert country into continent / region
fig.update_layout(bargap=0.1)
fig.show()

In [36]:
df2 = df.groupby('City')['USD'].sum()
df2 = df2.sort_values(ascending = False).head(10)
fig = px.bar(df2, orientation='h', color = df2,text=df2,  title = 'Top 10 Cities for Unicorn Company Value (Billion $)')

fig.show()


In [37]:
df2 = df.groupby('Country')['USD'].sum()
df2 = df2.sort_values(ascending = False).head(10)
fig = px.bar(df2, orientation='h', color = df2,text=df2, title = 'Top 10 Countries for Unicorn Company Value (Billion $)')

fig.show()


In [38]:
fig = px.bar(df, x="USD", y="Country",
height=1600,log_x=True, color = 'USD',text='Company',hover_name='Company',
             title='Unicorns by Country')
fig.show()

In [39]:
dfBigs = df.head(50) # replace the number to see top N!
fig = px.bar(dfBigs, x="USD", y="Country",
height=600, color = 'USD',text='Company',hover_name='Company',
             title='TOP N Unicorns by Country')
fig.show()

In [40]:
dfBigs = df.head(50) # replace the number to see top N!
fig = px.bar(dfBigs, x="USD", y="City",
height=600, color = 'USD',text='Company',hover_name='Company',
             title='TOP N Unicorns by City')
fig.show()

In [41]:
dfBigs = df.head(50) # replace the number to see top N!
fig = px.bar(dfBigs, x="USD", y="Industry",
height=600, color = 'USD',text='Company',hover_name='Company',
             title='TOP N Unicorns by Industry')
fig.show()

In [42]:
dfBigs = df.head(100) # replace the number to see top N!
fig = px.density_heatmap(dfBigs, x="Industry", y="Country", z="USD", histfunc="sum",
                         height = 800, marginal_x="histogram", marginal_y="histogram", 
                         title ='Contry vs Industry, TOP N Unicorn Companies!')
fig.show()

In [43]:
df.Industry.unique()

array(['Artificial intelligence', 'Other',
       'E-commerce & direct-to-consumer', 'Fintech',
       'Internet software & services',
       'Supply chain, logistics, & delivery',
       'Data management & analytics', 'Edtech', 'Hardware',
       'Consumer & retail', 'Health', 'Auto & transportation',
       'Cybersecurity', 'Mobile & telecommunications', 'Travel',
       'Artificial Intelligence', 'Internet Software Services'],
      dtype=object)

In [44]:
fig = px.treemap(dfBigs, path=[px.Constant("all"), 'Country'], 
                 values='USD', color ='USD', color_continuous_scale='algae')
fig.show()

In [45]:
fig = px.treemap(dfBigs, path=[px.Constant("all"), 'Industry'], 
                 values='USD', color ='USD', labels = 'Company', color_continuous_scale='algae')
fig.show()

In [46]:
import plotly.io as pio

import pandas as pd



aggs = ["count","sum","max"]

agg = []
agg_func = []
for i in range(0, len(aggs)):
    agg = dict(
        args=['transforms[0].aggregations[0].func', aggs[i]],
        label=aggs[i],
        method='restyle'
    )
    agg_func.append(agg)

data = [dict(
  type = 'choropleth',
  locationmode = 'country names',
  locations = df['Country'],
  z = df['USD'],
  autocolorscale = False,
  colorscale = 'earth',
  reversescale = False,
  transforms = [dict(
    type = 'aggregate',
    groups = df['Country'],
    aggregations = [dict(
        target = 'z', func = 'sum', enabled = True)
    ]
  )]
)]

layout = dict(
  title = '<b>Unicorn Companies</b><br>use dropdown to change aggregation',
  xaxis = dict(title = 'Subject'),
  yaxis = dict(title = 'Score'),
  height = 600,
  width = 900,
  updatemenus = [dict(
        x = 0.85,
        y = 1.15,
        xref = 'paper',
        yref = 'paper',
        yanchor = 'top',
        active = 1,
        showactive = False,
        buttons = agg_func
  )]
)

fig_dict = dict(data=data, layout=layout)

pio.show(fig_dict, validate=False)

In [47]:
col= ['Investor1','Investor2','Investor3','Investor4']

single = pd.concat([df[col] for col in col])


In [49]:
pip install PIL

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement PIL (from versions: none)
ERROR: No matching distribution found for PIL


In [48]:
#df2= df.loc[:, ['Investor1','Investor2','Investor3','Investor4']]
s = single.values.tolist()
#print(investor_list)

text = ' '.join([str(elem) for elem in s])
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator




# Display the generated image:

wordcloud = WordCloud(max_font_size=50, max_words=50, 
                      background_color="white", 
                      stopwords = ['None',"'"],
                     min_word_length=5).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Investors')

plt.show()

ModuleNotFoundError: No module named 'wordcloud'