**Import libraries**

In [1]:
import numpy as np
import plotly.express as px
import pandas as pd


**Read Data**

In [2]:
# Load the raw unicorn-companies dataset from the notebook's working directory
df = pd.read_csv("Unicorn_Companies.csv")

In [3]:
# Preview the first five rows of the freshly loaded data
df.head(5)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


In [4]:
# Summarise column dtypes and non-null counts to spot missing values and columns that need casting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country           1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB


**Data preprocessing**

In [5]:
# Replace missing values in the text columns 'City' and 'Select Investors' with "Unknown".
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form acts on a
# temporary Series, emits a FutureWarning in recent pandas versions and does
# not modify `df` at all when Copy-on-Write is enabled.
for column_with_gaps in ('City', 'Select Investors'):
    df[column_with_gaps] = df[column_with_gaps].fillna("Unknown")

In [6]:
# Define a function that expands abbreviated dollar amounts ("$8B", "$150M") into whole dollars
def convert_to_billion(x):
    """Expand an abbreviated dollar amount into a whole-dollar digit string.

    Despite its name (kept so existing calls keep working), the function does
    not express values in billions: it expands the 'B' (billion) or
    'M' (million) suffix so that "$8B" becomes "8000000000".

    The amount is scaled numerically rather than by appending zeros, so
    decimal inputs are handled correctly ("$1.5B" -> "1500000000" instead of
    the malformed "1.5000000000").

    Parameters
    ----------
    x : str
        Amount such as "$180B", "$312M" or a non-numeric marker like "Unknown".

    Returns
    -------
    str
        Whole-dollar amount as a digit string when `x` carries a numeric value
        with a 'B' or 'M' suffix; otherwise the input with '$' and surrounding
        whitespace removed. Non-string inputs (e.g. NaN, None) are returned
        unchanged so the caller can decide how to treat missing values.
    """
    # Missing values are not strings; pass them through instead of raising
    if not isinstance(x, str):
        return x

    # Remove the dollar sign and any stray surrounding whitespace
    amount = x.replace('$', '').strip()

    # Map the trailing unit letter to its multiplier; anything else is left as-is
    scale = {'B': 10 ** 9, 'M': 10 ** 6}.get(amount[-1:])
    if scale is None:
        return amount

    try:
        # round() guards against float artefacts such as 1.1 * 10**9
        return str(round(float(amount[:-1]) * scale))
    except (ValueError, OverflowError):
        # The part before the unit letter is not a usable number; leave it untouched
        return amount

# Run the conversion over both monetary columns, element by element
for money_column in ('Valuation', 'Funding'):
    df[money_column] = df[money_column].apply(convert_to_billion)


In [7]:
# Cast 'Valuation' to 64-bit integers
df['Valuation'] = df['Valuation'].astype('int64')

# 'Funding' may contain values that are not numbers: coerce those to NaN,
# substitute the placeholder -1, then cast to 64-bit integers
funding_numeric = pd.to_numeric(df['Funding'], errors='coerce')
df['Funding'] = funding_numeric.fillna(-1).astype('int64')

# Parse 'Date Joined' into datetime values
df["Date Joined"] = pd.to_datetime(df["Date Joined"])

In [8]:
# Collect the names of all object-dtype columns, then recast each one as a category
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    df[column_name] = df[column_name].astype('category')

In [9]:
# Store the calendar year of 'Date Joined' in a new 'Joined_by_year' column
joined_dates = df['Date Joined']
df['Joined_by_year'] = joined_dates.dt.year

In [10]:
# Years between founding and reaching unicorn status (joined year minus founding year)
df['Time to Unicorn'] = df['Joined_by_year'].sub(df['Year Founded'])

In [11]:
# Calculate ROI as valuation divided by funding.
# 'Funding' is not always a usable denominator: unparseable amounts were
# replaced by the placeholder -1 during type conversion, and a funding of 0
# would divide to infinity. Both cases are masked to NaN first so they yield a
# missing ROI instead of a negative or infinite one (an infinite ROI would
# otherwise rank first in any "largest ROI" selection).
valid_funding = df['Funding'].where(df['Funding'] > 0)
df['ROI'] = df['Valuation'] / valid_funding

In [12]:
# Break each comma-separated investor list apart so every investor gets its own row
investors_df = df["Select Investors"].str.split(", ").explode()

# Count the rows per investor, name the count column, and order from most to fewest unicorns
investor_counts = (
    investors_df.groupby(investors_df)
    .size()
    .reset_index(name="Number of Unicorns")
    .sort_values(by="Number of Unicorns", ascending=False)
)

# Show the investors with the highest counts
investor_counts.head()

Unnamed: 0,Select Investors,Number of Unicorns
26,Accel,60
1110,Tiger Global Management,53
80,Andreessen Horowitz,53
976,Sequoia Capital China,48
975,Sequoia Capital,47


In [13]:
# Inspect the first five rows after cleaning and feature engineering
df.head(5)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors,Joined_by_year,Time to Unicorn,ROI
0,Bytedance,180000000000,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,8000000000,"Sequoia Capital China, SIG Asia Investments, S...",2017,5,22.5
1,SpaceX,100000000000,2012-12-01,Other,Hawthorne,United States,North America,2002,7000000000,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2012,10,14.285714
2,SHEIN,100000000000,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,2000000000,"Tiger Global Management, Sequoia Capital China...",2018,10,50.0
3,Stripe,95000000000,2014-01-23,Fintech,San Francisco,United States,North America,2010,2000000000,"Khosla Ventures, LowercaseCapital, capitalG",2014,4,47.5
4,Klarna,46000000000,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,4000000000,"Institutional Venture Partners, Sequoia Capita...",2011,6,11.5


**Countries with the Most Unicorns:**


In [44]:
# Count unicorns per country; value_counts() returns the counts in descending order
unicorn_count_by_country = df['Country'].value_counts().reset_index()
unicorn_count_by_country.columns = ['Country', 'Number of Unicorns']

# Keep the 5 countries with the most unicorns, then sort ascending so the
# largest bar ends up at the top of the horizontal bar chart
top5_countries = unicorn_count_by_country.head(5).sort_values(by='Number of Unicorns', ascending=True)

# Horizontal bar chart with the count printed on each bar
fig = px.bar(
    top5_countries,
    y='Country',
    x='Number of Unicorns',
    text='Number of Unicorns',
    color_discrete_sequence=['#000000'],
    title='Top 5 Countries with the Most Unicorns',
)

fig.update_layout(
    # The counts are already printed on the bars, so the x-axis line, grid and
    # tick labels are hidden
    xaxis=dict(
        showline=False,
        showgrid=False,
        zeroline=False,
        showticklabels=False,
    ),
    # The y-axis tick labels are the country names: they stay visible (drawn
    # horizontally), otherwise the bars cannot be told apart. Only the axis
    # line and grid are hidden.
    yaxis=dict(
        tickangle=0,
        showline=False,
        showgrid=False,
        zeroline=False,
    ),
    # Transparent background
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Draw the count labels inside the bars
fig.update_traces(textposition='inside')

fig.show()


**Which investors have funded the most unicorns?**

In [53]:
# Take the 5 investors with the most unicorns, sorted ascending so the largest
# bar is drawn at the top of the horizontal chart.
# The slice is stored under its own name: reassigning `investor_counts` here
# would discard every other investor, and any later use of `investor_counts`
# (such as exporting it) would silently see only these 5 rows.
top5_investors = investor_counts.sort_values(by='Number of Unicorns', ascending=True).tail(5)

# Horizontal bar chart with the count printed on each bar
fig = px.bar(
    top5_investors,
    y='Select Investors',
    x='Number of Unicorns',
    text='Number of Unicorns',
    color_discrete_sequence=['#000000'],
    labels={'Number of Unicorns': 'Number of Unicorns'},
    title='Top 5 Investors by Number of Unicorns',
)

fig.update_layout(
    # Keep the investor names horizontal
    yaxis=dict(tickangle=0),
    # Transparent background
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Draw the count labels inside the bars
fig.update_traces(textposition='inside')

fig.show()


**Unicorn Companies with the Biggest Return on Investment (ROI):**


In [54]:
# Pick the 5 rows with the largest ROI, then order them ascending for the horizontal chart
top5_roi = df.nlargest(5, 'ROI').sort_values('ROI')

# Horizontal bar chart; each bar is labelled with its ROI rounded to two decimals
fig = px.bar(
    top5_roi,
    x='ROI',
    y='Company',
    text=top5_roi['ROI'].round(2),
    color_discrete_sequence=['#000000'],
    labels={'ROI': 'Return on Investment (ROI)'},
    title='Top 5 Unicorn Companies with the Biggest ROI',
)

# Horizontal y-axis tick labels and a transparent background, set in one call
fig.update_layout(
    yaxis_tickangle=0,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Place the value labels inside the bars
fig.update_traces(textposition='inside')

fig.show()


**In which year did most companies become unicorns?**

In [63]:
# Number of companies per year of 'Date Joined'
join_year = df["Date Joined"].dt.year
joined_by_year = df.groupby(join_year)["Company"].count().reset_index()

# One marker per year, labelled with that year's company count
fig = px.scatter(
    joined_by_year,
    x='Date Joined',
    y='Company',
    text='Company',
    color_discrete_sequence=['black'],
    labels={'Date Joined': 'Year', 'Company': 'Number of Companies Joined'},
    title='Number of Companies Joined by Year',
)

# Large black-outlined circles with white count labels
marker_style = dict(symbol='circle', size=25, line=dict(width=2, color='black'))
label_font = dict(color='white', size=14)
fig.update_traces(marker=marker_style, textfont=label_font)

# Build a black line through the same points and add it as a second trace
year_line = px.line(joined_by_year, x='Date Joined', y='Company')
year_line.update_traces(line=dict(color='black', width=2))
fig.add_trace(year_line.data[0])

# Transparent background
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.show()


**------------------------------------------------**


In [18]:
# Write the investor_counts table to CSV without the DataFrame index
investor_counts.to_csv('investor_counts.csv', index=False)

In [19]:
# Write the cleaned and enriched dataset to CSV without the DataFrame index
df.to_csv('data_after_cleaing.csv', index=False)