In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [4]:
df.shape

(4255, 90)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4255 entries, 0 to 4254
Data columns (total 90 columns):
 #   Column                                                                                  Non-Null Count  Dtype         
---  ------                                                                                  --------------  -----         
 0   company_id                                                                              4255 non-null   int64         
 1   event_name                                                                              4237 non-null   object        
 2   country                                                                                 4242 non-null   object        
 3   industry                                                                                4133 non-null   object        
 4   business_stage                                                                          3560 non-null   object        
 5   incorporated        

In [6]:
df.describe()

Unnamed: 0,company_id,incorporated,Age today,Number of team members,revenue_model_commission,revenue_model_product,revenue_model_on-demand,revenue_model_subscription,revenue_model_freemium,revenue_model_advertising,...,Unnamed: 78,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 84,Unnamed: 85,Funding after the event?,FUNDING - ALL FROM 3RD PARTY,total funding,age at the time of final funding (in months)
count,4255.0,3561.0,3690.0,3776.0,3538.0,3538.0,3538.0,3538.0,3538.0,3538.0,...,0.0,0.0,0.0,0.0,0.0,0.0,429.0,429.0,0.0,0.0
mean,2128.0,0.632687,54.835501,2.892744,0.287168,0.420859,0.163369,0.581119,0.186546,0.228095,...,,,,,,,1843025.0,1843025.0,,
std,1228.457027,0.48214,28.760295,1.830685,0.452505,0.493767,0.369754,0.493446,0.389602,0.419663,...,,,,,,,9021500.0,9021500.0,,
min,1.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,0.0,0.0,,
25%,1064.5,0.0,35.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,60000.0,60000.0,,
50%,2128.0,1.0,51.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,220000.0,220000.0,,
75%,3191.5,1.0,72.0,4.0,1.0,1.0,0.0,1.0,0.0,0.0,...,,,,,,,796610.0,796610.0,,
max,4255.0,1.0,608.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,125300000.0,125300000.0,,


In [7]:
## dropping columns with 0 values 
cols = [63,64,65,66,73,74,75,76,77,78,79,80,81,82,84,85,88,89]
df.drop(df.columns[cols],axis=1,inplace=True)

In [8]:
## creating feature "year_incorporated" as only year is relevant
def find_year(row):
    year = str(row).split('-')[0]
    return year
    

In [9]:
df['year_incorporated'] = df.apply(lambda row : find_year(row['incorporation_date']),axis = 1)

In [10]:
df.groupby('year_incorporated').year_incorporated.count()

year_incorporated
1970       2
1990       1
2000       1
2002       2
2003       1
2004       2
2007       1
2008       2
2009       6
2010      17
2011      54
2012     179
2013     339
2014     454
2015     506
2015       2
2016     627
2017     704
2017       1
2018     544
2019     242
2020       3
nan      565
Name: year_incorporated, dtype: int64

In [11]:
#segmenting based on year as before 2010, and then year wise for everything after 
def segmentation_based_on_year(row):
    if row == 'nan':
        pass
    else:
        if int(row) < 2010:
            return 'before 2010'
        else:
            return row

In [12]:
df['year_segment'] = df.apply(lambda row : segmentation_based_on_year(row['year_incorporated']),axis = 1)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4255 entries, 0 to 4254
Data columns (total 74 columns):
 #   Column                                                                                  Non-Null Count  Dtype         
---  ------                                                                                  --------------  -----         
 0   company_id                                                                              4255 non-null   int64         
 1   event_name                                                                              4237 non-null   object        
 2   country                                                                                 4242 non-null   object        
 3   industry                                                                                4133 non-null   object        
 4   business_stage                                                                          3560 non-null   object        
 5   incorporated        

In [14]:
### creating field to check presence of revenue model
def revenue_model_check(col1,col2,col3,col4,col5,col6,col7):
    cols = [col1,col2,col3,col4,col5,col6,col7]
    if any(cols):
        return True
    else:
        return False
    

In [15]:
df['revenue_model'] = df.apply(lambda row : revenue_model_check(row['revenue_model_commission'],row['revenue_model_product'],row['revenue_model_on-demand'],row['revenue_model_subscription'],row['revenue_model_freemium'],row['revenue_model_advertising'],row['revenue_model_licensing']),axis = 1)

In [16]:
df.groupby('revenue_model').revenue_model.count()

revenue_model
False     146
True     4109
Name: revenue_model, dtype: int64

In [17]:
## creating field to check percentage increase in revenue
def percent_increase_in_revenue(month1,month2,month3):
    avg = ((month2-month1)+(month3-month2))/2
    return avg/(month1+1)*100

In [18]:
df['percent_increase_in_revenue'] = df.apply(lambda row : percent_increase_in_revenue(row['revenue_1month'],row['revenue_2month'],row['revenue_3month']),axis = 1)

In [19]:
## creating field to check percentage increase in users
def percent_increase_in_users(month1,month2,month3):
    avg = ((month2-month1)+(month3-month2))/2
    return avg/(month1+1)*100

In [20]:
df['percent_increase_in_users'] = df.apply(lambda row : percent_increase_in_revenue(row['users_1month'],row['users_2month'],row['users_3month']),axis = 1)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4255 entries, 0 to 4254
Data columns (total 77 columns):
 #   Column                                                                                  Non-Null Count  Dtype         
---  ------                                                                                  --------------  -----         
 0   company_id                                                                              4255 non-null   int64         
 1   event_name                                                                              4237 non-null   object        
 2   country                                                                                 4242 non-null   object        
 3   industry                                                                                4133 non-null   object        
 4   business_stage                                                                          3560 non-null   object        
 5   incorporated        