In [67]:
%pip install pyodbc  
%pip install python-dotenv 





In [68]:
import pyodbc      #just installed with pip
from dotenv import dotenv_values    #import the dotenv_values function from the dotenv package
import pandas as pd
import warnings 

warnings.filterwarnings('ignore')

In [69]:
# Load environment variables from .env file into a dictionary
environment_variables = dotenv_values('.env')


# Get the values for the credentials you set in the '.env' file
server = environment_variables.get("SERVER")
database = environment_variables.get("DATABASE")
username = environment_variables.get("USERNAME")
password = environment_variables.get("PASSWORD")
connection_string = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

In [70]:
# Use the connect method of the pyodbc library and pass in the connection string.
# This will connect to the server and might take a few seconds to be complete. 
# Check your internet connection if it takes more time than necessary

connection = pyodbc.connect(connection_string)

In [71]:
# Reading the 2020 dataset. 

query = "Select * from LP1_startup_funding2020"

data2020 = pd.read_sql(query, connection)

In [72]:
# Reading the 2021 dataset. 

query = "Select * from LP1_startup_funding2021"

data2021 = pd.read_sql(query, connection)

In [73]:
# Reading the 2019 dataset. 

data2019 = pd.read_csv('startup_funding2019.csv')

In [74]:
data2018 = pd.read_csv('startup_funding2018.csv')

In [75]:
data2019.head(5)

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


datasets highlights, shape and composition


In [76]:
data2018.shape

(526, 6)

In [77]:
data2019.shape

(89, 9)

In [78]:
data2020.shape

(1055, 10)

In [79]:
data2021.shape

(1209, 9)

In [80]:
data2018.dtypes

Company Name     object
Industry         object
Round/Series     object
Amount           object
Location         object
About Company    object
dtype: object

the amount column need to be changed, 
we can investigate the company about to see if it has relevant information


In [81]:
data2018.head(5)

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


,,  ,the industry, round/series, amount, location columns need to be synchronized 
we should use columns in the 2018 data only

In [82]:
data2019.dtypes

Company/Brand     object
Founded          float64
HeadQuarter       object
Sector            object
What it does      object
Founders          object
Investor          object
Amount($)         object
Stage             object
dtype: object

 the founded, HeadQuarter, founders, investors column, do we need it?, what it does is like about company in 2018, stage is similar to round/series, 

In [83]:
data2019.head(5)

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


In [84]:
data2020.dtypes

Company_Brand     object
Founded          float64
HeadQuarter       object
Sector            object
What_it_does      object
Founders          object
Investor          object
Amount           float64
Stage             object
column10          object
dtype: object

In [85]:
data2020.head(10)

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,
5,qZense,2019.0,Bangalore,AgriTech,qZense Labs is building the next-generation Io...,"Rubal Chib, Dr Srishti Batra","Venture Catalysts, 9Unicorns Accelerator Fund",600000.0,Seed,
6,MyClassboard,2008.0,Hyderabad,EdTech,MyClassboard is a full-fledged School / Colleg...,Ajay Sakhamuri,ICICI Bank.,600000.0,Pre-series A,
7,Metvy,2018.0,Gurgaon,Networking platform,AI driven networking platform for individuals ...,Shawrya Mehrotra,HostelFund,,Pre-series,
8,Rupeek,2015.0,Bangalore,FinTech,Rupeek is an online lending platform that spec...,"Amar Prabhu, Ashwin Soni, Sumit Maniyar","KB Investment, Bertelsmann India Investments",45000000.0,Series C,
9,Gig India,2017.0,Pune,Crowdsourcing,GigIndia is a marketplace that provides on-dem...,"Aditya Shirole, Sahil Sharma","Shantanu Deshpande, Subramaniam Ramadorai",1000000.0,Pre-series A,


In [86]:
data2021.dtypes

Company_Brand     object
Founded          float64
HeadQuarter       object
Sector            object
What_it_does      object
Founders          object
Investor          object
Amount            object
Stage             object
dtype: object

In [87]:
data2021.head(10)

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed
5,Urban Company,2014.0,New Delhi,Home services,Urban Company (Formerly UrbanClap) is a home a...,"Abhiraj Singh Bhal, Raghav Chandra, Varun Khaitan",Vy Capital,"$188,000,000",
6,Comofi Medtech,2018.0,Bangalore,HealthTech,Comofi Medtech is a healthcare robotics startup.,Gururaj KB,"CIIE.CO, KIIT-TBI","$200,000",
7,Qube Health,2016.0,Mumbai,HealthTech,India's Most Respected Workplace Healthcare Ma...,Gagan Kapur,Inflection Point Ventures,Undisclosed,Pre-series A
8,Vitra.ai,2020.0,Bangalore,Tech Startup,Vitra.ai is an AI-based video translation plat...,Akash Nidhi PS,Inflexor Ventures,Undisclosed,
9,Taikee,2010.0,Mumbai,E-commerce,"Taikee is the ISO-certified, B2B e-commerce pl...","Nidhi Ramachandran, Sachin Chhabra",,"$1,000,000",


Assuming we will use company name, industry, type of funding, amount and location

Null hypothesis: The amount of funding a company receives is not affected by its location, type of industry, and the type of funding
alternative hypothesis; the amount of funding received by a company is significantly affected by the location of the company, type of funding and the industry the company is.

Questions:
1. The industry in which a startup operates is related to the amount of funding it receives
2. The location of a startup is related to the amount of funding it receives.
3. The funding type of the company is related to the amount the company receives in funding
4. The time when the startups were funded affects the amount the startups received.

Cleaning Data

In [88]:
data2019.isnull().sum()

Company/Brand     0
Founded          29
HeadQuarter      19
Sector            5
What it does      0
Founders          3
Investor          0
Amount($)         0
Stage            46
dtype: int64

In [89]:
data2018.isnull().sum()

Company Name     0
Industry         0
Round/Series     0
Amount           0
Location         0
About Company    0
dtype: int64

In [90]:
data2020.isnull().sum()

Company_Brand       0
Founded           213
HeadQuarter        94
Sector             13
What_it_does        0
Founders           12
Investor           38
Amount            254
Stage             464
column10         1053
dtype: int64

In [91]:
data2021.isnull().sum()

Company_Brand      0
Founded            1
HeadQuarter        1
Sector             0
What_it_does       0
Founders           4
Investor          62
Amount             3
Stage            428
dtype: int64

Given thata the 2018 data does not have null values, and the columns we are using for the 2018, 2019, 2020 and 2021 data have just a few missing variables, I will extract them then investigate them further.

In [92]:
data2018.describe()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
count,526,526,526,526,526,526
unique,525,405,21,198,50,524
top,TheCollegeFever,—,Seed,—,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
freq,2,30,280,148,102,2


In [93]:
data2019.describe()

Unnamed: 0,Founded
count,60.0
mean,2014.533333
std,2.937003
min,2004.0
25%,2013.0
50%,2015.0
75%,2016.25
max,2019.0


In [94]:
data2020.describe()

Unnamed: 0,Founded,Amount
count,842.0,801.0
mean,2015.36342,113043000.0
std,4.097909,2476635000.0
min,1973.0,12700.0
25%,2014.0,1000000.0
50%,2016.0,3000000.0
75%,2018.0,11000000.0
max,2020.0,70000000000.0


In [95]:
data2021.describe()

Unnamed: 0,Founded
count,1208.0
mean,2016.655629
std,4.517364
min,1963.0
25%,2015.0
50%,2018.0
75%,2020.0
max,2021.0


a number of variables in the datasets have wrong data types and their summary statistics do not show up. They need to be changed

In [96]:
data2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


the amount variable need to be changed to an integer or float

In [97]:
data2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [98]:
data2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1055 non-null   object 
 1   Founded        842 non-null    float64
 2   HeadQuarter    961 non-null    object 
 3   Sector         1042 non-null   object 
 4   What_it_does   1055 non-null   object 
 5   Founders       1043 non-null   object 
 6   Investor       1017 non-null   object 
 7   Amount         801 non-null    float64
 8   Stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.6+ KB


In [99]:
data2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What_it_does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount         1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


renaming the columns before merging

In [100]:
#renaming 2018
data2018.rename(columns = {"Round/Series":"Stage"}, inplace = True)

In [101]:
#renaming 2019
data2019.rename(columns = {"Company/Brand":"Company Name","Sector":"Industry","What it does":"About Company","Amount($)":"Amount","HeadQuarter":"Location"}, inplace = True)
data2019.head(5)

Unnamed: 0,Company Name,Founded,Location,Industry,About Company,Founders,Investor,Amount,Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


In [102]:
#renaming 2020 data
data2020.rename(columns = {"Company_Brand":"Company Name","Sector":"Industry","What_it_does":"About Company","HeadQuarter":"Location"}, inplace = True)
data2020.sample(5)

Unnamed: 0,Company Name,Founded,Location,Industry,About Company,Founders,Investor,Amount,Stage,column10
87,Habitat,2016.0,,EdTech,"Habitat, a social learning edtech platform for...","Rohit Pande, Shikhar Sachan","Unitus Ventures, Whiteboard Capital",600000.0,Seed,
438,BuildPan,2019.0,Indore,SaaS startup,"Buildpan helps you with continuous build, deve...","Sonal Dandotia, Shantanu S, Virendra Chouhan, ...","Sunil Kumar Singhvi, Yusho Kawata",500000.0,Seed,
919,Paytm money,2010.0,Bangalore,Fintech,Wealth Management,Vijay Shekhar Sharma,One97 Communications,5600000.0,,
271,LIVSPACE,2014.0,Bangalore,Interior Design,Livspace is an interior design startup that of...,"Anuj Srivastava, Ramakant Sharma, Shagufta Anurag","Mercer Investments, Bessemer Venture Partners",4000000.0,Debt,
181,Charcoal Eats,2015.0,Mumbai,QSR startup,"In the last five years, have served over 1.5 m...","Anurag Mehrotra, Krishnakant Thakur, Mohammed ...",Lokmat Investments,2000000.0,,


In [103]:
#renaming 2021 columns
data2021.rename(columns = {"Company_Brand":"Company Name","Sector":"Industry","What_it_does":"About Company","HeadQuarter":"Location"}, inplace = True)
data2021.head(5)

Unnamed: 0,Company Name,Founded,Location,Industry,About Company,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed


dropping irrelevant columns

In [104]:
nwdata2019 = data2019[['Company Name','Founded','Industry','About Company','Amount','Stage','Location']]
nwdata2019.tail(5)

Unnamed: 0,Company Name,Founded,Industry,About Company,Amount,Stage,Location
84,Infra.Market,,Infratech,It connects client requirements to their suppl...,"$20,000,000",Series A,Mumbai
85,Oyo,2013.0,Hospitality,Provides rooms for comfortable stay,"$693,000,000",,Gurugram
86,GoMechanic,2016.0,Automobile & Technology,Find automobile repair and maintenance service...,"$5,000,000",Series B,Delhi
87,Spinny,2015.0,Automobile,Online car retailer,"$50,000,000",,Delhi
88,Ess Kay Fincorp,,Banking,Organised Non-Banking Finance Company,"$33,000,000",,Rajasthan


In [105]:
nwdata2020 = data2020[['Company Name','Founded','Industry','About Company','Amount','Stage','Location']]
nwdata2020.tail(5)

Unnamed: 0,Company Name,Founded,Industry,About Company,Amount,Stage,Location
1050,Leverage Edu,,Edtech,AI enabled marketplace that provides career gu...,1500000.0,,Delhi
1051,EpiFi,,Fintech,It offers customers with a single interface fo...,13200000.0,Seed Round,
1052,Purplle,2012.0,Cosmetics,Online makeup and beauty products retailer,8000000.0,,Mumbai
1053,Shuttl,2015.0,Transport,App based bus aggregator serice,8043000.0,Series C,Delhi
1054,Pando,2017.0,Logitech,Networked logistics management software,9000000.0,Series A,Chennai


In [106]:
nwdata2021 = data2021[['Company Name','Founded','Industry','About Company','Amount','Stage','Location']]
nwdata2021.tail(5)

Unnamed: 0,Company Name,Founded,Industry,About Company,Amount,Stage,Location
1204,Gigforce,2019.0,Staffing & Recruiting,A gig/on-demand staffing company.,$3000000,Pre-series A,Gurugram
1205,Vahdam,2015.0,Food & Beverages,VAHDAM is among the world’s first vertically i...,$20000000,Series D,New Delhi
1206,Leap Finance,2019.0,Financial Services,International education loans for high potenti...,$55000000,Series C,Bangalore
1207,CollegeDekho,2015.0,EdTech,"Collegedekho.com is Student’s Partner, Friend ...",$26000000,Series B,Gurugram
1208,WeRize,2019.0,Financial Services,India’s first socially distributed full stack ...,$8000000,Series A,Bangalore


In [107]:
#adding a year column to each dataset
data2018["Year"]= 2018

In [108]:
#adding a year column to each dataset
nwdata2019["Year"]= 2019

In [109]:
#adding a year column to each dataset
nwdata2020["Year"]= 2020

In [110]:
#adding a year column to each dataset
nwdata2021["Year"]= 2021

Merging the data

In [111]:
combineddata = pd.concat([data2018, nwdata2019, nwdata2020,nwdata2021], ignore_index=True)
combineddata.sample(10)

Unnamed: 0,Company Name,Industry,Stage,Amount,Location,About Company,Year,Founded
2310,Convosight,SaaS startup,Series A,$9000000,New Delhi,A mission - to help community builders become ...,2021,2019.0
2209,Trell,Social commerce,Series B,$45000000,Bangalore,India's Largest Lifestyle Social Commerce plat...,2021,2016.0
526,Bombay Shaving,Ecommerce,,"$6,300,000",,Provides a range of male grooming products,2019,
2082,DeHaat,AgriTech,Series C,"$30,000,000",Patna,DeHaat connects farmers to suppliers and buyer...,2021,2012.0
2354,Green Cure,HealthCare,Pre-series A,$Undisclosed,New Delhi,World's first company to combine German Engine...,2021,2015.0
1606,Tea Trunk,Beverage,,220000.0,Goa,Global premium tea brand,2020,2013.0
2240,PeakPerformer,EdTech,Early seed,$undisclosed,Bangalore,"A goal-oriented, outcome-driven platform to ma...",2021,2020.0
379,Signzy,"Computer, FinTech, Risk Management, Software",Series A,"₹240,000,000","Bangalore, Karnataka, India",Signzy are creating 'building blocks for a Dig...,2018,
2664,Fraazo,Food & Beverages,Series B,$50000000,Mumbai,FRAAZO is Mumbai's favourite App for Fresh Veg...,2021,2020.0
908,Fanball XI,Gaming,Seed,1000000.0,New Delhi,The gaming company mostly focused on fantasy f...,2020,


Cleaning the combined data

In [112]:
combineddata.nunique()

Company Name     2214
Industry          873
Stage              75
Amount            774
Location          172
About Company    2691
Year                4
Founded            34
dtype: int64

since we have 873 unique industries, I am going to re classify the industries to 5 major categories

In [124]:
# Standardize the "Stage" column to lowercase
combineddata['Stage'] = combineddata['Stage'].str.lower()

In [143]:
# Combine similar Seed categories
combineddata['Stage'].replace({'seed round': 'seed', 'seed funding': 'seed', 'seed+': 'seed', 'pre-seed round':'seed', 'pre seed round':'seed', 'Seed':'seed', 'pre-seed':'seed','early seed':'seed','seed a':'seed','seed round & series a':'seed','seed fund':'seed'}, inplace=True)

In [139]:
# Combine similar Series A categories
combineddata['Stage'].replace({'pre series a': 'series a', 'pre-series a': 'series a', 'pre-series a1': 'series a', 'seies a':'series a', 'series a+':'series a', 'post series a':'series a', 'pre- series a':'series a', 'pre series a1':'series a', 'series a-1':'series a','series a2':'series a'}, inplace=True)

In [141]:
# Combine similar Series B categories
combineddata['Stage'].replace({'pre-series b': 'series b', 'pre series b': 'series b', 'series b2': 'series b', 'series b3':'series b', 'series b+':'series b'}, inplace=True)

In [None]:
# Combine similar all the other various types of funding to Others category
combineddata['Stage'].replace({'debt': 'other', 'angel': 'other', 'venture - series unknown': 'other', 'debt financing':'other', 'private equity':'other','corporate round':'other','bridge':'other','angel round':'other','grant':'other':'post-ipo equity':'other','secondary market':'other','pre-series':'other','$1200000':'other','undisclosed':'other','post-ipo debt':'other','$300000':'other','$6000000':'other','$1000000':'other',''}, inplace=True)

In [144]:
Stage_Vcounts = combineddata.Stage.value_counts().reset_index()
Stage_Vcounts.columns = ['stage','count']
Stage_Vcounts

Unnamed: 0,stage,count
0,seed,768
1,series a,601
2,series b,156
3,series c,114
4,series d,50
5,debt,45
6,angel,37
7,venture - series unknown,37
8,series e,30
9,debt financing,13
