> #### *This dataset contains funding information for Indian startups from January 2015 to 2020. It includes columns for the funding date, the city the startup is based in, the names of the investors, and the amount invested (in USD).*

Interactive WebApp Link: [https://share.streamlit.io/satyampd/startups-in-india-eda/app.py](https://share.streamlit.io/satyampd/startups-in-india-eda/app.py)
> You can change the numbers in the WebApp above, e.g., you can see the top 3 or the top 15 within the same chart using the slider.


The WebApp was created with Streamlit, an open-source app framework. With Streamlit we can build a web interface within a few hours, and we can deploy the app within minutes using https://share.streamlit.io/

Code for app can be found here: https://github.com/Satyampd/Startups-in-India-EDA/blob/master/app.py
> Note: The code is almost the same as in this notebook; it just has 5-10 extra lines of code for Streamlit.

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt

### **Reading the dataset and setting the column names with data.columns**

In [None]:
# Load the raw funding CSV, then replace its header with the column names
# used throughout the rest of this notebook.
column_names = [
    'SNo', 'Date', 'StartupName', 'IndustryVertical', 'SubVertical',
    'CityLocation', 'InvestorsName', 'InvestmentType', 'AmountInUSD',
    'Remarks',
]
data = pd.read_csv('../input/indian-startup-funding/startup_funding.csv')
data.columns = column_names

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

In [None]:
# Preview the first two rows of the dataset
data.head(2)

## **Let's remove inconsistencies**


In [None]:
# Normalise startup names in one pass: stringify each value, remove every
# double-backslash sequence, then remove every double-quote character.
data['StartupName'] = data['StartupName'].map(
    lambda raw_name: str(raw_name).replace("\\\\", "").replace("\"", "")
)

In [None]:
def _relabel(column, variants, canonical):
    """Set ``data[column]`` to ``canonical`` wherever it equals one of ``variants``.

    The write is a single ``.loc`` assignment on a boolean mask. Element-wise
    chained indexing (``data[column][i] = value``) is avoided on purpose: it
    raises SettingWithCopyWarning and has no effect at all when pandas
    Copy-on-Write is enabled.

    Parameters
    ----------
    column : str
        Name of the column of the notebook-level ``data`` frame to clean.
    variants : list of str
        Exact spellings to replace (missing values never match).
    canonical : str
        The spelling to write in their place.
    """
    data.loc[data[column].isin(variants), column] = canonical


# Mis-encoded non-breaking-space residue that prefixes some raw values.
nbsp_residue = "\\\\xc2\\\\xa0"

# --- IndustryVertical -------------------------------------------------------
_relabel("IndustryVertical",
         ["ECommerce", "ecommerce", "Ecommerce", "E-Commerce", "E-commerce"],
         "eCommerce")
_relabel("IndustryVertical", ["Fin-Tech"], "FinTech")
for vertical in ["News Aggregator mobile app",
                 "Online Jewellery Store",
                 "Fashion Info Aggregator App",
                 "Online Study Notes Marketplace",
                 "Warranty Programs Service Administration",
                 "Pre-School Chain",
                 "Premium Loyalty Rewards Point Management",
                 "Contact Center Software Platform",
                 "Casual Dining restaurant Chain",
                 "Online Grocery Delivery"]:
    # Strip the residue prefix, keeping the rest of the label unchanged.
    _relabel("IndustryVertical", [nbsp_residue + vertical], vertical)
_relabel("IndustryVertical",
         ["Online home d\\\\xc3\\\\xa9cor marketplace"],
         "Online home decor marketplace")

# --- StartupName ------------------------------------------------------------
_relabel("StartupName", ["Ola", "Ola Cabs", "Olacabs"], "Ola")
_relabel("StartupName", ["Flipkart.com"], "Flipkart")
_relabel("StartupName", ["Paytm Marketplace"], "Paytm")
_relabel("StartupName",
         ["OYO", "OYO Rooms", "OyoRooms", "Oyorooms", "Oyo", "Oyo Rooms"],
         "OYO Rooms")
_relabel("StartupName", ["Byjuxe2x80x99s", "BYJU'S"], "Byju's")

# --- InvestorsName ----------------------------------------------------------
# Every spelling of "undisclosed" ends up under one label.
_relabel("InvestorsName",
         ['Undisclosed investors',
          'Undisclosed Investors',
          'Undisclosed',
          'Undisclosed investor',
          'Undisclosed Investor',
          'undisclosed investors'],
         "Undisclosed Investors")
for raw_investor, clean_investor in [
        ("Tiger Global", "Tiger Global"),
        ("IndianIdeas.com", "IndianIdeas"),
        ("IvyCap Ventures, Accel Partners, Dragoneer Investment Group",
         "IvyCap Ventures, Accel Partners, Dragoneer Investment Group"),
        ("Goldman Sachs", "Goldman Sachs")]:
    _relabel("InvestorsName", [nbsp_residue + raw_investor], clean_investor)

# --- CityLocation -----------------------------------------------------------
# Step 1: strip the encoding residue so every city has one plain spelling.
_relabel("CityLocation", ['\\\\xc2\\\\xa0Noida', '\\xc2\\xa0Noida'], "Noida")
_relabel("CityLocation",
         ['\\\\xc2\\\\xa0Bangalore', '\\xc2\\xa0Bangalore', 'Bangalore'],
         "Bengaluru")
_relabel("CityLocation", ['\\\\xc2\\\\xa0New Delhi', '\\xc2\\xa0New Delhi'], "New Delhi")
_relabel("CityLocation", ['\\\\xc2\\\\xa0Gurgaon', 'Gurugram'], "Gurgaon")
_relabel("CityLocation", ['\\\\xc2\\\\xa0Mumbai', '\\xc2\\xa0Mumbai'], "Mumbai")
# Step 2: fold the Delhi-area cities into "NCR". This must run after step 1,
# otherwise the rows that carried the residue prefix would stay outside NCR.
_relabel("CityLocation",
         ["New Delhi", "Delhi", "Noida", "Gurugram", "Gurgaon"],
         "NCR")

# The residue-stripping rules above are adapted from jagannathrk's notebook.

> ### One has to be mindful that in Python (and NumPy), NaN values don't compare equal to each other, but None values do.

In [None]:
# Percentage of present (non-missing) values in every column.
# `data.isna()` contains no missing entries itself, so its per-column count is
# simply the number of rows and serves as the denominator.
p = data.count() / data.isna().count() * 100
display(p)

# Complementary view: percentage of missing values in every column.
m = 100 - p
display(m)

In [None]:
# The Remarks column is missing in 86.235217% of the rows (see the percentages
# above), so it carries too little information to keep.
data.drop(columns=["Remarks"], inplace=True)

## **Now explore about IndustryVertical (Industry they belong)**

In [None]:
# Keep a handle on the IndustryVertical column and count its distinct values.
IV = data["IndustryVertical"]
IV.nunique()

#### There are 815 unique industry verticals in total, but here we will focus on the top 10

In [None]:
# Bar chart of the ten most frequent industry verticals.
top = data["IndustryVertical"].value_counts().head(10)
label10 = np.arange(0, 10)  # one colour-scale value per bar

fig = go.Figure(
    data=[
        go.Bar(
            x=top.index,
            y=top.values,
            # Colour the bars by position and outline each one in black.
            marker=dict(color=label10, line=dict(width=3, color="black")),
        )
    ]
)
fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)')
fig

## Let's explore the amount of funding the startups received

In [None]:
# Show the first funding amount, then the same value multiplied by 2.
# If the value is a string, `* 2` repeats the text instead of doubling a number.
first_amount = data.loc[0, "AmountInUSD"]
display(first_amount)
display(first_amount * 2)

In [None]:
# As we can see, the values in this column behave as strings.

In [None]:
import re

# Remove ten rows by index label (2602-2611) and renumber the index from 0.
# NOTE(review): the notebook does not record why these rows are excluded.
# Because the index is renumbered afterwards, re-running this cell would drop
# a different set of rows; run it once per fresh load of the data.
data.drop(list(range(2602, 2612)), inplace=True)
data.reset_index(drop=True, inplace=True)

# Keep only the digits of every amount and convert the column to numbers.
# - The column is assigned as a whole; writing cell by cell through chained
#   indexing (data[col][i] = ...) is unreliable and is a no-op under pandas
#   Copy-on-Write.
# - r'\D' is a raw string so the regex escape is not read as a (deprecated)
#   Python string escape.
# - Entries with no digits become "" and are parsed as NaN by pd.to_numeric.
# - CAUTION: \D also removes decimal points and signs, so "12.5" would become
#   125. TODO confirm the raw amounts never contain decimals.
data["AmountInUSD"] = pd.to_numeric(
    data["AmountInUSD"].map(lambda raw_amount: re.sub(r'\D', "", str(raw_amount)))
)

# Strip the leftover non-breaking-space residue from startup names.
data["StartupName"] = data["StartupName"].map(
    lambda raw_name: re.sub('xc2xa0', "", str(raw_name))
)

In [None]:
# Total funding per startup, sorted from largest to smallest; keep the top ten.
temp = (
    data.groupby("StartupName")[["AmountInUSD"]]
    .sum()
    .sort_values(by="AmountInUSD", ascending=False)
)
top = temp.head(10)
print(top)

The Rapido amount is wrong: it is recorded in INR, not USD.
> Link to article: [Bike taxi app Rapido raises Rs 391 Cr](https://yourstory.com/2019/08/startup-funding-bike-taxi-app-rapido-series-b-westbridge-capital)

In [None]:
# Convert the Rapido amount from INR to USD, then rebuild the top-10 table.
# NOTE: this cell is not idempotent; every run divides the Rapido rows again,
# so run it once per fresh load of the data.
INR_PER_USD = 71.19  # exchange rate used for the correction

# Make sure the column can hold the fractional result of the division.
data["AmountInUSD"] = data["AmountInUSD"].astype("float64")

# A single .loc assignment on the mask; chained indexing
# (data[col][mask] = ...) may write to a temporary copy instead of `data`.
is_rapido = data["StartupName"] == "Rapido Bike Taxi"
data.loc[is_rapido, "AmountInUSD"] = data.loc[is_rapido, "AmountInUSD"] / INR_PER_USD

temp = data[["StartupName", "AmountInUSD"]].groupby("StartupName").sum().sort_values(by="AmountInUSD", ascending=False)
top = temp.head(10)
print(top)

In [None]:
# Bar chart of total funding for the ten best-funded startups.
label10 = np.arange(0, 10)  # one colour-scale value per bar

fig = go.Figure(
    data=[
        go.Bar(
            x=top.index,
            y=top.AmountInUSD,
            # Colour the bars by position and outline each one in black.
            marker=dict(color=label10, line=dict(width=3, color="black")),
        )
    ]
)
fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)')
fig.show()

## **Let's see the pie chart for the top 7 types of funding**

In [None]:
# Replace the escaped newline sequence inside investment-type labels with a space.
# str(x) also turns a missing value into the text "nan", which is then counted
# like any other label.
data['InvestmentType'] = data['InvestmentType'].apply(lambda x: str(x).replace("\\\\n", " "))

# Merge the different spellings of seed/angel funding into one label.
# One .loc assignment on a mask replaces the row-by-row chained indexing
# (data[col][i] = ...), which is a no-op under pandas Copy-on-Write.
seed_angel_variants = [
    "Seed/ Angel Funding",
    "Seed / Angel Funding",
    "Seed/Angel Funding",
    "Seed / Angle Funding",
    "Angel / Seed Funding",
]
data.loc[data["InvestmentType"].isin(seed_angel_variants), "InvestmentType"] = "Seed Angel Funding"

# The seven most common investment types, used by the pie chart below.
typ = data["InvestmentType"].value_counts().head(7)


In [None]:
# Pie chart of the most common investment types.
colrs = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

investment_pie = go.Pie(
    labels=typ.index,
    values=typ.values,
    hoverinfo='label+percent',
    textinfo='label+percent',
    textfont_size=10,
    # Custom slice colours, each slice outlined in black.
    marker=dict(colors=colrs, line=dict(width=3, color="black")),
)
fig = go.Figure(data=[investment_pie])
fig.update_layout(height=600, autosize=True, plot_bgcolor='rgb(275, 275, 275)')
fig.show()

## **Top 10 Investors**

In [None]:
# Count funding rounds per investor label and keep the eleven most frequent
# (one more than needed, because the first row is removed in the next cell).
i = (
    data['InvestorsName']
    .value_counts()
    .head(11)
    .reset_index()
)
i.columns = ["InvestorsName", "Number"]
i.head(2)

In [None]:
# Remove the first row (index label 0), which holds the "undisclosed" investors
# entry, so that only named investors remain.
i.drop(index=0, inplace=True)

In [None]:
# Bubble chart of the top investors: bubble size grows with the number of deals.
investor_bubbles = go.Scatter(
    x=i.InvestorsName,
    y=i.Number,
    mode='markers',
    marker=dict(
        color=label10,
        size=(i.Number) * 3,
        line=dict(width=3, color="black"),  # black outline around every bubble
    ),
)
fig = go.Figure(data=[investor_bubbles])
fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)')
fig.show()

## **Top 10 Cities with Highest number of Startups**

In [None]:
# Ten cities with the most funded startups, drawn as a bubble chart.
cities = data["CityLocation"].value_counts().head(10).reset_index()
cities.columns = ["City", "Number"]

city_bubbles = go.Scatter(
    x=cities.City,
    y=cities.Number,
    mode='markers',
    marker=dict(
        color=label10,
        size=(cities.Number) / 6,  # scaled down so the largest bubble stays readable
        line=dict(width=3, color="black"),
    ),
)
fig = go.Figure(data=[city_bubbles])
fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)')
fig.show()

## **Let's see the most common words in SubVertical using a WordCloud**

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Build the word-cloud input: every string-valued SubVertical entry, joined
# with single spaces. Non-string entries (missing values) are skipped.
# The generator keeps its loop variable local, so the notebook-level name `i`
# (the investors table used by the "Top 10 Investors" cells) is not overwritten.
text = " ".join(
    sub_vertical
    for sub_vertical in data["SubVertical"]
    if isinstance(sub_vertical, str)
)

In [None]:
# Render a word cloud of the 200 most frequent SubVertical words.
# collocations=False makes the cloud count single words only, not word pairs.
wordcloud = WordCloud(
    max_words=200,
    background_color="white",
    collocations=False,
    width=1600,
    height=800,
).generate(text)

# Draw the cloud on a large canvas without axes or padding.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### **End of Notebook**