# IMPORTING ALL REQUIRED LIBRARY

In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sea

# Accessing Dataset


In [115]:
df=pd.read_csv('startup_fund.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


# Count Of Null

In [116]:
df.isnull().sum()

Sr No                   0
Date dd/mm/yyyy         0
Startup Name            0
Industry Vertical     171
SubVertical           936
City  Location        180
Investors Name         24
InvestmentnType         4
Amount in USD         960
Remarks              2625
dtype: int64

# Data Cleaning

In [117]:
df.head()

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


## Removing Strip Space And Convert All Objects in Lowercase

In [118]:
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df = df.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df.head()


Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000,
1,2,13/01/2020,shuttl,transportation,app based shuttle service,gurgaon,susquehanna growth equity,series c,8048394,
2,3,09/01/2020,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000,
4,5,02/01/2020,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000,


## Rename a Column


In [119]:
df.rename(columns={'InvestmentnType':'InvestmentType','Date dd/mm/yyyy':'Date'},inplace=True)
df.head()

Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD,Remarks
0,1,09/01/2020,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000,
1,2,13/01/2020,shuttl,transportation,app based shuttle service,gurgaon,susquehanna growth equity,series c,8048394,
2,3,09/01/2020,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000,
4,5,02/01/2020,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000,


# Handling Missing Value

## Remove 4 rows 

Because there is only for null space so i remove them


In [120]:
df.dropna(subset=['InvestmentType'],inplace=True)
df.head()


Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD,Remarks
0,1,09/01/2020,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000,
1,2,13/01/2020,shuttl,transportation,app based shuttle service,gurgaon,susquehanna growth equity,series c,8048394,
2,3,09/01/2020,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000,
4,5,02/01/2020,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000,


## Remove Strip Space in Columns

In [121]:
df.columns=df.columns.str.strip()

## Handling MIssing Values


In [122]:
df.fillna({
    'City  Location':'Unknown',
    'Investors Name':'Undisclosed',
    'Industry Vertical': 'Unrecognized',
    'SubVertical':'other'
},inplace=True)



In [123]:
df.isnull().sum()

Sr No                   0
Date                    0
Startup Name            0
Industry Vertical       0
SubVertical             0
City  Location          0
Investors Name          0
InvestmentType          0
Amount in USD         959
Remarks              2621
dtype: int64

### 🔢 Converting 'AmountInUSD' to Numeric

In [124]:
df['Amount in USD']=pd.to_numeric(df['Amount in USD'].replace('[^0-9]','',regex=True),errors='coerce')
df.head()

Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD,Remarks
0,1,09/01/2020,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000.0,
1,2,13/01/2020,shuttl,transportation,app based shuttle service,gurgaon,susquehanna growth equity,series c,8048394.0,
2,3,09/01/2020,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860.0,
3,4,02/01/2020,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000.0,
4,5,02/01/2020,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000.0,


## Handling missing amount of Amount in USD 

In [125]:
median_amount=df['Amount in USD'].median()
df.fillna({'Amount in USD':median_amount},inplace=True)
df.head()

Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD,Remarks
0,1,09/01/2020,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000.0,
1,2,13/01/2020,shuttl,transportation,app based shuttle service,gurgaon,susquehanna growth equity,series c,8048394.0,
2,3,09/01/2020,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860.0,
3,4,02/01/2020,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000.0,
4,5,02/01/2020,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000.0,


In [126]:
df.isnull().sum()

Sr No                   0
Date                    0
Startup Name            0
Industry Vertical       0
SubVertical             0
City  Location          0
Investors Name          0
InvestmentType          0
Amount in USD           0
Remarks              2621
dtype: int64

## Drop Remarks Column
because it is not useful for numeric EDA



In [127]:
df=df.drop('Remarks',axis=1)

In [128]:
df.columns = df.columns.str.strip()           # Remove leading/trailing spaces
df.columns = df.columns.str.replace('\s+', ' ', regex=True)  # Replace multiple spaces with a single space

# Standardize Texts

In [129]:
df['City Location'].unique()

array(['bengaluru', 'gurgaon', 'new delhi', 'mumbai', 'chennai', 'pune',
       'noida', 'faridabad', 'san francisco', 'san jose,', 'amritsar',
       'delhi', 'kormangala', 'hyderabad', 'burnsville', 'menlo park',
       'gurugram', 'palo alto', 'santa monica', 'singapore', 'taramani',
       'andheri', 'chembur', 'haryana', 'new york', 'karnataka',
       'mumbai/bengaluru', 'bhopal', 'bengaluru and gurugram',
       'india/singapore', 'jaipur', 'india/us', 'nagpur', 'indore',
       'new york, bengaluru', 'california', 'india', 'ahemadabad',
       'rourkela', 'srinagar', 'bhubneswar', 'chandigarh',
       'delhi & cambridge', 'kolkatta', 'kolkata', 'coimbatore',
       'bangalore', 'udaipur', 'Unknown', 'ahemdabad', 'bhubaneswar',
       'ahmedabad', 'surat', 'goa', 'uttar pradesh', 'nw delhi', 'gaya',
       'vadodara', 'trivandrum', 'missourie', 'panaji', 'gwalior',
       'karur', 'udupi', 'kochi', 'agra', 'bangalore/ bangkok', 'hubli',
       'kerala', 'kozhikode', 'us', 'silig

In [130]:
df['City Location'].value_counts()


City Location
bangalore             700
mumbai                566
new delhi             421
gurgaon               286
Unknown               180
                     ... 
ahemdabad               1
kolkatta                1
delhi & cambridge       1
bhubneswar              1
dallas / hyderabad      1
Name: count, Length: 111, dtype: int64

In [146]:
df['City Location'] = df['City Location'].str.replace(r'\s+', ' ', regex=True)
df['City Location'] = df['City Location'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

In [147]:
# 1. Normalize: lowercase and strip spaces
df['City Location'] = df['City Location'].astype(str).str.lower().str.strip()

# 2. Optional: See unique values before replacement
print(df['City Location'].unique())

# 3. Replace all common variations
df['City Location'] = df['City Location'].replace({
    'bangaluru': 'bangalore',
    'mumbai ': 'mumbai',      # remove space
    'mumbai': 'mumbai',
    'delhi': 'new delhi',
    'new delhi': 'new delhi',
    'nw delhi': 'new delhi',
    'gurgaon': 'gurugram',
    'kolkatta': 'kolkata',
    'ncr': 'new delhi',
    'noida': 'new delhi',
    'hyderabad ': 'hyderabad'  # remove space
})


['bengaluru' 'gurugram' 'new delhi' 'mumbai' 'chennai' 'pune' 'faridabad'
 'san francisco' 'san jose,' 'amritsar' 'kormangala' 'hyderabad'
 'burnsville' 'menlo park' 'palo alto' 'santa monica' 'singapore'
 'taramani' 'andheri' 'chembur' 'haryana' 'new york' 'karnataka'
 'mumbai/bengaluru' 'bhopal' 'bengaluru and gurugram' 'india/singapore'
 'jaipur' 'india/us' 'nagpur' 'indore' 'new york, bengaluru' 'california'
 'india' 'ahemadabad' 'rourkela' 'srinagar' 'bhubneswar' 'chandigarh'
 'delhi & cambridge' 'kolkata' 'coimbatore' 'udaipur' 'unknown'
 'ahemdabad' 'bhubaneswar' 'ahmedabad' 'surat' 'goa' 'uttar pradesh'
 'gaya' 'vadodara' 'trivandrum' 'missourie' 'panaji' 'gwalior' 'karur'
 'udupi' 'kochi' 'agra' 'bangalore/ bangkok' 'hubli' 'kerala' 'kozhikode'
 'us' 'siliguri' 'usa' 'lucknow' 'kanpur' 'sfo / bangalore' 'london'
 'seattle / bangalore' 'pune/seattle' 'pune / dubai' 'bangalore / sfo'
 'varanasi' 'new delhi / us' 'mumbai / uk' 'jodhpur' 'hyderabad/usa'
 'boston' 'bangalore / palo

In [148]:
print(df['City Location'].value_counts())


City Location
bengaluru             841
mumbai                566
new delhi             548
gurugram              336
unknown               180
                     ... 
usa                     1
lucknow                 1
menlo park              1
sfo / bangalore         1
dallas / hyderabad      1
Name: count, Length: 105, dtype: int64


In [149]:
print(df[df['City Location'].str.contains('bengaluru', na=False)]['City Location'].unique())



['bengaluru' 'mumbai/bengaluru' 'bengaluru and gurugram'
 'new york, bengaluru']


# Converting Date TO Date Time

In [150]:
df['Date']=pd.to_datetime(df['Date'],errors='coerce')
df.head()

Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD
0,1,2020-09-01,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000.0
1,2,NaT,shuttl,transportation,app based shuttle service,gurugram,susquehanna growth equity,series c,8048394.0
2,3,2020-09-01,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860.0
3,4,2020-02-01,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000.0
4,5,2020-02-01,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000.0


## Handling NAT

In [151]:
df['Date'].isna().sum()


np.int64(1750)

### Give Default Date to NAT

In [152]:
df['Date'] = df['Date'].fillna(pd.Timestamp('2015-01-01'))
print(df['Date'])
df.head()

0      2020-09-01
1      2015-01-01
2      2020-09-01
3      2020-02-01
4      2020-02-01
          ...    
3039   2015-01-01
3040   2015-01-01
3041   2015-01-01
3042   2015-01-01
3043   2015-01-01
Name: Date, Length: 3040, dtype: datetime64[ns]


Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD
0,1,2020-09-01,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000.0
1,2,2015-01-01,shuttl,transportation,app based shuttle service,gurugram,susquehanna growth equity,series c,8048394.0
2,3,2020-09-01,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860.0
3,4,2020-02-01,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000.0
4,5,2020-02-01,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000.0


In [153]:
df['Date'].isna().sum()


np.int64(0)

## Extract year from Date


In [154]:
df['Year']=df['Date'].dt.year
df.head()


Unnamed: 0,Sr No,Date,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentType,Amount in USD,Year
0,1,2020-09-01,byju’s,e-tech,e-learning,bengaluru,tiger global management,private equity round,200000000.0,2020
1,2,2015-01-01,shuttl,transportation,app based shuttle service,gurugram,susquehanna growth equity,series c,8048394.0,2015
2,3,2020-09-01,mamaearth,e-commerce,retailer of baby and toddler products,bengaluru,sequoia capital india,series b,18358860.0,2020
3,4,2020-02-01,https://www.wealthbucket.in/,fintech,online investment,new delhi,vinod khatumal,pre-series a,3000000.0,2020
4,5,2020-02-01,fashor,fashion and apparel,embroiled clothes for women,mumbai,sprout venture partners,seed round,1800000.0,2020


## Load Cleaned data in Dataset

In [155]:
df.to_csv('cleaned_startup_funding.csv', index=False)
