In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('startup_funding.csv')

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [2]:
# Get a summary of the DataFrame, including data types and non-null counts
df.info()

# Get descriptive statistics for numerical columns
df.describe()

# Check for the total number of missing values in each column
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


Sr No                   0
Date dd/mm/yyyy         0
Startup Name            0
Industry Vertical     171
SubVertical           936
City  Location        180
Investors Name         24
InvestmentnType         4
Amount in USD         960
Remarks              2625
dtype: int64

In [3]:
# Strip leading/trailing whitespace from all string columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

# Standardize city names (e.g., 'Bangalore' to 'Bengaluru')
df['CityLocation'] = df['CityLocation'].replace('Bangalore', 'Bengaluru')

# Standardize investment types (e.g., 'Seed Funding' to 'Seed')
df['InvestmentType'] = df['InvestmentType'].replace('Seed Funding', 'Seed')
df['InvestmentType'] = df['InvestmentType'].replace('Angel Funding', 'Angel')

# Check the unique values to see if cleaning was successful
print("Unique Cities after cleaning:")
print(df['CityLocation'].unique())

print("\nUnique Investment Types after cleaning:")
print(df['InvestmentType'].unique())

KeyError: 'CityLocation'

In [4]:
print(df.columns)

Index(['Sr No', 'Date dd/mm/yyyy', 'Startup Name', 'Industry Vertical',
       'SubVertical', 'City  Location', 'Investors Name', 'InvestmentnType',
       'Amount in USD', 'Remarks'],
      dtype='object')


In [5]:
# Clean all column names by stripping whitespace
df.columns = df.columns.str.strip()

# Now, the column for city is named 'City Location'
# Let's verify the new column names
print(df.columns)

Index(['Sr No', 'Date dd/mm/yyyy', 'Startup Name', 'Industry Vertical',
       'SubVertical', 'City  Location', 'Investors Name', 'InvestmentnType',
       'Amount in USD', 'Remarks'],
      dtype='object')


In [6]:
# Standardize city names
df['City Location'] = df['City Location'].replace('Bangalore', 'Bengaluru')

# Standardize investment types (note the column name is 'InvestmentnType' with an 'n')
df['InvestmentnType'] = df['InvestmentnType'].replace('Seed Funding', 'Seed')
df['InvestmentnType'] = df['InvestmentnType'].replace('Angel Funding', 'Angel')

# Check the unique values to see if cleaning was successful
print("Unique Cities after cleaning:")
print(df['City Location'].unique())

# Count the number of startups per city
startups_per_city = df['City Location'].value_counts().head(10)

# Create a bar chart
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 7))
startups_per_city.plot(kind='bar', color='lightcoral')
plt.title('Top 10 Cities with the Most Startups Funded')
plt.xlabel('City')
plt.ylabel('Number of Startups')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

KeyError: 'City Location'