In [100]:
import pandas as pd
from typing import Any
import re

## CLEANING TASKS <br>
1.	Standardize column names (snake_case) <br>
2.	Strip whitespace from all string fields <br>
3.	Normalize missing values (NaN, empty string, “-”, “NA”, “n/a”) <br>
4.	Convert Amount In USD to numeric <br>
5.	Convert Date to datetime <br>
6.	Clean city names (Bangalore/Bengaluru, Bombay/Mumbai, etc.) <br>
7.	Extract year from Date <br>
8.	Standardize Investment Type (Seed/Seed Funding → Seed) <br>
9.	Split multiple investors into lists <br>
10.	Clean Startup Name (remove suffixes like “ Pvt Ltd”, “ Limited”) <br>



In [101]:
df = pd.read_csv('/Users/olixstudios/Documents/workspace/Projects/indian-startups-transformations/data/starup.csv')
first_column_name = df.columns[0]
df = df.drop(columns=[first_column_name], errors="ignore")

In [102]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    def _snake_case(s: Any) -> str:
        if not isinstance(s, str):
            s = str(s)
        s  = s.strip()
        s = s.replace(" ", "_")
        s = re.sub(r"[^\w\s]", "", s)  
        return s.lower()
    df =  df.copy()
    df.columns = [_snake_case(col) for col in df.columns]
    return df

In [103]:
def normalize_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    MISSING_TOKENS = ["N/A", "n/a", "NA", "na", "-", "--", " ", ""]
    df = df.copy()
    df.replace(to_replace=MISSING_TOKENS, value=pd.NA, inplace=True)
    return df


In [104]:
# df["amount"].fillna(0)

In [105]:
# df["amount"] = df["amount"].str.replace(",", "")
# df["amount"] = pd.to_numeric(df["amount"], errors='coerce')
# df


In [106]:
def clean_amount_column(df: pd.DataFrame, col: str = "amount") -> pd.DataFrame:
    df = df.copy()
    df[col] = df[col].astype(str).fillna("")
    #df[col] = df[col].str.replace(",", "")
    df[col] = df[col].str.strip()
    #df[col] = df[col].str.replace(r"[\$,€£¥]", "", regex=True)
    df[col] = df[col].str.replace(r"[^0-9\.\-]", '', regex=True).replace('', pd.NA)
    df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


In [107]:
def parse_dates(df: pd.DataFrame, col: str = "date") -> pd.DataFrame:
    df = df.copy()
    df[col] = pd.to_datetime(df[col], errors='coerce') 
    return df

In [108]:
def date_analysis(df: pd.DataFrame, date_col: str = "date") -> pd.DataFrame:
    df = df.copy()
    df["year"] = df[date_col].dt.year
    df["date_missing"] = df[date_col].isna()
    return df

In [109]:
print('orig cols:', df.columns.tolist())
df = clean_column_names(df)
print('clean cols:', df.columns.tolist())
df = normalize_missing_values(df)
print('null counts:', df.isna().sum().to_dict())
df = clean_amount_column(df, col='amount')   # or amount_in_usd
print('amount dtype:', df['amount'].dtype)
df = parse_dates(df, col='date')
print('date dtype:', df['date'].dtype)
print(df[['amount','date']].head().to_dict())

orig cols: ['Date', 'Startup', 'Industry', 'Location', 'Investor', 'Type', 'Amount']
clean cols: ['date', 'startup', 'industry', 'location', 'investor', 'type', 'amount']
null counts: {'date': 5, 'startup': 5, 'industry': 5, 'location': 5, 'investor': 35, 'type': 12, 'amount': 57}
amount dtype: float64
date dtype: datetime64[ns]
{'amount': {0: 460000000.0, 1: 300000000.0, 2: 343000000.0, 3: 83000000.0, 4: 7400000.0}, 'date': {0: Timestamp('2021-01-04 00:00:00'), 1: Timestamp('2021-05-04 00:00:00'), 2: NaT, 3: Timestamp('2021-07-04 00:00:00'), 4: NaT}}


In [110]:
import numpy as np
temp = df['type'].unique()
temp = pd.DataFrame(temp)
temp



Unnamed: 0,0
0,Series F
1,Series E
2,Series J
3,Series D
4,Venture
...,...
64,Mezzanine
65,Series B (Extension)
66,Equity Based Funding
67,Private Funding


In [111]:
temp = df['type'].groupby(df['type']).size()
temp

type
 Venture - Series Unknown    1
Angel                        7
Angel Round                  1
Bridge Funding               1
Bridge Round                 1
                            ..
Venture-Series Unknown       2
pre-Seed                     1
pre-Series A                 5
pre-Series B                 1
pre-series A                 1
Name: type, Length: 68, dtype: int64

In [112]:
temp = df['type']
temp

0                 Series F
1                 Series E
2                 Series J
3                 Series D
4                  Venture
              ...         
611         Private Equity
612         Private Equity
613         Private Equity
614         Private Equity
615    Seed/ Angel Funding
Name: type, Length: 616, dtype: object

In [113]:
CANONICAL_PATTERNS = {
    r"^angel.*": "angel",
    r".*angel funding.*": "angel",
    r"^seed.*": "seed",
    r".*seed funding.*": "seed",
    r".*seed / angel.*": "seed",
    r"^pre[\s\-]?seed.*": "pre_seed",
    r"^pre[\s\-]?series a.*": "pre_seed",   # or "pre_series_a"
    r"^series [a-z0-9].*": "series",
    r".*venture.*": "venture",
    r".*private equity.*": "private_equity",
    r".*debt.*": "debt",
    r".*equity.*": "equity",
    r".*mezzanine.*": "mezzanine",
    r".*m&a.*": "ma",
    r".*bridge.*": "bridge",
    r".*secondary market.*": "secondary_market",
    r".*in progress.*": "in_progress",
    r".*unspecified.*": "unspecified",
}

def canonical_investment_type(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.lower().str.strip()

    out = s.replace(CANONICAL_PATTERNS, regex=True)
    out = out.where(out.isin(set(CANONICAL_PATTERNS.values())), other="other")

    return out


In [114]:
df["type"].value_counts()

type
Private Equity         181
Seed/ Angel Funding    119
Series A                45
Series B                31
Series C                28
                      ... 
Venture Series           1
Funding                  1
Pre-series A             1
Series G                 1
Pre Seed                 1
Name: count, Length: 68, dtype: int64

In [115]:

df["type_canonical"] = canonical_investment_type(df["type"])
df["type_canonical"].value_counts()

type_canonical
equity              188
series              147
angel               135
seed                 44
other                26
debt                 24
pre_seed             22
venture              20
in_progress           3
secondary_market      2
bridge                2
unspecified           1
ma                    1
mezzanine             1
Name: count, dtype: int64

In [116]:
SPLIT_PATTERN = r",\s*|;\s*|\s+&\s+"
def split_investors(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    out = s.fillna("").str.split(SPLIT_PATTERN)
    return out

df['investor_list'] = split_investors(df['investor'])
df['investor_count'] = df['investor_list'].apply(len)
df

Unnamed: 0,date,startup,industry,location,investor,type,amount,type_canonical,investor_list,investor_count
0,2021-01-04,BYJU'S,Edu-tech,Bengaluru,Innoven Capital,Series F,460000000.0,series,[Innoven Capital],1
1,2021-05-04,Meesho,E-commerce,Bengaluru,SoftBank Vision Fund 2,Series E,300000000.0,series,[SoftBank Vision Fund 2],1
2,NaT,Swiggy,Online Food Delivery,Bengaluru,"Amansa Holdings, Carmignac, Falcon Edge Capita...",Series J,343000000.0,series,"[Amansa Holdings, Carmignac, Falcon Edge Capit...",5
3,2021-07-04,Groww,FinTech,Bengaluru,"MC Global Edtech, B Capital, Baron, others",Series D,83000000.0,series,"[MC Global Edtech, B Capital, Baron, others]",4
4,NaT,Beldara,E-commerce,Mumbai,Hindustan Media Ventures,Venture,7400000.0,venture,[Hindustan Media Ventures],1
...,...,...,...,...,...,...,...,...,...,...
611,NaT,TheCapitalNet,Fin-Tech,Hyderabad,Lindwall Family Investments LLC (LFI),Private Equity,500000.0,equity,[Lindwall Family Investments LLC (LFI)],1
612,NaT,Shuttl,Consumer Internet,Gurugram,Amazon Alexa Fund & Dentsu Ventures,Private Equity,11000000.0,equity,"[Amazon Alexa Fund, Dentsu Ventures]",2
613,NaT,Cure Fit,Consumer Internet,Bengaluru,"IDG Ventures, Accel Partners, Kalaari Capital ...",Private Equity,120000000.0,equity,"[IDG Ventures, Accel Partners, Kalaari Capital...",3
614,NaT,Five Star Group,Fin-Tech,Chennai,"TPG, Norwest Venture Partners, Sequoia Capital...",Private Equity,100000000.0,equity,"[TPG, Norwest Venture Partners, Sequoia Capita...",4


In [123]:
temp = df['startup'].nunique() == len(df['startup'])
print(df['startup'].nunique())
print(len(df['startup']))
temp

326
616


False

In [None]:
gp = df['startup'].groupby(df['startup']).size().sort_values(ascending=False)
pd.DataFrame(gp)
gp

startup
Nykaa                6
Shuttl               6
LetsTransport        5
Udaan                5
BYJU'S               5
                    ..
 InCred Finance      1
LetsTransport\n\n    1
Licious              1
Lo! Foods            1
 Guiddoo             1
Name: startup, Length: 326, dtype: int64

In [131]:
SUFFIX_PATTERN = r"""
    \s+(
        pvt\.?\s*ltd      |   # Pvt Ltd / Pvt. Ltd
        private\s*limited |   # Private Limited
        ltd\.?            |   # Ltd / Ltd.
        limited           |   # Limited
        inc\.?            |   # Inc / Inc.
        incorporated      |
        corp\.?           |
        corporation
    )\s*$
"""    


def clean_startup_name(name: str) -> str:
    if not isinstance(name, str):
        return name

    # Normalize spaces and case
    s = name.strip()
    s = re.sub(SUFFIX_PATTERN, "", s, flags=re.IGNORECASE | re.VERBOSE)
    return s.strip()
df["startup_clean"] = df["startup"].apply(clean_startup_name)
df

Unnamed: 0,date,startup,industry,location,investor,type,amount,type_canonical,investor_list,investor_count,startup_clean
0,2021-01-04,BYJU'S,Edu-tech,Bengaluru,Innoven Capital,Series F,460000000.0,series,[Innoven Capital],1,BYJU'S
1,2021-05-04,Meesho,E-commerce,Bengaluru,SoftBank Vision Fund 2,Series E,300000000.0,series,[SoftBank Vision Fund 2],1,Meesho
2,NaT,Swiggy,Online Food Delivery,Bengaluru,"Amansa Holdings, Carmignac, Falcon Edge Capita...",Series J,343000000.0,series,"[Amansa Holdings, Carmignac, Falcon Edge Capit...",5,Swiggy
3,2021-07-04,Groww,FinTech,Bengaluru,"MC Global Edtech, B Capital, Baron, others",Series D,83000000.0,series,"[MC Global Edtech, B Capital, Baron, others]",4,Groww
4,NaT,Beldara,E-commerce,Mumbai,Hindustan Media Ventures,Venture,7400000.0,venture,[Hindustan Media Ventures],1,Beldara
...,...,...,...,...,...,...,...,...,...,...,...
611,NaT,TheCapitalNet,Fin-Tech,Hyderabad,Lindwall Family Investments LLC (LFI),Private Equity,500000.0,equity,[Lindwall Family Investments LLC (LFI)],1,TheCapitalNet
612,NaT,Shuttl,Consumer Internet,Gurugram,Amazon Alexa Fund & Dentsu Ventures,Private Equity,11000000.0,equity,"[Amazon Alexa Fund, Dentsu Ventures]",2,Shuttl
613,NaT,Cure Fit,Consumer Internet,Bengaluru,"IDG Ventures, Accel Partners, Kalaari Capital ...",Private Equity,120000000.0,equity,"[IDG Ventures, Accel Partners, Kalaari Capital...",3,Cure Fit
614,NaT,Five Star Group,Fin-Tech,Chennai,"TPG, Norwest Venture Partners, Sequoia Capital...",Private Equity,100000000.0,equity,"[TPG, Norwest Venture Partners, Sequoia Capita...",4,Five Star Group
