In [41]:
import pandas as pd
from typing import Any
import re

## CLEANING TASKS
	1.	Standardize column names (snake_case)
	2.	Strip whitespace from all string fields
	3.	Normalize missing values (NaN, empty string, “-”, “NA”, “n/a”)
	4.	Convert Amount In USD to numeric
	5.	Convert Date to datetime
    6.	Clean city names (Bangalore/Bengaluru, Bombay/Mumbai, etc.)
	7.	Extract year from Date
	8.	Standardize Investment Type (Seed/Seed Funding → Seed)
	9.	Split multiple investors into lists
	10.	Clean Startup Name (remove suffixes like “ Pvt Ltd”, “ Limited”)


In [42]:
df = pd.read_csv('/Users/olixstudios/Documents/workspace/Projects/indian-startups-transformations/data/starup.csv')

In [43]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    def _snake_case(s: Any) -> str:
        if not isinstance(s, str):
            s = str(s)
        s  = s.strip()
        s = s.replace(" ", "_")
        s = re.sub(r"[^\w\s]", "", s)  
        return s.lower()
    df =  df.copy()
    df.columns = [_snake_case(col) for col in df.columns]
    return df

In [44]:
df = clean_column_names(df)
df

Unnamed: 0,unnamed_0,date,startup,industry,location,investor,type,amount
0,0,01-04-2021,BYJU'S,Edu-tech,Bengaluru,Innoven Capital,Series F,460000000
1,1,05-04-2021,Meesho,E-commerce,Bengaluru,SoftBank Vision Fund 2,Series E,300000000
2,2,14-04-2021,Swiggy,Online Food Delivery,Bengaluru,"Amansa Holdings, Carmignac, Falcon Edge Capita...",Series J,343000000
3,3,07-04-2021,Groww,FinTech,Bengaluru,"MC Global Edtech, B Capital, Baron, others",Series D,83000000
4,4,14-04-2021,Beldara,E-commerce,Mumbai,Hindustan Media Ventures,Venture,7400000
...,...,...,...,...,...,...,...,...
611,611,26-07-2018,TheCapitalNet,Fin-Tech,Hyderabad,Lindwall Family Investments LLC (LFI),Private Equity,500000
612,612,30-07-2018,Shuttl,Consumer Internet,Gurugram,Amazon Alexa Fund & Dentsu Ventures,Private Equity,11000000
613,613,30-07-2018,Cure Fit,Consumer Internet,Bengaluru,"IDG Ventures, Accel Partners, Kalaari Capital ...",Private Equity,120000000
614,614,31-07-2018,Five Star Group,Fin-Tech,Chennai,"TPG, Norwest Venture Partners, Sequoia Capital...",Private Equity,100000000


In [45]:
def normalize_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    MISSING_TOKENS = ["N/A", "n/a", "NA", "na", "-", "--", " ", ""]
    df = df.copy()
    df.replace(to_replace=MISSING_TOKENS, value=pd.NA, inplace=True)
    return df

df = normalize_missing_values(df)
df


Unnamed: 0,unnamed_0,date,startup,industry,location,investor,type,amount
0,0,01-04-2021,BYJU'S,Edu-tech,Bengaluru,Innoven Capital,Series F,460000000
1,1,05-04-2021,Meesho,E-commerce,Bengaluru,SoftBank Vision Fund 2,Series E,300000000
2,2,14-04-2021,Swiggy,Online Food Delivery,Bengaluru,"Amansa Holdings, Carmignac, Falcon Edge Capita...",Series J,343000000
3,3,07-04-2021,Groww,FinTech,Bengaluru,"MC Global Edtech, B Capital, Baron, others",Series D,83000000
4,4,14-04-2021,Beldara,E-commerce,Mumbai,Hindustan Media Ventures,Venture,7400000
...,...,...,...,...,...,...,...,...
611,611,26-07-2018,TheCapitalNet,Fin-Tech,Hyderabad,Lindwall Family Investments LLC (LFI),Private Equity,500000
612,612,30-07-2018,Shuttl,Consumer Internet,Gurugram,Amazon Alexa Fund & Dentsu Ventures,Private Equity,11000000
613,613,30-07-2018,Cure Fit,Consumer Internet,Bengaluru,"IDG Ventures, Accel Partners, Kalaari Capital ...",Private Equity,120000000
614,614,31-07-2018,Five Star Group,Fin-Tech,Chennai,"TPG, Norwest Venture Partners, Sequoia Capital...",Private Equity,100000000


In [46]:
# df["amount"].fillna(0)

In [47]:
# df["amount"] = df["amount"].str.replace(",", "")
# df["amount"] = pd.to_numeric(df["amount"], errors='coerce')
# df


In [48]:
def clean_amount_column(df: pd.DataFrame, col: str = "amount") -> pd.DataFrame:
    df = df.copy()
    df[col] = df[col].fillna(0)
    #df[col] = df[col].str.replace(",", "")
    df[col] = df[col].str.strip()
    #df[col] = df[col].str.replace(r"[\$,€£¥]", "", regex=True)
    df[col] = df[col].str.replace(r"[^0-9\.\-]", "", regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

df = clean_amount_column(df, col="amount")
df

Unnamed: 0,unnamed_0,date,startup,industry,location,investor,type,amount
0,0,01-04-2021,BYJU'S,Edu-tech,Bengaluru,Innoven Capital,Series F,460000000.0
1,1,05-04-2021,Meesho,E-commerce,Bengaluru,SoftBank Vision Fund 2,Series E,300000000.0
2,2,14-04-2021,Swiggy,Online Food Delivery,Bengaluru,"Amansa Holdings, Carmignac, Falcon Edge Capita...",Series J,343000000.0
3,3,07-04-2021,Groww,FinTech,Bengaluru,"MC Global Edtech, B Capital, Baron, others",Series D,83000000.0
4,4,14-04-2021,Beldara,E-commerce,Mumbai,Hindustan Media Ventures,Venture,7400000.0
...,...,...,...,...,...,...,...,...
611,611,26-07-2018,TheCapitalNet,Fin-Tech,Hyderabad,Lindwall Family Investments LLC (LFI),Private Equity,500000.0
612,612,30-07-2018,Shuttl,Consumer Internet,Gurugram,Amazon Alexa Fund & Dentsu Ventures,Private Equity,11000000.0
613,613,30-07-2018,Cure Fit,Consumer Internet,Bengaluru,"IDG Ventures, Accel Partners, Kalaari Capital ...",Private Equity,120000000.0
614,614,31-07-2018,Five Star Group,Fin-Tech,Chennai,"TPG, Norwest Venture Partners, Sequoia Capital...",Private Equity,100000000.0


In [49]:
def parse_dates(df: pd.DataFrame, col: str = "date") -> pd.DataFrame:
    df = df.copy()
    df[col] = pd.to_datetime(df[col], errors='coerce')
    return df
df = parse_dates(df, col="date")
df.dtypes

unnamed_0             int64
date         datetime64[ns]
startup              object
industry             object
location             object
investor             object
type                 object
amount              float64
dtype: object

In [50]:
out = df.copy()
for col in df.columns:
    if out[col].dtype == "object":
        print(out[col].dtype)
        out[col] = out[col].str.strip()



object
object
object
object
object


In [51]:
out.dtypes == "object"


unnamed_0    False
date         False
startup       True
industry      True
location      True
investor      True
type          True
amount       False
dtype: bool

In [52]:
missOut = df.copy()
missOut.isnull().sum()
missOut.isna().sum()

unnamed_0      0
date         291
startup        5
industry       5
location       5
investor      35
type          12
amount        66
dtype: int64

In [53]:
missing_tokens = ["", "-", "NA", "N/A", "na", "n/a", "Na", "NaN", "nan"]
missOut = missOut.replace(missing_tokens, pd.NA)
missOut.isna()


Unnamed: 0,unnamed_0,date,startup,industry,location,investor,type,amount
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
611,False,True,False,False,False,False,False,False
612,False,True,False,False,False,False,False,False
613,False,True,False,False,False,False,False,False
614,False,True,False,False,False,False,False,False
