In [1]:
import pandas as pd

In [2]:
# -------------------------------
# Step 0: Load master table
# -------------------------------
df = pd.read_csv("../data/master_table.csv")

# --- Safety check: normalise alternate column names ---
# The file may carry the short names 'nav' / 'date' instead of the working
# names used by the steps below. A short name is renamed only when the column
# is present and the corresponding working name is not already there.
column_aliases = {'nav': 'nav_value', 'date': 'nav_date'}
pending_renames = {
    short_name: working_name
    for short_name, working_name in column_aliases.items()
    if short_name in df.columns and working_name not in df.columns
}
df = df.rename(columns=pending_renames)

# -------------------------------
# Step 1: Ensure NAV column is numeric
# -------------------------------
# errors='coerce' turns entries that cannot be parsed as numbers into NaN
# instead of raising.
nav_as_number = pd.to_numeric(df['nav_value'], errors='coerce')

# -------------------------------
# Step 2: Ensure date column is datetime
# -------------------------------
nav_as_datetime = pd.to_datetime(df['nav_date'])

# Both columns already exist, so assign() replaces them where they are.
df = df.assign(nav_value=nav_as_number, nav_date=nav_as_datetime)

# -------------------------------
# Step 3: Sort by fund and date
# -------------------------------
# ignore_index=True renumbers the rows 0..n-1 once they are sorted.
df = df.sort_values(by=['fund_name', 'nav_date'], ignore_index=True)

# -------------------------------
# Step 4: Forward-fill missing NAVs (weekends/holidays)
# -------------------------------
# Each fund is expanded to one row per calendar day between its first and last
# date. The rows added by asfreq('D') start out empty and ffill() copies the
# most recent earlier values into them.
#
# Funds are handled in an explicit loop instead of groupby(...).apply(...):
# pandas 2.2 deprecates apply() operating on the grouping column, and without
# it 'fund_name' would be missing from the result.

# asfreq() reindexes on nav_date and raises when a date is repeated, so keep a
# single row per (fund, date). keep='last' retains the final occurrence in the
# current row order.
duplicate_rows = df.duplicated(subset=['fund_name', 'nav_date'], keep='last')
if duplicate_rows.any():
    print(f"[WARN] Dropping {int(duplicate_rows.sum())} duplicate fund/date rows before forward-fill")
    df = df.loc[~duplicate_rows]

daily_frames = []
for fund, fund_rows in df.groupby('fund_name'):
    daily = fund_rows.set_index('nav_date').asfreq('D').ffill()
    # Label every row from the group key. This replaces the former
    # df['fund_name'].ffill(inplace=True), a chained in-place call that
    # modifies a temporary (and so does nothing) under pandas Copy-on-Write.
    daily['fund_name'] = fund
    daily_frames.append(daily)

# pd.concat() rejects an empty list, so an empty table is left unchanged.
if daily_frames:
    # reset_index() turns the nav_date index back into a regular column.
    df = pd.concat(daily_frames).reset_index()

# -------------------------------
# Step 5: Rename columns before saving CSV
# -------------------------------
# The saved file uses the short column names 'date' and 'nav'.
output_column_names = {'nav_date': 'date', 'nav_value': 'nav'}
df = df.rename(columns=output_column_names)

# -------------------------------
# Step 6: Save cleaned master table (overwrite existing)
# -------------------------------
# index=False keeps the row index out of the written file.
df.to_csv("../data/master_table.csv", index=False)

print(f"[OK] Master table cleaned, forward-filled, and saved. Total rows: {len(df)}")

# Optional preview: first ten rows
df.head(10)


[OK] Master table cleaned, forward-filled, and saved. Total rows: 36929


Unnamed: 0,date,fund_name,nav
0,2013-01-02,DSP Midcap Fund – Direct Plan – Growth,20.542
1,2013-01-03,DSP Midcap Fund – Direct Plan – Growth,20.619
2,2013-01-04,DSP Midcap Fund – Direct Plan – Growth,20.648
3,2013-01-05,DSP Midcap Fund – Direct Plan – Growth,20.648
4,2013-01-06,DSP Midcap Fund – Direct Plan – Growth,20.648
5,2013-01-07,DSP Midcap Fund – Direct Plan – Growth,20.687
6,2013-01-08,DSP Midcap Fund – Direct Plan – Growth,20.72
7,2013-01-09,DSP Midcap Fund – Direct Plan – Growth,20.658
8,2013-01-10,DSP Midcap Fund – Direct Plan – Growth,20.577
9,2013-01-11,DSP Midcap Fund – Direct Plan – Growth,20.265


In [3]:
# -------------------------------
# Step 7: Clean fund_name minimally (remove 'Growth' or 'Direct Plan', strip extra spaces or trailing hyphens)
# -------------------------------
# The patterns are applied in order. The first two are not anchored, so the
# words are removed wherever they occur in the name, together with any
# whitespace and an optional en dash in front of them. The last pattern strips
# trailing whitespace and ASCII hyphens only.
name_cleanup_patterns = [
    r'\s*–?\s*Direct Plan',
    r'\s*–?\s*Growth',
    r'[\s\-]+$',
]

cleaned_names = df['fund_name']
for pattern in name_cleanup_patterns:
    cleaned_names = cleaned_names.str.replace(pattern, '', regex=True)

# Final strip removes any leading whitespace left behind.
df['fund_name'] = cleaned_names.str.strip()

# -------------------------------
# Step 8: Add industry-standard cap & fund_type
# -------------------------------
# One entry per cleaned fund name: (cap, fund_type). Keeping both labels in a
# single table means a fund cannot be added to one lookup and forgotten in the
# other.
fund_classification = {
    "Nippon India Small Cap Fund": ("Small Cap", "Equity"),
    "DSP Midcap Fund": ("Mid Cap", "Equity"),
    "HDFC Large and Mid Cap Fund": ("Large & Mid Cap", "Equity"),
    "SBI Large & Midcap Fund": ("Large & Mid Cap", "Equity"),
    "ICICI Prudential Large Cap Fund (erstwhile Bluechip Fund)": ("Large Cap", "Equity"),
    "UTI Nifty 50 Index Fund": ("Large Cap", "Index / Passive"),
    "ICICI Prudential Balanced Advantage Fund": ("Multi / Hybrid", "Balanced / Hybrid"),
    "HDFC Corporate Bond Fund": ("Debt", "Debt"),
}

# Per-column lookups, kept under their original names.
cap_mapping = {name: cap for name, (cap, _) in fund_classification.items()}
type_mapping = {name: fund_type for name, (_, fund_type) in fund_classification.items()}

df['cap'] = df['fund_name'].map(cap_mapping)
df['fund_type'] = df['fund_name'].map(type_mapping)

# map() leaves NaN for any name that is not a key of the table. Report those
# names instead of letting them pass unnoticed; the rows themselves are kept.
is_unmapped = ~df['fund_name'].isin(list(fund_classification))
unmapped_names = df.loc[is_unmapped, 'fund_name'].dropna().unique().tolist()
if unmapped_names:
    print(f"[WARN] No cap/fund_type mapping for: {unmapped_names}")

# -------------------------------
# Step 9: Save updated master table
# -------------------------------
# Writes to the same path the table was loaded from; index=False keeps the
# row index out of the file.
df.to_csv("../data/master_table.csv", index=False)

print("[OK] Added 'cap' and 'fund_type' columns to master_table.csv after cleaning fund_name")

# One row per distinct (fund_name, cap, fund_type) combination.
summary_columns = ['fund_name', 'cap', 'fund_type']
df.loc[:, summary_columns].drop_duplicates()


[OK] Added 'cap' and 'fund_type' columns to master_table.csv after cleaning fund_name


Unnamed: 0,fund_name,cap,fund_type
0,DSP Midcap Fund,Mid Cap,Equity
4684,HDFC Corporate Bond Fund,Debt,Debt
9369,HDFC Large and Mid Cap Fund,Large & Mid Cap,Equity
13509,ICICI Prudential Balanced Advantage Fund,Multi / Hybrid,Balanced / Hybrid
18193,ICICI Prudential Large Cap Fund (erstwhile Blu...,Large Cap,Equity
22877,Nippon India Small Cap Fund,Small Cap,Equity
27561,SBI Large & Midcap Fund,Large & Mid Cap,Equity
32245,UTI Nifty 50 Index Fund,Large Cap,Index / Passive
