In [5]:
import pandas as pd

# --- Load files (paths from your upload notes) ---
# Both CSVs are read from the current working directory.
returns = pd.read_csv("20250810_funds_open-end-fund-performance.csv")
profiles = pd.read_csv("20250810_funds_open-end-fund-profile.csv")

# --- Pick a join key that exists in BOTH (prefers stable IDs over names) ---
# Candidates are tried in order, so identifier-like columns win over "Fund Name".
possible_keys = ["With Intelligence Fund ID", "Fund ID", "Fund Code", "Fund Name"]
join_key = next(
    (k for k in possible_keys if k in returns.columns and k in profiles.columns),
    None,
)
if join_key is None:
    # A bare next() would raise an uninformative StopIteration here; fail with
    # a message that says what was looked for and what is actually available.
    raise KeyError(
        f"No common join key found in both files; tried {possible_keys}. "
        f"returns columns: {list(returns.columns)}; "
        f"profiles columns: {list(profiles.columns)}"
    )

# --- Pick a strategy column from profiles ---
# Same first-match-wins rule, checked against the profiles file only.
possible_strategy_cols = ["Strategy", "Primary Strategy", "Main Strategy", "Investment Strategy", "Fund Strategy"]
strategy_col = next(
    (c for c in possible_strategy_cols if c in profiles.columns),
    None,
)
if strategy_col is None:
    raise KeyError(
        f"No strategy column found in profiles; tried {possible_strategy_cols}. "
        f"profiles columns: {list(profiles.columns)}"
    )

# --- Minimal cleanup on the key if it's Fund Name (helps reduce near-dup name mismatches) ---
def _norm_name(s: pd.Series) -> pd.Series:
    return (
        s.astype(str)
         .str.strip()
         .str.replace(r"\s+", " ", regex=True)
         .str.replace(r"[’']", "'", regex=True)   # unify apostrophes
    )

# --- Choose the merge column ---
# Free-text fund names are normalized into a helper "_key" column on both
# frames (this adds a column to `returns` and `profiles` in place); any other
# join key is used as-is.
if join_key == "Fund Name":
    returns["_key"] = _norm_name(returns[join_key])
    profiles["_key"] = _norm_name(profiles[join_key])
    key_col = "_key"
else:
    key_col = join_key

# --- Collapse profiles to one row per key with a single strategy (first non-null) ---
prof_strat = (
    profiles[[key_col, strategy_col]]
    .dropna(subset=[key_col])                 # rows without a key cannot be joined
    # A stable sort keeps rows that share a key in their original file order,
    # so "first" below always picks the same row. The default quicksort gives
    # no such guarantee for ties.
    .sort_values(key_col, kind="stable")
    .groupby(key_col, as_index=False)
    .agg(strategy=(strategy_col, "first"))    # groupby "first" skips nulls
)

# --- Merge strategy onto monthly returns ---
# Left join keeps every returns row; unmatched rows get a missing "strategy".
# validate="many_to_one" makes pandas raise if prof_strat ever has duplicate
# keys, which would silently multiply returns rows.
df = returns.merge(prof_strat, on=key_col, how="left", validate="many_to_one")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7830 entries, 0 to 7829
Data columns (total 84 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Fund ID                                          7830 non-null   int64  
 1   Fund Name                                        7830 non-null   object 
 2   Manager Name                                     7830 non-null   object 
 3   Manager ID                                       7830 non-null   int64  
 4   Fund AuM (m)                                     6301 non-null   float64
 5   Fund AuM Date                                    6301 non-null   object 
 6   Strategy AuM (m)                                 3040 non-null   float64
 7   Strategy AuM Date                                3035 non-null   object 
 8   Last 3 Months                                    7440 non-null   object 
 9   Worst Month                   