In [1]:
import pandas as pd

# Load datasets
static_data = pd.read_csv(r"datasets\static_client_data.csv")
historical_data = pd.read_csv(r"datasets\time_series_data.csv")
target_data = pd.read_csv(r"datasets\target_data.csv")

In [2]:
# Merge with target data (only taking 'recommended_strategy')
merged_data = static_data.merge(target_data[["client_id", "recommended_strategy"]], on="client_id", how="left")

# Check the final dataset
print(merged_data.head())

                              client_id  age gender employment_status  \
0  96c4c0a3-bb3f-4ac1-81ad-0850cd29911f   63  Other          Salaried   
1  35fb4c11-fb1a-4eeb-addc-bd6ff6cb7934   43   Male          Salaried   
2  e5aafbe0-c869-41d9-acf1-1b019363e449   56  Other          Salaried   
3  43b978dd-4dd5-4f21-96d2-63ab16c814a3   37  Other           Retired   
4  abe77866-df1b-4a5c-ad96-eb78dff4ffc9   42   Male     Self-Employed   

   annual_income  debt_to_income_ratio  financial_knowledge_score  \
0       61244.14                  0.49                          5   
1      111338.35                  0.39                          1   
2       27581.32                  0.12                          5   
3       64813.50                  0.44                          4   
4      108668.65                  0.35                          2   

      investment_goals risk_appetite  investment_horizon_years  dependents  \
0           Retirement        Medium                         9      

In [3]:
merged_data.isnull().sum()

client_id                    0
age                          0
gender                       0
employment_status            0
annual_income                0
debt_to_income_ratio         0
financial_knowledge_score    0
investment_goals             0
risk_appetite                0
investment_horizon_years     0
dependents                   0
preferred_asset_classes      0
savings_rate                 0
net_worth                    0
recommended_strategy         0
dtype: int64

In [4]:
merged_data.duplicated().sum()

np.int64(0)

In [5]:
merged_data["income_to_networth_ratio"] = merged_data["annual_income"] / (merged_data["net_worth"] + 1e-6)
merged_data["adjusted_debt_to_income"] = merged_data["debt_to_income_ratio"] * merged_data["annual_income"]

In [10]:
merged_data["annual_income"] = merged_data["annual_income"].clip(lower=0)
merged_data["net_worth"] = merged_data["net_worth"].clip(lower=0)

In [11]:
import numpy as np

merged_data["age_group"] = pd.cut(
    merged_data["age"], bins=[18, 35, 55, np.inf], labels=["Young", "Mid-age", "Senior"], include_lowest=True
)

merged_data["income_group"] = pd.cut(
    merged_data["annual_income"], bins=[0, 50000, 150000, np.inf], labels=["Low", "Medium", "High"], include_lowest=True
)

merged_data["net_worth_level"] = pd.cut(
    merged_data["net_worth"], bins=[0, 50000, 200000, np.inf], labels=["Poor", "Stable", "Wealthy"], include_lowest=True
)

In [9]:
merged_data.columns

Index(['client_id', 'age', 'gender', 'employment_status', 'annual_income',
       'debt_to_income_ratio', 'financial_knowledge_score', 'investment_goals',
       'risk_appetite', 'investment_horizon_years', 'dependents',
       'preferred_asset_classes', 'savings_rate', 'net_worth',
       'recommended_strategy', 'income_to_networth_ratio',
       'adjusted_debt_to_income', 'age_group', 'income_group',
       'net_worth_level'],
      dtype='object')

In [12]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2, SelectKBest

# Convert string representation of lists to actual lists (if needed)
merged_data["preferred_asset_classes"] = merged_data["preferred_asset_classes"].apply(eval)  # Only if stored as a string

# One-Hot Encoding for List-Type Column
mlb = MultiLabelBinarizer()
one_hot_asset_classes = pd.DataFrame(mlb.fit_transform(merged_data["preferred_asset_classes"]), columns=mlb.classes_)

# Merge one-hot encoded columns with the dataset
merged_data = pd.concat([merged_data.drop(columns=["preferred_asset_classes"]), one_hot_asset_classes], axis=1)

In [13]:
categorical_cols = [
    "gender", "employment_status", "investment_goals", "risk_appetite",
    "age_group", "income_group", "net_worth_level"
] + list(mlb.classes_)  # Include one-hot encoded asset classes

encoder = LabelEncoder()
for col in categorical_cols + ["recommended_strategy"]:  # Include target variable
    merged_data[col] = encoder.fit_transform(merged_data[col])

In [14]:
X = merged_data[categorical_cols]  # Feature set
y = merged_data["recommended_strategy"]  # Target variable

# Chi-Square Feature Selection
chi2_selector = SelectKBest(chi2, k="all")  # Select all for ranking
chi2_selector.fit(X, y)

# Get Chi2 scores and p-values
feature_importance = pd.DataFrame({
    "Feature": categorical_cols,
    "Chi2 Score": chi2_selector.scores_,
    "P-value": chi2_selector.pvalues_
}).sort_values(by="Chi2 Score", ascending=False)

# Print feature importance results
print(feature_importance)

              Feature  Chi2 Score   P-value
1   employment_status    4.053416  0.131769
4           age_group    1.337551  0.512336
2    investment_goals    0.837528  0.657859
7               Bonds    0.226241  0.893043
9        Mutual Funds    0.168171  0.919353
3       risk_appetite    0.149767  0.927852
8                ETFs    0.148987  0.928213
0              gender    0.134586  0.934921
11             Stocks    0.107243  0.947791
10        Real Estate    0.085335  0.958230
6     net_worth_level    0.062290  0.969335
5        income_group    0.002733  0.998634
