In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, f_classif
from sklearn.ensemble import RandomForestClassifier

# Load datasets
static_data = pd.read_csv(r"datasets\static_client_data.csv")
target_data = pd.read_csv(r"datasets\target_data.csv")

# Merge datasets
merged_data = static_data.merge(target_data[["client_id", "recommended_strategy"]], on="client_id", how="left")

# Feature Engineering
merged_data["income_to_networth_ratio"] = merged_data["annual_income"] / (merged_data["net_worth"] + 1e-6)
merged_data["adjusted_debt_to_income"] = merged_data["debt_to_income_ratio"] * merged_data["annual_income"]

# Clip numerical values to avoid negatives
merged_data["annual_income"] = merged_data["annual_income"].clip(lower=0)
merged_data["net_worth"] = merged_data["net_worth"].clip(lower=0)

# Binning age, income, and net worth
merged_data["age_group"] = pd.cut(merged_data["age"], bins=[18, 35, 55, np.inf], labels=["Young", "Mid-age", "Senior"], include_lowest=True)
merged_data["income_group"] = pd.cut(merged_data["annual_income"], bins=[0, 50000, 150000, np.inf], labels=["Low", "Medium", "High"], include_lowest=True)
merged_data["net_worth_level"] = pd.cut(merged_data["net_worth"], bins=[0, 50000, 200000, np.inf], labels=["Poor", "Stable", "Wealthy"], include_lowest=True)

# Handle list-type column (preferred_asset_classes)
merged_data["preferred_asset_classes"] = merged_data["preferred_asset_classes"].apply(eval)  # Convert string lists to real lists
mlb = MultiLabelBinarizer()
one_hot_asset_classes = pd.DataFrame(mlb.fit_transform(merged_data["preferred_asset_classes"]), columns=mlb.classes_)
merged_data = pd.concat([merged_data.drop(columns=["preferred_asset_classes"]), one_hot_asset_classes], axis=1)

# Identify categorical and numerical features
categorical_cols = [
    "gender", "employment_status", "investment_goals", "risk_appetite",
    "age_group", "income_group", "net_worth_level"
] + list(mlb.classes_)

numerical_cols = [
    "age", "annual_income", "debt_to_income_ratio", "financial_knowledge_score",
    "investment_horizon_years", "savings_rate", "net_worth",
    "income_to_networth_ratio", "adjusted_debt_to_income"
]

# Label encode categorical features (including target variable)
encoder = LabelEncoder()
for col in categorical_cols + ["recommended_strategy"]:
    merged_data[col] = encoder.fit_transform(merged_data[col])

# Define feature matrix and target variable
X = merged_data[categorical_cols + numerical_cols]
y = merged_data["recommended_strategy"]

# ---- Feature Selection ----

# 1️⃣ Chi-Square for Categorical Features
chi2_selector = SelectKBest(chi2, k="all")
chi2_selector.fit(merged_data[categorical_cols], y)
chi2_results = pd.DataFrame({
    "Feature": categorical_cols,
    "Chi2 Score": chi2_selector.scores_,
    "P-value": chi2_selector.pvalues_
}).sort_values(by="Chi2 Score", ascending=False)

# 2️⃣ ANOVA F-Test for Numerical Features
f_values, p_values = f_classif(merged_data[numerical_cols], y)
anova_results = pd.DataFrame({
    "Feature": numerical_cols,
    "F-Score": f_values,
    "P-value": p_values
}).sort_values(by="F-Score", ascending=False)

# 3️⃣ Mutual Information for All Features
mi_scores = mutual_info_classif(X, y, discrete_features="auto")
mi_results = pd.DataFrame({
    "Feature": X.columns,
    "MI Score": mi_scores
}).sort_values(by="MI Score", ascending=False)

# 4️⃣ Random Forest Feature Importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
rf_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Display Results
print("🔹 Chi-Square Feature Importance:\n", chi2_results)
print("\n🔹 ANOVA (F-Test) Feature Importance:\n", anova_results)
print("\n🔹 Mutual Information Feature Importance:\n", mi_results)
print("\n🔹 Random Forest Feature Importance:\n", rf_importance)


🔹 Chi-Square Feature Importance:
               Feature  Chi2 Score   P-value
1   employment_status    4.053416  0.131769
4           age_group    1.337551  0.512336
2    investment_goals    0.837528  0.657859
7               Bonds    0.226241  0.893043
9        Mutual Funds    0.168171  0.919353
3       risk_appetite    0.149767  0.927852
8                ETFs    0.148987  0.928213
0              gender    0.134586  0.934921
11             Stocks    0.107243  0.947791
10        Real Estate    0.085335  0.958230
6     net_worth_level    0.062290  0.969335
5        income_group    0.002733  0.998634

🔹 ANOVA (F-Test) Feature Importance:
                      Feature   F-Score   P-value
5               savings_rate  1.465720  0.230961
6                  net_worth  1.223366  0.294282
0                        age  0.849076  0.427841
3  financial_knowledge_score  0.549639  0.577176
1              annual_income  0.326068  0.721764
2       debt_to_income_ratio  0.321209  0.725279
7   income_t

In [2]:
selected_features = [
    "income_to_networth_ratio", "adjusted_debt_to_income", "net_worth",
    "annual_income", "debt_to_income_ratio", "age",
    "investment_horizon_years", "savings_rate", "financial_knowledge_score",
    "investment_goals", "employment_status"
]

In [3]:
# Keep only selected features + target variable
final_data = merged_data[selected_features + ["recommended_strategy"]]

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical target variable
le = LabelEncoder()
final_data["recommended_strategy"] = le.fit_transform(final_data["recommended_strategy"])

# Encode categorical features
for col in ["investment_goals", "employment_status"]:
    final_data[col] = le.fit_transform(final_data[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data["recommended_strategy"] = le.fit_transform(final_data["recommended_strategy"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data[col] = le.fit_transform(final_data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data[col] = le.fit_transform(final_data[col])


In [6]:
final_data.head()

Unnamed: 0,income_to_networth_ratio,adjusted_debt_to_income,net_worth,annual_income,debt_to_income_ratio,age,investment_horizon_years,savings_rate,financial_knowledge_score,investment_goals,employment_status,recommended_strategy
0,0.405734,30009.6286,150946.53,61244.14,0.49,63,9,0.09,5,2,1,2
1,0.103827,43421.9565,1072347.37,111338.35,0.39,43,19,0.29,1,1,1,2
2,0.155501,3309.7584,177370.9,27581.32,0.12,56,13,0.07,5,3,1,2
3,0.123734,28517.94,523811.23,64813.5,0.44,37,3,0.22,4,2,0,1
4,0.115768,38034.0275,938672.67,108668.65,0.35,42,19,0.21,2,0,2,1


In [7]:
from sklearn.model_selection import train_test_split

# Separate features and target variable
X = final_data.drop(columns=["recommended_strategy"])
y = final_data["recommended_strategy"]

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.489
              precision    recall  f1-score   support

           0       0.15      0.01      0.02       393
           1       0.52      0.88      0.65      1039
           2       0.29      0.11      0.16       568

    accuracy                           0.49      2000
   macro avg       0.32      0.33      0.28      2000
weighted avg       0.38      0.49      0.39      2000



In [9]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Scale only numerical features
numerical_features = [
    "income_to_networth_ratio", "adjusted_debt_to_income", "net_worth",
    "annual_income", "debt_to_income_ratio", "age",
    "investment_horizon_years", "savings_rate", "financial_knowledge_score"
]

# Fit & transform on training data, transform on test data
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# Predict
y_pred_logreg = logreg.predict(X_test)

# Evaluate
print("📌 Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

📌 Logistic Regression Performance:
Accuracy: 0.5195
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       393
           1       0.52      1.00      0.68      1039
           2       0.00      0.00      0.00       568

    accuracy                           0.52      2000
   macro avg       0.17      0.33      0.23      2000
weighted avg       0.27      0.52      0.36      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
from xgboost import XGBClassifier

# Initialize and train XGBoost
xgb = XGBClassifier(n_estimators=100, learning_rate=0.05, random_state=42)
xgb.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb.predict(X_test)

# Evaluate
print("📌 XGBoost Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

📌 XGBoost Performance:
Accuracy: 0.5105
              precision    recall  f1-score   support

           0       0.17      0.01      0.01       393
           1       0.52      0.96      0.68      1039
           2       0.26      0.03      0.05       568

    accuracy                           0.51      2000
   macro avg       0.32      0.33      0.25      2000
weighted avg       0.38      0.51      0.37      2000

