In [1]:
import pandas as pd

# Load datasets
static_data = pd.read_csv(r"datasets\static_client_data.csv")
historical_data = pd.read_csv(r"datasets\time_series_data.csv")
target_data = pd.read_csv(r"datasets\target_data.csv")

In [2]:
# Drop the 'month' column since it's not needed for averaging
historical_avg = historical_data.drop(columns=["month"]).groupby("client_id").mean().reset_index()

In [3]:
# Merge static data and averaged historical data
merged_data = static_data.merge(historical_avg, on="client_id", how="left")

# Merge with target data (only taking 'recommended_strategy')
merged_data = merged_data.merge(target_data[["client_id", "recommended_strategy"]], on="client_id", how="left")

# Check the final dataset
print(merged_data.head())

                              client_id  age gender employment_status  \
0  96c4c0a3-bb3f-4ac1-81ad-0850cd29911f   63  Other          Salaried   
1  35fb4c11-fb1a-4eeb-addc-bd6ff6cb7934   43   Male          Salaried   
2  e5aafbe0-c869-41d9-acf1-1b019363e449   56  Other          Salaried   
3  43b978dd-4dd5-4f21-96d2-63ab16c814a3   37  Other           Retired   
4  abe77866-df1b-4a5c-ad96-eb78dff4ffc9   42   Male     Self-Employed   

   annual_income  debt_to_income_ratio  financial_knowledge_score  \
0       61244.14                  0.49                          5   
1      111338.35                  0.39                          1   
2       27581.32                  0.12                          5   
3       64813.50                  0.44                          4   
4      108668.65                  0.35                          2   

      investment_goals risk_appetite  investment_horizon_years  ...  \
0           Retirement        Medium                         9  ...   
1   

In [4]:
merged_data.isnull().sum()

client_id                      0
age                            0
gender                         0
employment_status              0
annual_income                  0
debt_to_income_ratio           0
financial_knowledge_score      0
investment_goals               0
risk_appetite                  0
investment_horizon_years       0
dependents                     0
preferred_asset_classes        0
savings_rate                   0
net_worth                      0
portfolio_value                0
equity_allocation_pct          0
fixed_income_allocation_pct    0
monthly_contribution           0
market_volatility_index        0
macroeconomic_score            0
sentiment_index                0
recommended_strategy           0
dtype: int64

In [5]:
merged_data.duplicated().sum()

np.int64(0)

In [6]:
merged_data["income_to_networth_ratio"] = merged_data["annual_income"] / (merged_data["net_worth"] + 1e-6)
merged_data["adjusted_debt_to_income"] = merged_data["debt_to_income_ratio"] * merged_data["annual_income"]
merged_data["investment_savings_ratio"] = merged_data["portfolio_value"] / (merged_data["savings_rate"] + 1e-6)

In [7]:
merged_data["annual_income"] = merged_data["annual_income"].clip(lower=0)
merged_data["net_worth"] = merged_data["net_worth"].clip(lower=0)

In [8]:
import numpy as np

merged_data["age_group"] = pd.cut(
    merged_data["age"], bins=[18, 35, 55, np.inf], labels=["Young", "Mid-age", "Senior"], include_lowest=True
)

merged_data["income_group"] = pd.cut(
    merged_data["annual_income"], bins=[0, 50000, 150000, np.inf], labels=["Low", "Medium", "High"], include_lowest=True
)

merged_data["net_worth_level"] = pd.cut(
    merged_data["net_worth"], bins=[0, 50000, 200000, np.inf], labels=["Poor", "Stable", "Wealthy"], include_lowest=True
)

In [9]:
merged_data["total_financial_score"] = (
    merged_data["financial_knowledge_score"] + 
    merged_data["macroeconomic_score"] + 
    merged_data["sentiment_index"]
)
merged_data["total_allocation_pct"] = merged_data["equity_allocation_pct"] + merged_data["fixed_income_allocation_pct"]

In [10]:
merged_data.columns

Index(['client_id', 'age', 'gender', 'employment_status', 'annual_income',
       'debt_to_income_ratio', 'financial_knowledge_score', 'investment_goals',
       'risk_appetite', 'investment_horizon_years', 'dependents',
       'preferred_asset_classes', 'savings_rate', 'net_worth',
       'portfolio_value', 'equity_allocation_pct',
       'fixed_income_allocation_pct', 'monthly_contribution',
       'market_volatility_index', 'macroeconomic_score', 'sentiment_index',
       'recommended_strategy', 'income_to_networth_ratio',
       'adjusted_debt_to_income', 'investment_savings_ratio', 'age_group',
       'income_group', 'net_worth_level', 'total_financial_score',
       'total_allocation_pct'],
      dtype='object')

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, f_classif
from sklearn.ensemble import RandomForestClassifier

# Handle list-type column (preferred_asset_classes)
merged_data["preferred_asset_classes"] = merged_data["preferred_asset_classes"].apply(
    lambda x: eval(x) if isinstance(x, str) else x
)
mlb = MultiLabelBinarizer()
one_hot_asset_classes = pd.DataFrame(mlb.fit_transform(merged_data["preferred_asset_classes"]), columns=mlb.classes_)
merged_data = pd.concat([merged_data.drop(columns=["preferred_asset_classes"]), one_hot_asset_classes], axis=1)

# Identify categorical and numerical features
categorical_cols = [
    "gender", "employment_status", "investment_goals", "risk_appetite",
    "age_group", "income_group", "net_worth_level"
] + list(mlb.classes_)

numerical_cols = [
    "age", "annual_income", "debt_to_income_ratio", "financial_knowledge_score",
    "investment_horizon_years", "savings_rate", "net_worth",
    "income_to_networth_ratio", "adjusted_debt_to_income", 'portfolio_value', 'equity_allocation_pct', 'fixed_income_allocation_pct', 
    'monthly_contribution', 'market_volatility_index', 'macroeconomic_score', 'sentiment_index', 'total_financial_score',
    'total_allocation_pct'
]

# Label encode categorical features (including target variable)
encoder = LabelEncoder()
for col in categorical_cols + ["recommended_strategy"]:
    merged_data[col] = encoder.fit_transform(merged_data[col])

# Define feature matrix and target variable
X = merged_data[categorical_cols + numerical_cols]
y = merged_data["recommended_strategy"]

# ---- Feature Selection ----

# 1️⃣ Chi-Square for Categorical Features
chi2_selector = SelectKBest(chi2, k="all")
chi2_selector.fit(merged_data[categorical_cols], y)
chi2_results = pd.DataFrame({
    "Feature": categorical_cols,
    "Chi2 Score": chi2_selector.scores_,
    "P-value": chi2_selector.pvalues_
}).sort_values(by="Chi2 Score", ascending=False)

# 2️⃣ ANOVA F-Test for Numerical Features
f_values, p_values = f_classif(merged_data[numerical_cols], y)
anova_results = pd.DataFrame({
    "Feature": numerical_cols,
    "F-Score": f_values,
    "P-value": p_values
}).sort_values(by="F-Score", ascending=False)

# 3️⃣ Mutual Information for All Features
mi_scores = mutual_info_classif(X, y, discrete_features="auto")
mi_results = pd.DataFrame({
    "Feature": X.columns,
    "MI Score": mi_scores
}).sort_values(by="MI Score", ascending=False)

# 4️⃣ Random Forest Feature Importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
rf_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Display Results
print("🔹 Chi-Square Feature Importance:\n", chi2_results)
print("\n🔹 ANOVA (F-Test) Feature Importance:\n", anova_results)
print("\n🔹 Mutual Information Feature Importance:\n", mi_results)
print("\n🔹 Random Forest Feature Importance:\n", rf_importance)

  f = msb / msw


🔹 Chi-Square Feature Importance:
               Feature  Chi2 Score   P-value
1   employment_status    4.053416  0.131769
4           age_group    1.337551  0.512336
2    investment_goals    0.837528  0.657859
7               Bonds    0.226241  0.893043
9        Mutual Funds    0.168171  0.919353
3       risk_appetite    0.149767  0.927852
8                ETFs    0.148987  0.928213
0              gender    0.134586  0.934921
11             Stocks    0.107243  0.947791
10        Real Estate    0.085335  0.958230
6     net_worth_level    0.062290  0.969335
5        income_group    0.002733  0.998634

🔹 ANOVA (F-Test) Feature Importance:
                         Feature   F-Score   P-value
5                  savings_rate  1.465720  0.230961
9               portfolio_value  1.245824  0.287748
6                     net_worth  1.223366  0.294282
0                           age  0.849076  0.427841
14          macroeconomic_score  0.805871  0.446728
3     financial_knowledge_score  0.549639  

In [14]:
# Choose top 10 from each method
top_chi2 = set(chi2_results.head(10)["Feature"])
top_anova = set(anova_results.head(10)["Feature"])
top_mi = set(mi_results.head(10)["Feature"])
top_rf = set(rf_importance.head(10)["Feature"])

# Union of top features (or intersection if you want stricter selection)
final_features = list(top_chi2.union(top_anova).union(top_mi).union(top_rf))

print("🔹 Final Selected Features:\n", final_features)

🔹 Final Selected Features:
 ['total_financial_score', 'monthly_contribution', 'risk_appetite', 'Bonds', 'ETFs', 'savings_rate', 'equity_allocation_pct', 'employment_status', 'annual_income', 'income_to_networth_ratio', 'financial_knowledge_score', 'age_group', 'Real Estate', 'age', 'portfolio_value', 'adjusted_debt_to_income', 'market_volatility_index', 'Stocks', 'Mutual Funds', 'macroeconomic_score', 'sentiment_index', 'income_group', 'fixed_income_allocation_pct', 'net_worth_level', 'investment_goals', 'net_worth', 'gender']


In [15]:
X = merged_data[final_features]
y = merged_data["recommended_strategy"]  # Replace with your actual target column

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
lr_preds = lr.predict(X_test_scaled)

print("📊 Logistic Regression Report:")
print("Accuracy:", accuracy_score(y_test, lr_preds))
print(classification_report(y_test, lr_preds))

📊 Logistic Regression Report:
Accuracy: 0.503
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       393
           1       0.50      1.00      0.67      1006
           2       0.00      0.00      0.00       601

    accuracy                           0.50      2000
   macro avg       0.17      0.33      0.22      2000
weighted avg       0.25      0.50      0.34      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

print("📊 XGBoost Report:")
print("Accuracy:", accuracy_score(y_test, xgb_preds))
print(classification_report(y_test, xgb_preds))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 XGBoost Report:
Accuracy: 0.4725
              precision    recall  f1-score   support

           0       0.22      0.05      0.09       393
           1       0.52      0.78      0.62      1006
           2       0.36      0.22      0.28       601

    accuracy                           0.47      2000
   macro avg       0.36      0.35      0.33      2000
weighted avg       0.41      0.47      0.41      2000



In [25]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)

y_pred_lgbm = lgbm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print("📊 LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgbm))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3172
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 27
[LightGBM] [Info] Start training from score -1.626457
[LightGBM] [Info] Start training from score -0.687911
[LightGBM] [Info] Start training from score -1.201476
Accuracy: 0.479
📊 LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.02      0.04       393
           1       0.51      0.88      0.64      1006
           2       0.32      0.11      0.17       601

    accuracy                           0.48      2000
   macro avg       0.35      0.34      0.28      2000
weighted avg       0.39      0.48      0.38      2000



In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

logistic_model = Pipeline([
    ('scale', StandardScaler()),  # Standardizing is important before polynomial
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('clf', LogisticRegression(max_iter=1000))
])

logistic_model.fit(X_train, y_train)

y_pred_log = logistic_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("📊 Logistic Regression (Poly Features) Report:")
print(classification_report(y_test, y_pred_log))

Accuracy: 0.4735
📊 Logistic Regression (Poly Features) Report:
              precision    recall  f1-score   support

           0       0.28      0.04      0.07       393
           1       0.50      0.86      0.64      1006
           2       0.29      0.10      0.15       601

    accuracy                           0.47      2000
   macro avg       0.36      0.34      0.28      2000
weighted avg       0.40      0.47      0.38      2000



In [27]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

rf_clf = RandomForestClassifier(random_state=42)
lgbm_clf = LGBMClassifier(random_state=42)
xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

voting_clf = VotingClassifier(estimators=[
    ('rf', rf_clf), ('lgbm', lgbm_clf), ('xgb', xgb_clf)
], voting='soft')

voting_clf.fit(X_train, y_train)

y_pred_vote = voting_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_vote))
print("📊 Voting Classifier Report:")
print(classification_report(y_test, y_pred_vote))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3172
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 27
[LightGBM] [Info] Start training from score -1.626457
[LightGBM] [Info] Start training from score -0.687911
[LightGBM] [Info] Start training from score -1.201476


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.489
📊 Voting Classifier Report:
              precision    recall  f1-score   support

           0       0.21      0.01      0.02       393
           1       0.51      0.90      0.65      1006
           2       0.34      0.11      0.17       601

    accuracy                           0.49      2000
   macro avg       0.35      0.34      0.28      2000
weighted avg       0.40      0.49      0.38      2000



In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

df_processed = merged_data.copy()

# Categorical feature lists
label_encode_cols = ["age_group", "income_group", "net_worth_level", "risk_appetite", "recommended_strategy"]  # Ordinal categorical
one_hot_encode_cols = ["investment_goals", "employment_status", "gender"]  # Nominal categorical
multi_label_cols = ["preferred_asset_classes"]  # Multi-label categorical

# Step 2: Label Encoding for Ordinal Categorical Features
encoders = {}
for col in label_encode_cols:
    encoder = LabelEncoder()
    df_processed[col + "_encoded"] = encoder.fit_transform(df_processed[col])
    df_processed.drop(columns=[col], inplace=True)  # Remove original column
    encoders[col] = encoder  # Store encoder for later use

# Step 3: One-Hot Encoding for Nominal Categorical Features
df_processed = pd.get_dummies(df_processed, columns=one_hot_encode_cols)

# Step 4: MultiLabel Binarization for Multi-Label Features
for col in multi_label_cols:
    df_processed[col] = df_processed[col].apply(eval)  # Convert string lists to real lists
    mlb = MultiLabelBinarizer()
    mlb_encoded = pd.DataFrame(mlb.fit_transform(df_processed[col]), columns=[col + "_" + c for c in mlb.classes_])
    
    df_processed = df_processed.join(mlb_encoded).drop(columns=[col])  # Drop original multi-label column

In [12]:
print(df_processed.columns)

Index(['client_id', 'age', 'annual_income', 'debt_to_income_ratio',
       'financial_knowledge_score', 'investment_horizon_years', 'dependents',
       'savings_rate', 'net_worth', 'portfolio_value', 'equity_allocation_pct',
       'fixed_income_allocation_pct', 'monthly_contribution',
       'market_volatility_index', 'macroeconomic_score', 'sentiment_index',
       'income_to_networth_ratio', 'adjusted_debt_to_income',
       'investment_savings_ratio', 'total_financial_score',
       'total_allocation_pct', 'age_group_encoded', 'income_group_encoded',
       'net_worth_level_encoded', 'risk_appetite_encoded',
       'recommended_strategy_encoded', 'investment_goals_Education',
       'investment_goals_Home Purchase', 'investment_goals_Retirement',
       'investment_goals_Wealth Accumulation', 'employment_status_Retired',
       'employment_status_Salaried', 'employment_status_Self-Employed',
       'employment_status_Unemployed', 'gender_Female', 'gender_Male',
       'gender_Othe

In [13]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

y = df_processed["recommended_strategy_encoded"]  # Target variable
X = df_processed.drop(columns=["client_id", "recommended_strategy", "recommended_strategy_encoded"], errors="ignore")

# Step 6: Standard Scaling
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [14]:
import numpy as np

# Compute correlation matrix
corr_matrix = X_scaled_df.corr().abs()

# Find highly correlated features (threshold = 0.85)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
correlated_features = [column for column in upper.columns if any(upper[column] > 0.85)]

# Drop highly correlated features
X_scaled_df.drop(columns=correlated_features, inplace=True)

print(f"Dropped {len(correlated_features)} highly correlated features.")

Dropped 2 highly correlated features.


In [16]:
from sklearn.feature_selection import mutual_info_classif

# Compute Mutual Information scores
mi_scores = mutual_info_classif(X_scaled_df, y)
mi_scores = pd.Series(mi_scores, index=X_scaled_df.columns).sort_values(ascending=False)

# Select top 20 most important features
top_features = mi_scores[:20].index

# Reduce dataset to selected features
X_selected = X_scaled_df[top_features]

print(f"Selected {len(top_features)} most important features.")
print(X_selected.columns)

Selected 20 most important features.
Index(['gender_Male', 'annual_income', 'preferred_asset_classes_ETFs',
       'equity_allocation_pct', 'net_worth_level_encoded',
       'investment_goals_Wealth Accumulation', 'monthly_contribution',
       'market_volatility_index', 'investment_goals_Retirement', 'dependents',
       'portfolio_value', 'investment_horizon_years',
       'preferred_asset_classes_Real Estate', 'gender_Other',
       'employment_status_Self-Employed', 'investment_goals_Education',
       'macroeconomic_score', 'total_allocation_pct',
       'preferred_asset_classes_Bonds', 'employment_status_Salaried'],
      dtype='object')


In [22]:
from sklearn.decomposition import PCA

# Keep top 10 principal components
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_selected)

print(f"Explained variance by top 10 components: {sum(pca.explained_variance_ratio_):.2f}")

Explained variance by top 10 components: 0.76


In [23]:
feature_importance = np.abs(pca.components_).sum(axis=0)
top_features_indices = np.argsort(feature_importance)[::-1][:10]  # Get top 10 indices
top_features = X_selected.columns[top_features_indices]  # Get feature names

In [24]:
# ✅ Create a new DataFrame with the top 10 selected features
X_final = X_selected[top_features]

# 📌 Display the final 10 selected features
print("Top 10 Features selected by PCA from MI-selected features:")
print(top_features.tolist())

Top 10 Features selected by PCA from MI-selected features:
['equity_allocation_pct', 'market_volatility_index', 'macroeconomic_score', 'monthly_contribution', 'investment_horizon_years', 'preferred_asset_classes_Real Estate', 'preferred_asset_classes_Bonds', 'annual_income', 'portfolio_value', 'preferred_asset_classes_ETFs']


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ✅ Split Data into Train & Test (80-20 Split)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Train a Model (Random Forest as an example)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# ✅ Evaluate Performance
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Final Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Final Model Accuracy: 0.4855
Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.02      0.03       393
           1       0.51      0.90      0.65      1006
           2       0.32      0.10      0.15       601

    accuracy                           0.49      2000
   macro avg       0.36      0.34      0.28      2000
weighted avg       0.40      0.49      0.38      2000



In [27]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5035

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       393
           1       0.50      1.00      0.67      1006
           2       0.29      0.00      0.01       601

    accuracy                           0.50      2000
   macro avg       0.26      0.33      0.23      2000
weighted avg       0.34      0.50      0.34      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# ✅ Apply PCA directly on the scaled data
pca = PCA(n_components=10)  # Keep top 10 principal components
X_pca = pca.fit_transform(X_scaled)

# ✅ Create a DataFrame for PCA-transformed data
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(10)])

# ✅ Split Data into Train & Test
X_train, X_test, y_train, y_test = train_test_split(X_pca_df, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Train a Model (Random Forest as an example)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# ✅ Evaluate Performance
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Final Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Final Model Accuracy: 0.4870
Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.01      0.01       393
           1       0.50      0.91      0.65      1006
           2       0.35      0.09      0.14       601

    accuracy                           0.49      2000
   macro avg       0.32      0.34      0.27      2000
weighted avg       0.38      0.49      0.37      2000



In [29]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5025

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       393
           1       0.50      0.99      0.67      1006
           2       0.37      0.01      0.02       601

    accuracy                           0.50      2000
   macro avg       0.29      0.33      0.23      2000
weighted avg       0.36      0.50      0.34      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
