In [111]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import joblib


In [112]:
def show_distribution(name, labels):
    """Print a class-distribution table for a label Series.

    Parameters
    ----------
    name : str
        Heading text identifying the dataset or split being summarised.
    labels : pandas.Series
        Class labels; each distinct value becomes one row of the table.

    Returns
    -------
    None
        The table (absolute count and percentage share rounded to two
        decimals) is written to stdout.
    """
    class_counts = labels.value_counts()
    class_share = labels.value_counts(normalize=True).mul(100).round(2)

    # Both Series are indexed by class value, so the frame aligns them row-wise.
    summary = pd.DataFrame({"Count": class_counts, "Percentage (%)": class_share})

    print(f"\nüìä {name} Distribution")
    print(summary)

In [113]:
# Read the feature table that already carries the riskLabel column.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../data/synthetic/features_with_risklabel.csv")
print(f"\nüìÇ Loaded dataset: {df.shape[0]:,} records")


üìÇ Loaded dataset: 30,000 records


In [114]:
# Columns fed to the classifier. This same list is persisted with the model
# further down, so its order is the model's expected input order.
feature_cols = [
    "avgMonthlyIncome", "incomeCV", "expenseRatio",
    "emiRatio", "avgMonthlyBalance", "bounceCount",
    "accountAgeMonths", "PD", "anomalyFlag",
]

# Feature matrix and raw (string) target column.
X = df.loc[:, feature_cols]
y = df.loc[:, "riskLabel"]


In [115]:
# Label mapping: integer codes for the three risk classes.
label_mapping = {
    "LOW": 0,
    "MEDIUM": 1,
    "HIGH": 2
}

# Inverse lookup (code -> class name), used later to decode predictions.
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Encode from the raw column instead of from `y` itself so this cell is
# idempotent: re-running `y = y.map(...)` on an already-encoded `y` would
# turn every value into NaN.
raw_labels = df["riskLabel"]
y = raw_labels.map(label_mapping)

# Fail fast on labels outside the mapping (including missing values) rather
# than silently carrying NaN targets into the split and the model.
unmapped_labels = raw_labels[y.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped riskLabel values: {list(unmapped_labels)}")

show_distribution("FULL DATASET", y)



üìä FULL DATASET Distribution
           Count  Percentage (%)
riskLabel                       
0          13259           44.20
2          12303           41.01
1           4438           14.79


In [116]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the class balance of each partition, training set first.
for split_name, split_labels in (("TRAIN SET", y_train), ("TEST SET", y_test)):
    show_distribution(split_name, split_labels)



üìä TRAIN SET Distribution
           Count  Percentage (%)
riskLabel                       
0          10607           44.20
2           9842           41.01
1           3551           14.80

üìä TEST SET Distribution
           Count  Percentage (%)
riskLabel                       
0           2652           44.20
2           2461           41.02
1            887           14.78


In [117]:
print("TRAINING RANDOM FOREST...\n")

# Hyperparameters collected in one place: 300 trees, depth capped at 12,
# at least 50 samples per leaf, class weights balanced against class
# frequency, fixed seed, and all available cores.
rf_params = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=50,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

print("Training Complete!")


TRAINING RANDOM FOREST...

Training Complete!


In [118]:
print("\nüìä EVALUATION METRICS\n")


# Hard class predictions and per-class probabilities on the held-out split.
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

# Display names ordered like the model's own class codes. The original cell
# passed an undefined name `labels` here, which only worked through leftover
# kernel state and raises NameError on a fresh Restart & Run All.
class_codes = list(rf_model.classes_)
class_names = [reverse_label_mapping[code] for code in class_codes]

print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

print("\nConfusion Matrix:")
# Rows are true classes, columns are predicted classes, both in class_codes order.
cm = confusion_matrix(y_test, y_pred, labels=class_codes)
print(cm)

print("\n\nClassification Report:")
# Passing `labels` pins the row order so it always lines up with `target_names`.
print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))

# Multiclass AUC computed one-vs-rest from the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
print(f"ROC-AUC (One-vs-Rest): {roc_auc:.4f}")



üìä EVALUATION METRICS


Accuracy: 0.9853

Confusion Matrix:
[[2574   78    0]
 [   6  881    0]
 [   1    3 2457]]


Classification Report:
              precision    recall  f1-score   support

         LOW       1.00      0.97      0.98      2652
      MEDIUM       0.92      0.99      0.95       887
        HIGH       1.00      1.00      1.00      2461

    accuracy                           0.99      6000
   macro avg       0.97      0.99      0.98      6000
weighted avg       0.99      0.99      0.99      6000

ROC-AUC (One-vs-Rest): 0.9997


In [119]:
print("\nüîç FEATURE IMPORTANCE ANALYSIS\n")

# Pair each feature with the forest's importance score, highest first.
# NOTE(review): in the recorded run PD alone carries more than half of the
# total importance; confirm riskLabel is not derived from PD (possible leakage).
importances = (
    pd.DataFrame({"feature": feature_cols, "importance": rf_model.feature_importances_})
    .sort_values(by="importance", ascending=False)
)

print("\nTop features driving risk classification:\n")
for feat_name, feat_weight in zip(importances["feature"], importances["importance"]):
    # One bar glyph per 0.02 of importance (50 glyphs would be importance 1.0).
    bar = '‚ñà' * int(feat_weight * 50)
    print(f"  {feat_name:20} {feat_weight:.4f} {bar}")



üîç FEATURE IMPORTANCE ANALYSIS


Top features driving risk classification:

  PD                   0.5418 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  expenseRatio         0.1755 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  emiRatio             0.1196 ‚ñà‚ñà‚ñà‚ñà‚ñà
  incomeCV             0.0903 ‚ñà‚ñà‚ñà‚ñà
  anomalyFlag          0.0214 ‚ñà
  avgMonthlyIncome     0.0157 
  bounceCount          0.0149 
  accountAgeMonths     0.0109 
  avgMonthlyBalance    0.0098 


### 👉 Models generate probabilities, not just decisions.
The model predicts not only a risk class but also a probability for each class (LOW, MEDIUM, HIGH). When new data arrives, it applies the patterns learned during training and reports these probabilities, so each customer's classification comes with a measure of how confident the model is.

In [120]:
# Score every record in the dataset (training and test rows alike) with the
# fitted forest.
X_full = df[feature_cols].copy()

risk_probabilities = rf_model.predict_proba(X_full)
risk_pred_encoded = rf_model.predict(X_full)

# Decode integer class codes back to their string labels.
# NOTE(review): this list is not written to df in this cell, so the saved CSV
# only gets the probability columns; confirm that is intended.
risk_pred_labels = list(map(reverse_label_mapping.__getitem__, risk_pred_encoded))

# Column i of predict_proba is stored under the i-th name below. This relies
# on the model's class order being 0, 1, 2 (LOW, MEDIUM, HIGH per label_mapping).
for col_idx, col_name in enumerate(("prob_low", "prob_medium", "prob_high")):
    df[col_name] = risk_probabilities[:, col_idx]


In [121]:
# Persist the fitted model and the feature list it was trained on, printing a
# confirmation after each file is written.
model_artifacts = [
    (rf_model, "../models/risk_random_forest.joblib", "‚úì Saved: risk_random_forest.joblib"),
    (feature_cols, "../models/risk_features.joblib", "‚úì Saved: risk_features.joblib"),
]
for artifact, target_path, confirmation in model_artifacts:
    joblib.dump(artifact, target_path)
    print(confirmation)

# Note: this message is printed before the predictions CSV below is written.
print("\nAll artifacts saved successfully!")

# Write the dataframe, including the added probability columns, without the index.
df.to_csv("../data/synthetic/features_with_risk_predictions.csv", index=False)

print("\nRisk predictions saved successfully!")


‚úì Saved: risk_random_forest.joblib
‚úì Saved: risk_features.joblib

All artifacts saved successfully!

Risk predictions saved successfully!


In [None]:
# One hypothetical customer, with keys in the same order as feature_cols.
customer_profile = {
    'avgMonthlyIncome': 350000,   # INR 3.5 lakh per month
    'incomeCV': 0.25,             # some income variation
    'expenseRatio': 0.45,         # 45% of income spent
    'emiRatio': 0.18,             # 18% of income goes to EMI
    'avgMonthlyBalance': 95000,   # INR 95k average balance
    'bounceCount': 1,             # one bounced payment
    'accountAgeMonths': 24,       # account is two years old
    'PD': 0.08,                   # 8% probability of default
    'anomalyFlag': 0,             # no anomaly flagged
}
customer_data = pd.DataFrame([customer_profile])

# Class probabilities and hard prediction for the single row.
probabilities = rf_model.predict_proba(customer_data)[0]
prediction_encoded = rf_model.predict(customer_data)[0]
predicted_label = reverse_label_mapping[prediction_encoded]

# Report the decoded class, the per-class probabilities, and the top one.
print(f"\nüéØ Risk Level: {predicted_label}")
print("\nüìä Probabilities:")
print(f"   LOW:    {probabilities[0]:.2%}")
print(f"   MEDIUM: {probabilities[1]:.2%}")
print(f"   HIGH:   {probabilities[2]:.2%}")
print(f"\n‚úÖ Confidence: {probabilities.max():.2%}")


üéØ Risk Level: LOW

üìä Probabilities:
   LOW:    93.70%
   MEDIUM: 4.84%
   HIGH:   1.46%

‚úÖ Confidence: 93.70%
