In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Read the combined delinquency workbook into a DataFrame
df = pd.read_excel("combined_delinquency_data.xlsx")  # Replace with your actual file

# Separate the label column from the predictors
y = df["Delinquent_Account"]
X = df.drop(columns="Delinquent_Account")  # Replace with your actual target column name

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Columns that are one-hot encoded.
# Month_1..Month_6 are listed here on the assumption that they hold string
# status values rather than numbers -- TODO confirm against the workbook.
categorical_cols = [
    'Employment_Status', 'Credit_Card_Type', 'Location',
    'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6'
]

# Every other feature column is treated as numerical and standardised.
# NOTE(review): any identifier or free-text column left in X would also land
# here and be passed to StandardScaler -- verify X contains only numeric extras.
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore' encodes a category that was not present when
        # the encoder was fitted as an all-zero row instead of raising. Without
        # it, a rare value that only shows up in a CV validation fold, the test
        # split, or later scoring data aborts the transform with a ValueError.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Pipeline with preprocessing, SMOTE oversampling, and a Random Forest classifier.
# The imblearn Pipeline is used (not sklearn's) so the SMOTE step is accepted as
# a pipeline stage; the 'rf' step name is what the 'rf__*' grid keys refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Candidate random-forest settings; the 'rf__' prefix addresses the 'rf' step
# of the pipeline
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
}

# Exhaustive search over the grid, scored by ROC AUC with 5-fold
# cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Score the held-out test split with the best pipeline found by the search
best_model = grid_search.best_estimator_
y_prob = best_model.predict_proba(X_test)[:, 1]  # second probability column
y_pred = best_model.predict(X_test)

# Compute every metric up front, then report them in a fixed order
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print(test_report)
print("ROC AUC Score:", test_auc)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best hyperparameters: {'rf__max_depth': 20, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Accuracy: 0.99375
Confusion Matrix:
 [[84  0]
 [ 1 75]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        84
           1       1.00      0.99      0.99        76

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160

ROC AUC Score: 0.9939692982456141


In [4]:
# ===== Predict Risk for Full Dataset =====
# NOTE: X contains the rows the model was fitted on as well as the held-out
# rows, so these labels are for tagging the dataset, not for measuring accuracy.
y_pred_full = grid_search.best_estimator_.predict(X)  # X is full feature set

# Add the predictions to a copy of the original DataFrame.
# The Series is given df's own index so the labels line up row-for-row; a bare
# pd.Series(...) gets a fresh 0..n-1 index and column assignment aligns on
# index labels, which would silently produce NaN for any non-default index.
df_with_labels = df.copy()
df_with_labels['Risk Status'] = (
    pd.Series(y_pred_full, index=df.index)
    .map({0: 'Not at Risk', 1: 'At Risk'})
)

# Save to Excel; the path is defined once so the message below always matches
# the file that was actually written
output_path = 'full_dataset_with_risk_status_rcf.xlsx'
df_with_labels.to_excel(output_path, index=False)

print(f"✅ Full dataset with risk status saved to '{output_path}'")


✅ Full dataset with risk status saved to 'full_dataset_with_risk_status.xlsx'


In [5]:
import pandas as pd

# ===== Predict Risk for Full Dataset =====
# NOTE: X contains the rows the model was fitted on as well as the held-out
# rows, so these labels are for tagging the dataset, not for measuring accuracy.
y_pred_full = grid_search.best_estimator_.predict(X)  # X = full features

# Add risk status to a copy of the DataFrame.
# The Series carries df's own index so the labels line up row-for-row; a bare
# pd.Series(...) gets a fresh 0..n-1 index and column assignment aligns on
# index labels, which would silently produce NaN for any non-default index.
df_with_labels = df.copy()
df_with_labels['Risk Status'] = (
    pd.Series(y_pred_full, index=df.index)
    .map({0: 'Not at Risk', 1: 'At Risk'})
)

# ===== Split into two DataFrames =====
# One boolean mask per label, each evaluated once
is_at_risk = df_with_labels['Risk Status'] == 'At Risk'
is_not_at_risk = df_with_labels['Risk Status'] == 'Not at Risk'
df_at_risk = df_with_labels[is_at_risk]
df_not_risk = df_with_labels[is_not_at_risk]

# ===== Save to separate Excel files =====
df_at_risk.to_excel('at_risk.xlsx', index=False)
df_not_risk.to_excel('not_at_risk.xlsx', index=False)

print("✅ Files saved:")
print(" - at_risk.xlsx (people at risk)")
print(" - not_at_risk.xlsx (people not at risk)")


✅ Files saved:
 - at_risk.xlsx (people at risk)
 - not_at_risk.xlsx (people not at risk)
