In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Load your dataset
df = pd.read_excel("combined_delinquency_data.xlsx")  # Replace with your actual file

# Define features and target
X = df.drop("Delinquent_Account", axis=1)  # Replace with your actual target column name
y = df["Delinquent_Account"]

# Split data with stratify to keep class balance in train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# List categorical columns, including Month_1 to Month_6 if they have string status values
categorical_cols = [
    'Employment_Status', 'Credit_Card_Type', 'Location',
    'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6'
]

# Numerical columns are all others
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

# Pipeline with preprocessing, SMOTE, and Logistic Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss',random_state=42))
])

# Grid of parameters to search
param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}


# GridSearchCV setup
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fit GridSearch to training data
grid_search.fit(X_train, y_train)

# Best model evaluation on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters: {'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
Accuracy: 0.94375


In [4]:
y_pred_full = grid_search.best_estimator_.predict(X)

df_with_labels = df.copy()
df_with_labels['Risk Status'] = pd.Series(y_pred_full, index=df.index).map({0: 'Not at Risk', 1: 'At Risk'})

# ===== 8. Split into Two Files =====
df_at_risk = df_with_labels[df_with_labels['Risk Status'] == 'At Risk']
df_not_risk = df_with_labels[df_with_labels['Risk Status'] == 'Not at Risk']

df_at_risk.to_excel('at_risk_xgb.xlsx', index=False)
df_not_risk.to_excel('not_at_risk_xgb.xlsx', index=False)

print("\n✅ Files saved:")
print(" - at_risk_xgb.xlsx (people at risk)")
print(" - not_at_risk_xgb.xlsx (people not at risk)")


✅ Files saved:
 - at_risk_xgb.xlsx (people at risk)
 - not_at_risk_xgb.xlsx (people not at risk)


In [7]:
y_pred_full = grid_search.best_estimator_.predict(X)

# Add "Risk Status" column
df["Risk Status"] = ["At Risk" if p == 1 else "Not at Risk" for p in y_pred_full]

# Save single Excel file
df.to_excel("full_risk_labels_xgb.xlsx", index=False)
print("✅ Full dataset with risk labels saved to 'full_risk_labels.xlsx'")

✅ Full dataset with risk labels saved to 'full_risk_labels.xlsx'
