In [3]:
import pandas as pd

df = pd.read_excel('combined_delinquency_data.xlsx')

In [4]:
df.head()

Unnamed: 0,Age,Income,Credit_Score,Credit_Utilization,Missed_Payments,Delinquent_Account,Loan_Balance,Debt_to_Income_Ratio,Employment_Status,Account_Tenure,Credit_Card_Type,Location,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6
0,56.0,165580.0,398.0,0.390502,3.0,0,16310.0,0.317396,EMP,18.0,Student,Los Angeles,Late,Late,Missed,Late,Missed,Late
1,69.0,100999.0,493.0,0.312444,6.0,1,17401.0,0.196093,Self-employed,0.0,Standard,Phoenix,Missed,Missed,Late,Missed,On-time,On-time
2,46.0,188416.0,500.0,0.35993,0.0,0,13761.0,0.301655,Self-employed,1.0,Platinum,Chicago,Missed,Late,Late,On-time,Missed,Late
3,32.0,101672.0,413.0,0.3714,3.0,0,88778.0,0.264794,Unemployed,15.0,Platinum,Phoenix,Late,Missed,Late,Missed,Late,Late
4,60.0,38524.0,487.0,0.234716,2.0,0,13316.0,0.510583,Self-employed,11.0,Standard,Phoenix,Missed,On-time,Missed,Late,Late,Late


In [5]:
df.tail()

Unnamed: 0,Age,Income,Credit_Score,Credit_Utilization,Missed_Payments,Delinquent_Account,Loan_Balance,Debt_to_Income_Ratio,Employment_Status,Account_Tenure,Credit_Card_Type,Location,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6
795,64.675789,103395.740892,820.84288,0.634702,0.0,1,11323.469859,0.439049,Unemployed,18.579385,Business,Phoenix,Late,Late,Late,Late,Late,On-time
796,49.075082,188139.804517,646.674075,0.53965,0.024283,1,74268.686238,0.413185,Employed,6.150152,Student,New York,Missed,Missed,On-time,Late,Missed,On-time
797,60.34344,78435.545754,416.185391,0.432,2.959652,1,59393.957036,0.203895,Unemployed,0.0,Business,New York,On-time,Late,Missed,On-time,On-time,On-time
798,50.376755,196337.739449,666.909137,0.743451,6.0,1,81303.841415,0.4526,Self-employed,9.987892,Platinum,Los Angeles,On-time,On-time,Missed,Missed,Late,Missed
799,48.004598,70728.638807,388.415219,0.657876,0.149148,1,99620.0,0.410453,retired,15.629085,Business,Houston,Missed,Late,On-time,Late,Late,On-time


In [6]:
df.isnull().sum()

Age                     0
Income                  0
Credit_Score            0
Credit_Utilization      0
Missed_Payments         0
Delinquent_Account      0
Loan_Balance            0
Debt_to_Income_Ratio    0
Employment_Status       0
Account_Tenure          0
Credit_Card_Type        0
Location                0
Month_1                 0
Month_2                 0
Month_3                 0
Month_4                 0
Month_5                 0
Month_6                 0
dtype: int64

In [7]:
df.columns

Index(['Age', 'Income', 'Credit_Score', 'Credit_Utilization',
       'Missed_Payments', 'Delinquent_Account', 'Loan_Balance',
       'Debt_to_Income_Ratio', 'Employment_Status', 'Account_Tenure',
       'Credit_Card_Type', 'Location', 'Month_1', 'Month_2', 'Month_3',
       'Month_4', 'Month_5', 'Month_6'],
      dtype='object')

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Load your dataset
df = pd.read_excel("combined_delinquency_data.xlsx")  # Replace with your actual file

# Define features and target
X = df.drop("Delinquent_Account", axis=1)  # Replace with your actual target column name
y = df["Delinquent_Account"]

# List categorical columns, including Month_1 to Month_6 if they have string status values
categorical_cols = [
    'Employment_Status', 'Credit_Card_Type', 'Location',
    'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6'
]

# Numerical columns are all others
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing pipelines for numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Pipeline with preprocessing, SMOTE, and Logistic Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Safe parameter grid — only compatible penalty and solver combos
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],  # liblinear supports l1 and l2
    'classifier__C': [0.1, 1, 10],
    'classifier__class_weight': [None, 'balanced']
}

# Split data with stratify to keep class balance in train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# GridSearchCV setup
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fit GridSearch to training data
grid_search.fit(X_train, y_train)

# Best model evaluation on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best hyperparameters: {'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Accuracy: 0.58125
Confusion Matrix:
 [[39 45]
 [22 54]]
              precision    recall  f1-score   support

           0       0.64      0.46      0.54        84
           1       0.55      0.71      0.62        76

    accuracy                           0.58       160
   macro avg       0.59      0.59      0.58       160
weighted avg       0.59      0.58      0.58       160

ROC AUC Score: 0.6242167919799498
