<a href="https://colab.research.google.com/github/TanviMhetre/Delinquency-of-Credit-Card-Holders/blob/main/Same_data_test_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# 🚀 Predictive Modeling for Delinquency using Logistic Regression + SMOTE

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ---------------------------
# Step 1: Load the Dataset
# ---------------------------
# Absolute path under /content; adjust it when running in another environment.
csv_path = "/content/Delinquency_prediction_dataset.csv"
data = pd.read_csv(csv_path)

# Quick sanity check: confirm the load, then show the dimensions and first rows.
print("✅ Data Loaded Successfully!")
print("Shape of dataset:", data.shape)
preview = data.head()
print("\nPreview of dataset:\n", preview)

# ---------------------------
# Step 2: Handle Missing Values (if any)
# ---------------------------
# Discard every row that has a missing value in any column
# (imputation is the alternative if too many rows would be lost).
data = data.dropna(axis=0, how="any")

# ---------------------------
# Step 3: Define Features and Target
# ---------------------------
# Name of the label column the model predicts.
target_col = 'Delinquent_Account'
# Columns left out of the feature matrix altogether: the drop below removes
# them, so they are neither scaled nor passed to the model.
categorical_cols = [
    'Customer_ID',
    'Employment_Status',
    'Credit_Card_Type',
    'Location',
    'Month_1',
    'Month_2',
    'Month_3',
    'Month_4',
    'Month_5',
    'Month_6',
]
# Everything that is not a feature: the label plus the excluded columns.
non_feature_cols = [target_col, *categorical_cols]
X = data.drop(columns=non_feature_cols)
y = data[target_col]

# ---------------------------
# Step 4: Shuffle the Dataset
# ---------------------------
# Reorder features and labels in one call so their rows stay aligned;
# the fixed seed makes the ordering repeatable.
X, y = shuffle(X, y, random_state=42)

# ---------------------------
# Step 5: Feature Scaling
# ---------------------------
# Fit the scaler on X first, then apply the fitted scaler to the same X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# ---------------------------
# Step 6: Apply SMOTE for Class Balancing
# ---------------------------
# Resample the scaled features and their labels; the seed keeps the
# generated rows repeatable between runs.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Show how many rows each class has after resampling.
class_counts = y_resampled.value_counts()
print("\nClass distribution after SMOTE:")
print(class_counts)

# ---------------------------
# Step 7: Logistic Regression Model
# ---------------------------
# Train on the SMOTE-resampled data produced in Step 6.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_resampled, y_resampled)

# ---------------------------
# Step 8: Predict on the Same (Shuffled) Data
# ---------------------------
# Predictions for every training row, SMOTE-generated rows included.
y_pred = model.predict(X_resampled)
# Predictions for the real observations only (the scaled rows fed into SMOTE).
y_pred_original = model.predict(X_scaled)

# ---------------------------
# Step 9: Model Evaluation
# ---------------------------
def report_metrics(title, y_true, y_hat):
    """Print the standard classification metrics for one set of predictions.

    Args:
        title: Heading printed before the metrics.
        y_true: Ground-truth labels.
        y_hat: Predicted labels, aligned row-for-row with ``y_true``.
    """
    print(title)
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print("Precision:", precision_score(y_true, y_hat))
    print("Recall:", recall_score(y_true, y_hat))
    print("F1 Score:", f1_score(y_true, y_hat))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_hat))
    print("\nClassification Report:\n", classification_report(y_true, y_hat))


# NOTE: both reports below are in-sample (the model has seen these rows), so
# they are optimistic and do not estimate performance on unseen customers.
# The first report also scores SMOTE-generated rows; the second is restricted
# to the real observations with their original class balance.
report_metrics("\nModel Performance on Same Data:", y_resampled, y_pred)
report_metrics(
    "\nModel Performance on Original (Non-Synthetic) Data:",
    y,
    y_pred_original,
)

✅ Data Loaded Successfully!
Shape of dataset: (500, 19)

Preview of dataset:
   Customer_ID  Age    Income  Credit_Score  Credit_Utilization  \
0    CUST0001   56  165580.0         398.0            0.390502   
1    CUST0002   69  100999.0         493.0            0.312444   
2    CUST0003   46  188416.0         500.0            0.359930   
3    CUST0004   32  101672.0         413.0            0.371400   
4    CUST0005   60   38524.0         487.0            0.234716   

   Missed_Payments  Delinquent_Account  Loan_Balance  Debt_to_Income_Ratio  \
0                3                   0       16310.0              0.317396   
1                6                   1       17401.0              0.196093   
2                0                   0       13761.0              0.301655   
3                3                   0       88778.0              0.264794   
4                2                   0       13316.0              0.510583   

  Employment_Status  Account_Tenure Credit_Card_Type    