<a href="https://colab.research.google.com/github/TanviMhetre/Delinquency-of-Credit-Card-Holders/blob/main/Delinquency_of_Credit_Card_Holders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
)

# 1. Load dataset
# The CSV is read from an absolute /content path; change it when the file lives elsewhere.
df = pd.read_csv("/content/Delinquency_prediction_dataset.csv")

# Quick peek: overall shape plus the first three records
print("Rows, cols:", df.shape)
print(df.head(3))

# 2. Select features (the columns fed to the model, in this order)
features = ["Credit_Utilization", "Missed_Payments", "Income"]
features += ["Debt_to_Income_Ratio", "Account_Tenure"]
X = df.loc[:, features].copy()

# 3. Define target variable
y = df.loc[:, "Delinquent_Account"].copy()

# Basic target cleaning: ensure a binary integer (0/1) target.
# - Numeric targets are coerced to int, with missing values treated as 0.
# - Text targets that are really numbers ("1"/"0") are converted the same way.
# - Text targets using common yes/no spellings go through an explicit label
#   table, so the positive class is always 1 no matter which label appears
#   first in the file, and missing values become 0 instead of a third class.
# - Any other text labels fall back to factorize (codes in order of first
#   appearance, missing treated as its own label), keeping the original index.
if pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(y, errors="coerce").fillna(0).astype(int)
else:
    y_numeric = pd.to_numeric(y, errors="coerce")
    if y_numeric.isna().all():
        unique_vals = y.dropna().unique().tolist()
        print("Target unique values:", unique_vals)

        label_map = {"yes": 1, "y": 1, "true": 1, "no": 0, "n": 0, "false": 0}
        # Normalise case/whitespace so "Yes", " yes " and "YES" are the same label.
        text = y.astype(str).str.strip().str.casefold()
        recognised = text.isin(list(label_map)) | y.isna()
        if recognised.all():
            # Unmapped entries here can only be missing values -> 0.
            y = text.map(label_map).fillna(0).astype(int)
        else:
            codes, _ = pd.factorize(y.fillna("MISSING"))
            # Keep the original index/name so y stays aligned with X.
            y = pd.Series(codes, index=y.index, name=y.name)
    else:
        y = y_numeric.fillna(0).astype(int)

# Check distribution of target
print("Target value counts:\n", y.value_counts())

# 4. Split into train/test sets FIRST, so that the preprocessing statistics
# (medians, means, standard deviations) are learned from training rows only.
# Fitting the imputer/scaler on the full dataset would leak test-set
# information into the model inputs and make the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Preprocessing: handle missing values and scaling
# Impute numeric features with the training-set median.
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(X_train_raw)
X_imputed = pd.DataFrame(num_imputer.transform(X), columns=features, index=X.index)

# Scale features (recommended for regularized logistic regression), again
# using statistics computed on the training rows only.
scaler = StandardScaler()
scaler.fit(X_imputed.loc[X_train_raw.index])
X_scaled = pd.DataFrame(scaler.transform(X_imputed), columns=features, index=X.index)

# Pick the preprocessed rows belonging to each side of the split.
X_train = X_scaled.loc[X_train_raw.index]
X_test = X_scaled.loc[X_test_raw.index]

# 6. Fit a logistic regression on the training split
model = LogisticRegression(solver="liblinear", random_state=42)
model.fit(X_train, y_train)


# 7. Predict and evaluate
y_pred = model.predict(X_test)
# Positive-class probabilities, when the estimator exposes them
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(X_test)[:, 1]
else:
    y_proba = None

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
cm = confusion_matrix(y_test, y_pred)

print("\nEvaluation on test set:")
for _label, _score in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1-score", f1),
):
    print(f"{_label}: {_score:.4f}")
print("Confusion matrix:")
print(cm)
print("\nClassification report:")
print(classification_report(y_test, y_pred, zero_division=0))

# ROC AUC is best-effort: report the reason instead of stopping if it fails.
if y_proba is not None:
    try:
        roc_auc = roc_auc_score(y_test, y_proba)
    except Exception as err:
        print("Could not compute ROC AUC:", err)
    else:
        print(f"ROC AUC: {roc_auc:.4f}")

# Optional: feature coefficients, largest absolute value first
coef_df = pd.DataFrame({"feature": features, "coefficient": model.coef_.ravel()})
coef_df = coef_df.sort_values(by="coefficient", key=lambda col: col.abs(), ascending=False)
print("\nFeature coefficients:")
print(coef_df)

# Context for the metrics above: test-set size and share of positives
print("\nTest set size:", len(y_test))
print("Positive class rate in test set:", y_test.mean())

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
)

# 1. Load dataset
# The CSV is read from an absolute /content path; change it when the file lives elsewhere.
df = pd.read_csv("/content/Delinquency_prediction_dataset.csv")

# Quick peek: overall shape plus the first three records
print("Rows, cols:", df.shape)
print(df.head(3))

# 2. Select features (the columns fed to the model, in this order)
features = ["Credit_Utilization", "Missed_Payments", "Income"]
features += ["Debt_to_Income_Ratio", "Account_Tenure"]
X = df.loc[:, features].copy()

# 3. Define target variable
y = df.loc[:, "Delinquent_Account"].copy()

# Basic target cleaning: ensure a binary integer (0/1) target.
# - Numeric targets are coerced to int, with missing values treated as 0.
# - Text targets that are really numbers ("1"/"0") are converted the same way.
# - Text targets using common yes/no spellings go through an explicit label
#   table, so the positive class is always 1 no matter which label appears
#   first in the file, and missing values become 0 instead of a third class.
# - Any other text labels fall back to factorize (codes in order of first
#   appearance, missing treated as its own label), keeping the original index.
if pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(y, errors="coerce").fillna(0).astype(int)
else:
    y_numeric = pd.to_numeric(y, errors="coerce")
    if y_numeric.isna().all():
        unique_vals = y.dropna().unique().tolist()
        print("Target unique values:", unique_vals)

        label_map = {"yes": 1, "y": 1, "true": 1, "no": 0, "n": 0, "false": 0}
        # Normalise case/whitespace so "Yes", " yes " and "YES" are the same label.
        text = y.astype(str).str.strip().str.casefold()
        recognised = text.isin(list(label_map)) | y.isna()
        if recognised.all():
            # Unmapped entries here can only be missing values -> 0.
            y = text.map(label_map).fillna(0).astype(int)
        else:
            codes, _ = pd.factorize(y.fillna("MISSING"))
            # Keep the original index/name so y stays aligned with X.
            y = pd.Series(codes, index=y.index, name=y.name)
    else:
        y = y_numeric.fillna(0).astype(int)

# Check distribution of target
print("Target value counts:\n", y.value_counts())

# 4. Split into train/test sets FIRST.
# Everything that learns from data (imputer, scaler, SMOTE, model) must see
# training rows only; the test rows stay untouched so the evaluation reflects
# unseen, naturally imbalanced data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Preprocessing: handle missing values and scaling
# Impute numeric features with the training-set median.
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(X_train_raw)
X_imputed = pd.DataFrame(num_imputer.transform(X), columns=features, index=X.index)

# Scale features (recommended for regularized logistic regression), again
# using statistics computed on the training rows only.
scaler = StandardScaler()
scaler.fit(X_imputed.loc[X_train_raw.index])
X_scaled = pd.DataFrame(scaler.transform(X_imputed), columns=features, index=X.index)

# Pick the preprocessed rows belonging to each side of the split.
X_train = X_scaled.loc[X_train_raw.index]
X_test = X_scaled.loc[X_test_raw.index]

# Apply SMOTE to balance the TRAINING data only (never the test set).
# This must run after the split above: it needs this cell's X_train/y_train
# rather than whatever an earlier cell may have left in the kernel.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_res.value_counts())

# 6. Fit logistic regression model on resampled data
model = LogisticRegression(solver="liblinear", random_state=42)
model.fit(X_res, y_res)



# 7. Predict and evaluate
y_pred = model.predict(X_test)
# Positive-class probabilities, when the estimator exposes them
y_proba = None
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
cm = confusion_matrix(y_test, y_pred)

# Emit the whole report in one call; each item lands on its own line.
print(
    "\nEvaluation on test set:",
    f"Accuracy: {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall: {rec:.4f}",
    f"F1-score: {f1:.4f}",
    "Confusion matrix:",
    cm,
    "\nClassification report:",
    classification_report(y_test, y_pred, zero_division=0),
    sep="\n",
)

# ROC AUC is best-effort: report the reason instead of stopping if it fails.
if y_proba is not None:
    try:
        roc_auc = roc_auc_score(y_test, y_proba)
    except Exception as err:
        print("Could not compute ROC AUC:", err)
    else:
        print(f"ROC AUC: {roc_auc:.4f}")

# Optional: feature coefficients, largest absolute value first
coef_df = pd.DataFrame({"feature": features, "coefficient": model.coef_.ravel()})
coef_df = coef_df.sort_values(by="coefficient", key=lambda col: col.abs(), ascending=False)
print("\nFeature coefficients:")
print(coef_df)

# Context for the metrics above: test-set size and share of positives
print("\nTest set size:", len(y_test))
print("Positive class rate in test set:", y_test.mean())

Rows, cols: (500, 19)
  Customer_ID  Age    Income  Credit_Score  Credit_Utilization  \
0    CUST0001   56  165580.0         398.0            0.390502   
1    CUST0002   69  100999.0         493.0            0.312444   
2    CUST0003   46  188416.0         500.0            0.359930   

   Missed_Payments  Delinquent_Account  Loan_Balance  Debt_to_Income_Ratio  \
0                3                   0       16310.0              0.317396   
1                6                   1       17401.0              0.196093   
2                0                   0       13761.0              0.301655   

  Employment_Status  Account_Tenure Credit_Card_Type     Location Month_1  \
0               EMP              18          Student  Los Angeles    Late   
1     Self-employed               0         Standard      Phoenix  Missed   
2     Self-employed               1         Platinum      Chicago  Missed   

  Month_2 Month_3  Month_4  Month_5  Month_6  
0    Late  Missed     Late   Missed     Late