In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Read the raw training CSV into a DataFrame and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../../data/raw/train.csv')
train.head(n=5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0


In [3]:
# Four features chosen for the initial logistic regression, based on the
# correlation heatmap.
features = ['credit_limit_used(%)', 'credit_score', 'prev_defaults', 'default_in_last_6months']
target = 'credit_card_default'

# Drop rows with a NaN in ANY selected feature or in the target, in one pass.
# Filtering features and target together guarantees X and y keep the same rows
# (previously only feature NaNs were removed, so a missing target could leak
# through to the stratified split and the model fit).
complete_rows = train.dropna(subset=features + [target])

X = complete_rows[features]
y = complete_rows[target]

# Sanity check: features and labels must stay row-aligned.
assert X.index.equals(y.index)

In [4]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Fit the baseline logistic regression on the training partition, allowing the
# solver up to 1000 iterations. fit() returns the estimator itself, so the
# construction and fit can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [6]:
# Score the validation partition: hard class labels feed the classification
# report and confusion matrix; the probability in column 1 of predict_proba
# feeds ROC-AUC.
y_val_prob = model.predict_proba(X_val)[:, 1]
y_val_pred = model.predict(X_val)
roc_auc = roc_auc_score(y_val, y_val_prob)

# Per-class precision / recall / F1
print(classification_report(y_val, y_val_pred))

# Confusion matrix (rows: true class, columns: predicted class)
print("Confusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")

# Threshold-independent ranking quality
print(f"ROC-AUC Score: {roc_auc:.3f}")

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8365
           1       0.96      0.77      0.85       739

    accuracy                           0.98      9104
   macro avg       0.97      0.88      0.92      9104
weighted avg       0.98      0.98      0.98      9104

Confusion Matrix:
[[8343   22]
 [ 172  567]]
ROC-AUC Score: 0.993


This is a very strong baseline logistic regression, achieved with no imputation or data augmentation. The result may be driven in part by the size of the dataset, which has over 45,000 rows. While precision is high for both the default and non-default classes, recall for actual credit card defaults (0.77) has the most room for improvement. With proper imputation of missing data and data augmentation to tackle the inherent class imbalance of the dataset, we will hopefully be able to improve these metrics and build an even stronger model.