In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Configure logging once for the whole notebook: INFO and above, with each
# record prefixed by its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Calling getLogger() without a name returns the root logger configured above.
logger = logging.getLogger()

In [7]:
# Load the train and test frames from their pickle files (train first, then test).
# NOTE(review): unpickling can execute arbitrary code -- only load these files
# if they were produced by a trusted step of this project.
cleaned_train_df, cleaned_test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/cleaned_train.pkl', 'data/cleaned_test.pkl')
)

In [16]:
# Build the modelling inputs from the training frame:
#   y -- the 'default_status' column (the target)
#   X -- every remaining column except the 'Applicant_ID' identifier
y = cleaned_train_df['default_status']
X = cleaned_train_df.drop(['default_status', 'Applicant_ID'], axis='columns')
# Record the feature-matrix shape and the per-class row counts of the target.
logger.info(
    f"\nFeatures shape: {X.shape} \nTarget distribution:\n{y.value_counts()}"
)


2025-07-04 21:58:28,780 - INFO - 
Features shape: (56000, 50), 
Target distribution:
default_status
0    42285
1    13715
Name: count, dtype: int64


In [17]:
# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so that re-running the cell yields the same partition.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

logger.info(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

2025-07-04 22:06:15,204 - INFO - Train shape: (44800, 50), Validation shape: (11200, 50)


In [19]:
# Standardise the features for Logistic Regression. The scaler is fitted on
# the training split only; the validation split is transformed with those
# same fitted statistics so nothing from validation influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [20]:
# Fit a Logistic Regression classifier on the standardised training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)
logger.info("Logistic Regression model trained.")

2025-07-04 22:09:58,262 - INFO - Logistic Regression model trained.


In [21]:
# Score the validation rows. predict_proba returns one column per class;
# column index 1 is kept as the score for each row.
# NOTE(review): index 1 is the positive class (label 1) only if
# model.classes_ is [0, 1] -- confirm if the target encoding ever changes.
val_class_probs = model.predict_proba(X_val_scaled)
y_val_probs = val_class_probs[:, 1]

In [22]:
# Measure how well the predicted scores rank the validation rows (ROC AUC),
# then log the value to four decimal places.
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_probs)
logger.info(f"Validation ROC-AUC score: {roc_auc:.4f}")

2025-07-04 22:10:56,103 - INFO - Validation ROC-AUC score: 0.8315


In [24]:
# Turn scores into hard 0/1 labels with a 0.5 cut-off (a score of exactly 0.5
# counts as class 1), then log per-class precision / recall / F1.
y_val_preds = np.greater_equal(y_val_probs, 0.5).astype(int)
logger.info(
    "Classification report:\n"
    + classification_report(y_true=y_val, y_pred=y_val_preds)
)

2025-07-04 22:19:33,746 - INFO - Classification report:
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      8457
           1       0.65      0.42      0.51      2743

    accuracy                           0.80     11200
   macro avg       0.74      0.67      0.69     11200
weighted avg       0.79      0.80      0.79     11200



In [25]:
# Log the confusion matrix for the thresholded validation predictions.
# In scikit-learn's layout, rows are the true classes and columns are the
# predicted classes.
logger.info(
    f"Confusion Matrix:\n{confusion_matrix(y_true=y_val, y_pred=y_val_preds)}"
)

2025-07-04 22:19:38,450 - INFO - Confusion Matrix:
[[7838  619]
 [1593 1150]]
