In [1]:
import logging
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Configure root-level logging once for the whole notebook: INFO and above,
# each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# getLogger() with no name returns the root logger, which every cell below shares.
logger = logging.getLogger()

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: '@rpath/libomp.dylib'\n  Referenced from: '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib'\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


In [18]:
# Load the pickled train and test frames (train first, then test).
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# these pickles must come from a trusted source — confirm their provenance.
cleaned_train_df, cleaned_test_df = map(
    pd.read_pickle,
    ('../data/cleaned_train.pkl', '../data/cleaned_test.pkl'),
)

In [19]:
# Separate the label from the predictors: `default_status` is the target, and
# both it and the `Applicant_ID` identifier column are excluded from the features.
y = cleaned_train_df['default_status']
X = cleaned_train_df.drop(['Applicant_ID', 'default_status'], axis='columns')
# Log the feature-matrix shape and the per-class row counts of the target.
logger.info(f"\nFeatures shape: {X.shape} \nTarget distribution:\n{y.value_counts()}")


2025-07-05 00:09:35,116 - INFO - 
Features shape: (56000, 50) 
Target distribution:
default_status
0    42285
1    13715
Name: count, dtype: int64


In [20]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions aligned across both partitions, and the fixed seed makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

logger.info(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

2025-07-05 00:09:39,279 - INFO - Train shape: (44800, 50), Validation shape: (11200, 50)


In [21]:
# Standardise the features for Logistic Regression.
# The scaler is fitted on the training split only, and those same fitted
# parameters are then applied to both splits, so nothing is learned from
# the validation rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [22]:

# Fit the logistic-regression baseline on the scaled training split.
# max_iter is set to 1000 and the seed is pinned so the fit is reproducible.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled,
    y_train,
)
logger.info("Logistic Regression model trained.")

2025-07-05 00:09:42,979 - INFO - Logistic Regression model trained.


In [23]:
# Score the validation split. predict_proba returns one column per class;
# column index 1 is kept as the positive-class probability
# (assumes model.classes_ is [0, 1] — TODO confirm).
val_class_probs = model.predict_proba(X_val_scaled)
y_val_probs = val_class_probs[:, 1]

In [24]:
# Threshold-independent quality check: ROC AUC of the predicted
# probabilities against the true validation labels.
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_probs)
logger.info(f"Validation ROC-AUC score: {roc_auc:.4f}")

2025-07-05 00:10:01,962 - INFO - Validation ROC-AUC score: 0.8315


In [25]:
# Turn probabilities into hard 0/1 labels at a 0.5 cut-off (>= 0.5 maps to 1),
# then log per-class precision / recall / F1 for the validation split.
y_val_preds = np.where(y_val_probs >= 0.5, 1, 0)
logger.info(
    "Classification report:\n"
    + classification_report(y_true=y_val, y_pred=y_val_preds)
)

2025-07-05 00:10:05,250 - INFO - Classification report:
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      8457
           1       0.65      0.42      0.51      2743

    accuracy                           0.80     11200
   macro avg       0.74      0.67      0.69     11200
weighted avg       0.79      0.80      0.79     11200



In [26]:
# Log the confusion matrix for the thresholded validation predictions
# (rows are true labels, columns are predicted labels).
logger.info(
    f"Confusion Matrix:\n{confusion_matrix(y_true=y_val, y_pred=y_val_preds)}"
)

2025-07-05 00:10:08,273 - INFO - Confusion Matrix:
[[7838  619]
 [1593 1150]]


In [28]:
# Make sure the output directory exists before any predictions are written.
# `os` is imported in this cell because no earlier cell in the notebook imports
# it; without the import this cell raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
import os

# exist_ok=True makes the cell safe to re-run when the directory already exists.
os.makedirs('../outputs', exist_ok=True)

In [30]:
# Score the test dataset with the already-fitted scaler and model.
# Applicant_ID is removed before scaling and re-attached to the predictions below.
X_test = cleaned_test_df.drop('Applicant_ID', axis='columns')
X_test_scaled = scaler.transform(X_test)
# Keep column index 1 of predict_proba as the predicted probability for class 1
# (assumes model.classes_ is [0, 1] — TODO confirm).
predictions = model.predict_proba(X_test_scaled)[:, 1]

# Pair every applicant ID with its predicted probability and export as CSV
# without the DataFrame index.
result = pd.DataFrame(
    {
        'Applicant_ID': cleaned_test_df['Applicant_ID'],
        'default_status': predictions,
    }
)
logger.info("Predictions on test dataset completed.")
result.to_csv('../outputs/logistic_regression.csv', index=False)


2025-07-05 00:12:23,049 - INFO - Predictions on test dataset completed.
