# Loan Default Prediction: Baseline Credit Risk Model

# Phase 1: Setup & Load Data

In [1]:
# 1️⃣ Import libraries
import pandas as pd
import numpy as np

In [2]:
# Read the loan-default CSV into a DataFrame.
# The path is relative to the working directory; edit it if the file lives elsewhere.
data_path = "Loan_default.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Report the (rows, columns) tuple, then preview the first rows as the cell's output.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.head()

Dataset shape: (255347, 18)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


# Phase 2: Data Cleaning & Inspection

In [4]:
# List every column together with the dtype pandas assigned to it.
column_dtypes = df.dtypes
print("Columns and data types:")
print(column_dtypes)

Columns and data types:
LoanID             object
Age                 int64
Income              int64
LoanAmount          int64
CreditScore         int64
MonthsEmployed      int64
NumCreditLines      int64
InterestRate      float64
LoanTerm            int64
DTIRatio          float64
Education          object
EmploymentType     object
MaritalStatus      object
HasMortgage        object
HasDependents      object
LoanPurpose        object
HasCoSigner        object
Default             int64
dtype: object


In [5]:
# Count null entries column by column (isna performs the same check as isnull).
missing_counts = df.isna().sum()
print("\nMissing values per column:")
print(missing_counts)


Missing values per column:
LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
numeric_summary = df.describe()
print("\nDescriptive statistics:")
print(numeric_summary)


Descriptive statistics:
                 Age         Income     LoanAmount    CreditScore  \
count  255347.000000  255347.000000  255347.000000  255347.000000   
mean       43.498306   82499.304597  127578.865512     574.264346   
std        14.990258   38963.013729   70840.706142     158.903867   
min        18.000000   15000.000000    5000.000000     300.000000   
25%        31.000000   48825.500000   66156.000000     437.000000   
50%        43.000000   82466.000000  127556.000000     574.000000   
75%        56.000000  116219.000000  188985.000000     712.000000   
max        69.000000  149999.000000  249999.000000     849.000000   

       MonthsEmployed  NumCreditLines   InterestRate       LoanTerm  \
count   255347.000000   255347.000000  255347.000000  255347.000000   
mean        59.541976        2.501036      13.492773      36.025894   
std         34.643376        1.117018       6.636443      16.969330   
min          0.000000        1.000000       2.000000      12.000000  

In [7]:
# For every object-dtype column, print how often each distinct value occurs.
categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    value_frequencies = df[col].value_counts()
    print(f"\nUnique values in {col}:")
    print(value_frequencies)


Unique values in LoanID:
LoanID
I38PQUQS96    1
WGB0GD3150    1
XATSFC5YHN    1
CQXDW5VBAG    1
6AIVUNAJG8    1
             ..
ZBK3GDL2LI    1
SP7XY2LPYA    1
Q2DO8ENMV1    1
5EXD8N4MT4    1
ZTH91CGL0B    1
Name: count, Length: 255347, dtype: int64

Unique values in Education:
Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

Unique values in EmploymentType:
EmploymentType
Part-time        64161
Unemployed       63824
Self-employed    63706
Full-time        63656
Name: count, dtype: int64

Unique values in MaritalStatus:
MaritalStatus
Married     85302
Divorced    85033
Single      85012
Name: count, dtype: int64

Unique values in HasMortgage:
HasMortgage
Yes    127677
No     127670
Name: count, dtype: int64

Unique values in HasDependents:
HasDependents
Yes    127742
No     127605
Name: count, dtype: int64

Unique values in LoanPurpose:
LoanPurpose
Business     51298
Home         51286
Education    51005
Other   

# Phase 3: Feature Preprocessing

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [15]:
# The label column, plus every other column except the row identifier as model inputs.
target = "Default"
non_feature_columns = ("LoanID", target)
features = [name for name in df.columns if name not in non_feature_columns]

In [16]:
# Partition the input columns by dtype: int64/float64 are numeric, object is categorical.
feature_frame = df[features]
numeric_features = feature_frame.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = feature_frame.select_dtypes(include=["object"]).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Categorical features: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']


In [17]:
# Hold out 25% of the rows for testing. Stratifying on the label keeps the
# default/non-default ratio the same in both parts; the seed fixes the split.
y = df[target]
X = df[features]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [18]:
# Confirm the sizes of the two partitions.
train_shape, test_shape = X_train.shape, X_test.shape
print("Training set shape:", train_shape)
print("Test set shape:", test_shape)

Training set shape: (191510, 16)
Test set shape: (63837, 16)


In [19]:
# Preprocessing: standardise the int64/float64 columns of the training frame.
# NOTE(review): only the numeric columns are routed to a transformer. With
# remainder="drop" (ColumnTransformer's default, spelled out here) every other
# column -- including the object-dtype categorical ones -- is discarded before
# the classifier sees the data. Confirm this is intended.
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric_features)],
    remainder="drop",
)

print("\nData preprocessing setup complete.")


Data preprocessing setup complete.


# Phase 4: Baseline Credit Risk Model

In [26]:
# Baseline model: the preprocessing step followed by a logistic-regression classifier.
# The SAGA solver shuffles the training data, so random_state is pinned to make
# refits reproduce the same coefficients and metrics (same seed as the
# train/test split).
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(
        solver="saga",
        max_iter=200,
        n_jobs=-1,
        random_state=42,
    )),
])

In [27]:
# Fit the preprocessing step and the classifier on the training partition.
model.fit(X=X_train, y=y_train)

In [28]:
# Score the held-out rows: column 1 of predict_proba is the probability of
# label 1 (default); predict gives the hard 0/1 decision.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Share of test rows whose hard prediction matches the label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Ranking quality of the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC:", roc_auc)

# Per-class precision / recall / F1.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

# Rows are true labels, columns are predicted labels.
confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", confusion)

Accuracy: 0.8845497125491486
ROC-AUC: 0.7428975228870218

Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94     56424
           1       0.58      0.02      0.04      7413

    accuracy                           0.88     63837
   macro avg       0.73      0.51      0.49     63837
weighted avg       0.85      0.88      0.83     63837


Confusion Matrix:
 [[56315   109]
 [ 7261   152]]


In [30]:
# Rank the logistic-regression coefficients by absolute size.
# Feature labels are read from the fitted preprocessor rather than from a
# notebook-level list, so each label lines up with its coefficient even if the
# preprocessing step is changed later (e.g. more transformers are added).
fitted_preprocessor = model.named_steps['preprocessor']
# ColumnTransformer reports names as "<transformer>__<column>"; keep the column part.
feature_names = [
    name.split("__", 1)[-1] for name in fitted_preprocessor.get_feature_names_out()
]
coefficients = model.named_steps['classifier'].coef_[0]
feature_importance = pd.DataFrame({'feature': feature_names, 'coef': coefficients})
feature_importance = feature_importance.sort_values(by='coef', key=abs, ascending=False)
print(feature_importance)

          feature      coef
0             Age -0.577617
6    InterestRate  0.450590
1          Income -0.334381
4  MonthsEmployed -0.331852
2      LoanAmount  0.294375
3     CreditScore -0.121384
5  NumCreditLines  0.098376
8        DTIRatio  0.066645
7        LoanTerm  0.003271


In [34]:
from sklearn.model_selection import GridSearchCV

# Search over regularisation strength and solver for the classifier step,
# scoring each candidate by ROC-AUC under 5-fold cross-validation.
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["lbfgs", "saga"],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
).fit(X_train, y_train)
print(grid_search.best_params_)

{'classifier__C': 10, 'classifier__solver': 'lbfgs'}


In [35]:
import joblib

# Persist the fitted pipeline to disk so it can be reloaded without retraining.
# NOTE(review): this saves `model` (the baseline pipeline), not the estimator
# selected by the grid search -- confirm that is the intended artifact.
joblib.dump(value=model, filename='logistic_model.pkl')

['logistic_model.pkl']