In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader , TensorDataset


import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [5]:
# Resolve the project's `src` directory to an absolute path. The relative
# '../src' is interpreted against the current working directory, so this cell
# assumes the notebook is launched from a sibling folder of `src` — TODO confirm.
path_to_py = os.path.abspath('../src')
# Echo the resolved path so a wrong working directory is noticed immediately.
print(path_to_py)

/Users/gedelasnigda/loan_default/src


In [9]:
# Read the test set from <src>/data/testing_df.csv.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on individual columns.
test_csv_path = os.path.join(path_to_py, "data", "testing_df.csv")
test_df = pd.read_csv(test_csv_path, low_memory=False)

# (rows, columns) of the loaded frame
test_df.shape

(102505, 23)

In [10]:
# Peek at the first two rows to sanity-check the raw column formats
# (e.g. 'term' and 'int_rate' arrive as strings such as '36 months' / '22.15%').
test_df.head(2)

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,


In [11]:
# Column dtypes and non-null counts — shows which features need parsing
# (object dtype) and which contain missing values.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102505 entries, 0 to 102504
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           102505 non-null  int64  
 1   member_id                    102505 non-null  int64  
 2   loan_amnt                    102505 non-null  int64  
 3   term                         102505 non-null  object 
 4   int_rate                     102505 non-null  object 
 5   emp_length                   97184 non-null   object 
 6   home_ownership               102505 non-null  object 
 7   annual_inc                   102505 non-null  float64
 8   desc                         15194 non-null   object 
 9   purpose                      102505 non-null  object 
 10  percent_bc_gt_75             101459 non-null  float64
 11  bc_util                      101463 non-null  float64
 12  dti                          102505 non-null  float64
 13 

In [27]:
# Apply an already-fitted preprocessor to inference data
def prefitted_preprocessor(df, target, fitted_preprocessor):
    """
    Transform a feature frame with a preprocessor that was fitted elsewhere.

    The function does no fitting of its own: it removes the target column
    (when present) and delegates to ``fitted_preprocessor.transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data to transform. It is not modified.
    target : str
        Name of the label column to exclude from the features. Inference data
        frequently has no label, so a missing column is tolerated.
    fitted_preprocessor : object
        Any already-fitted object exposing ``transform(X)``.

    Returns
    -------
    Whatever ``fitted_preprocessor.transform`` returns for the feature frame.
    """
    # errors="ignore": unlabeled test data must not fail just because the
    # target column is absent.
    X = df.drop(columns=[target], errors="ignore")

    # Only transform (never fit) so test data is processed with training statistics.
    return fitted_preprocessor.transform(X)

In [16]:
def data_cleaning(df):
    """
    Select the model features, parse string-encoded numerics, impute missing
    values and add the engineered features ``active_acc`` and ``lti``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. Must contain every column listed in ``features`` below;
        ``total_bc_limit`` is used for ``active_acc`` when it is present.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``features`` (in that order) followed
        by ``active_acc`` and ``lti``.
    """
    # Final set of columns
    features = ['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'bc_util', 'dti', 'inq_last_6mths',
       'mths_since_recent_inq', 'revol_util', 'mths_since_last_major_derog',
       'tot_hi_cred_lim', 'tot_cur_bal', 'internal_score', 'bad_flag']

    # Keep a handle on the unfiltered input: 'total_bc_limit' is needed for
    # 'active_acc' but is deliberately not part of the returned feature set.
    raw = df

    # Select only the final set of features being used (copy so the caller's frame is untouched)
    cleaned = raw[features].copy()

    # --- Changing datatypes -------------------------------------------------
    # '36 months' -> 36
    cleaned['term'] = pd.to_numeric(
        cleaned['term'].str.replace(' months', '', regex=False), errors='coerce')

    # '22.15%' -> 22.15
    cleaned['int_rate'] = pd.to_numeric(
        cleaned['int_rate'].str.replace('%', '', regex=False), errors='coerce')

    # '< 1 year' -> 0, '10+ years' -> 10, 'N years' -> N
    emp_length_text = cleaned['emp_length'].replace(
        {'< 1 year': '0 years', '10+ years': '10 years'})
    # Raw string avoids the invalid '\d' escape; expand=False yields a Series.
    cleaned['emp_length'] = emp_length_text.str.extract(r'(\d+)', expand=False).astype(float)

    # '73.10%' -> 73.1
    cleaned['revol_util'] = pd.to_numeric(
        cleaned['revol_util'].str.replace('%', '', regex=False), errors='coerce')

    # --- New feature: Active Account ('active_acc') --------------------------
    # 0 when every balance/limit column is missing, 1 otherwise. The check is
    # run against the raw input because 'total_bc_limit' is not in `features`;
    # looking it up on the filtered frame raised KeyError whenever the first
    # two columns were both null.
    limit_cols = [col for col in ('tot_cur_bal', 'tot_hi_cred_lim', 'total_bc_limit')
                  if col in raw.columns]
    no_credit_info = raw[limit_cols].isnull().all(axis=1)
    cleaned['active_acc'] = (~no_credit_info).astype('int64')

    # --- Handling missing values ---------------------------------------------
    # No major derogatory event on record -> large sentinel
    cleaned['mths_since_last_major_derog'] = cleaned['mths_since_last_major_derog'].fillna(999)

    # Impute months-since-last-inquiry from the 6-month inquiry count:
    # 0 inquiries -> 99, 1 -> 6, 2 -> 3, 3 or more -> 1.
    # Rows matching none of the rules (e.g. count itself missing) stay NaN.
    inquiry_gap_missing = cleaned['mths_since_recent_inq'].isnull()
    inquiry_count = cleaned['inq_last_6mths']
    imputation_rules = (
        (inquiry_count == 0, 99),
        (inquiry_count == 1, 6),
        (inquiry_count == 2, 3),
        (inquiry_count >= 3, 1),
    )
    for rule_mask, fill_value in imputation_rules:
        cleaned.loc[inquiry_gap_missing & rule_mask, 'mths_since_recent_inq'] = fill_value

    cleaned['tot_cur_bal'] = cleaned['tot_cur_bal'].fillna(0)

    cleaned['bc_util'] = cleaned['bc_util'].fillna(0)

    # Hard-coded medians — presumably taken from the training data; verify
    # against the training notebook before changing.
    emp_length_median = 6
    # Assign back instead of chained `inplace=True`, which warns on pandas 2.x
    # and does nothing under Copy-on-Write.
    cleaned['emp_length'] = cleaned['emp_length'].fillna(emp_length_median)

    cleaned['tot_hi_cred_lim'] = cleaned['tot_hi_cred_lim'].fillna(0)

    revol_util_median = 60.5
    cleaned['revol_util'] = cleaned['revol_util'].fillna(revol_util_median)

    # --- New feature: loan-to-income ratio ('lti') ---------------------------
    # NOTE(review): an annual_inc of 0 produces inf here — confirm the
    # downstream preprocessor tolerates it.
    cleaned["lti"] = cleaned["loan_amnt"] / cleaned["annual_inc"]

    return cleaned

In [17]:
# Run the cleaning / feature-engineering step on the raw test frame.
# data_cleaning works on a copy, so `test_df` keeps its original columns.
clean_df = data_cleaning(test_df)

In [33]:
import joblib

# Restore the preprocessor saved under <src>/Model Results/preprocessor.pkl.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted run.
preprocessor_path = os.path.join(path_to_py, "Model Results", 'preprocessor.pkl')
fitted_preprocessor = joblib.load(preprocessor_path)
print("Preprocessor loaded successfully.")

# Display the loaded object as the cell output
fitted_preprocessor

Preprocessor loaded successfully.


In [32]:
# Name of the label column; it is dropped from the features before transforming.
target = 'bad_flag'
# Apply the already-fitted preprocessor (transform only, no re-fitting) to the cleaned test data.
X_transformed = prefitted_preprocessor(clean_df, target, fitted_preprocessor)

# Check the shape of X_transformed: (n_rows, n_model_input_features)
print("Shape of X_tranformer: ",X_transformed.shape)

Shape of X_tranformer:  (102505, 33)


In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoanDefaultModel(nn.Module):
    """
    Feed-forward binary classifier that outputs a probability per sample.

    Architecture: BatchNorm on the raw inputs, then for every entry of
    ``hidden_dims`` a ``Linear -> BatchNorm1d -> ReLU -> Dropout`` group, then a
    single-unit linear layer followed by a sigmoid.

    Submodule names (``input_bn``, ``hidden_layers``, ``output_layer``) and the
    layer order inside ``hidden_layers`` determine the state_dict keys, so they
    must not change if previously saved checkpoints are to keep loading.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dims : sequence of int, default (128, 64, 32)
        Width of each hidden layer. May be empty, in which case the output
        layer is connected directly to the normalised inputs.
    dropout : float, default 0.3
        Dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), dropout=0.3):
        super().__init__()

        # Input Layer with Batch Normalization
        self.input_bn = nn.BatchNorm1d(input_dim)

        # Create layers dynamically
        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        self.hidden_layers = nn.Sequential(*layers)

        # Output Layer. `prev_dim` is the width of the last hidden layer, or
        # `input_dim` when there are no hidden layers (hidden_dims[-1] would
        # raise IndexError in that case).
        self.output_layer = nn.Linear(prev_dim, 1)

    def forward(self, x):
        """Return sigmoid probabilities for a 2-D float batch ``x``; one output column per row."""
        # Input normalization
        x = self.input_bn(x)

        # Hidden layers
        x = self.hidden_layers(x)

        # Output layer with sigmoid activation — callers must NOT apply sigmoid again
        return torch.sigmoid(self.output_layer(x))

Using device: cpu


In [40]:
# Initialize the model with as many inputs as the preprocessor produced,
# instead of a hard-coded feature count. A mismatch with the checkpoint then
# fails loudly in load_state_dict below.
input_dim = X_transformed.shape[1]  # number of features
model = LoanDefaultModel(input_dim)

# Load the saved best_model weights into the model.
# map_location="cpu" lets a checkpoint saved on a GPU machine load on a
# CPU-only host; the model itself is created on the CPU above.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted run.
checkpoint_path = os.path.join(path_to_py, "Model Results", 'best_model.pth')
state_dict = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(state_dict)
model.eval()  # Set the model to evaluation mode for inference (freezes BatchNorm stats, disables Dropout)

print("Model loaded successfully.")


Model loaded successfully.


In [49]:
def get_predictions(model, X_transformed, threshold=0.5):
    """
    Run inference on the transformed test data and return hard class labels.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose forward pass already returns probabilities; no sigmoid
        is applied here. The model is switched to evaluation mode and left in
        that mode.
    X_transformed : array-like, shape (n_samples, n_features)
        Numeric feature matrix. Objects exposing ``toarray()`` (e.g. sparse
        matrices) are densified first.
    threshold : float, default 0.5
        Outputs greater than or equal to this value are labelled 1.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as the model output, holding 0/1 labels.
    """
    # Run on the device the model already lives on. Choosing CUDA just because
    # it is available would put the inputs on the GPU while the model stays on
    # the CPU, which raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Densify sparse preprocessor output; torch.tensor cannot consume it directly.
    if hasattr(X_transformed, "toarray"):
        X_transformed = X_transformed.toarray()

    # Convert the transformed data to a float32 tensor on the model's device
    X_test_tensor = torch.tensor(X_transformed, dtype=torch.float32, device=device)

    # Evaluation mode: BatchNorm uses running statistics and Dropout is disabled
    model.eval()
    with torch.no_grad():  # Disable gradient calculation to save memory and computation
        outputs = model(X_test_tensor)

    # Move back to the CPU and convert probabilities into binary classes
    return (outputs.cpu().numpy() >= threshold).astype(int)

# Score the whole transformed test set in one forward pass; the result is a
# column vector of 0/1 labels (one row per test record).
predictions = get_predictions(model, X_transformed)
print("Predictions:", predictions)


Predictions: [[1]
 [1]
 [0]
 ...
 [0]
 [1]
 [1]]


In [62]:
# Write the predicted labels into the target column of the test frame
# (this overwrites whatever the column held before).
test_df[target] = predictions

# Percentage of 1s and 0s among the predictions, rounded to two decimals
class_share = test_df[target].value_counts(normalize=True).multiply(100)
percentages = {label: round(share, 2) for label, share in class_share.items()}
percentages

{0: 66.75, 1: 33.25}

In [63]:
# Spot-check the first rows: the target column should now hold the predicted 0/1 labels.
test_df.head()

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,1
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,1
2,20000003,22398818,7000,36 months,12.49%,3 years,RENT,68900.0,,debt_consolidation,...,0,11.0,48.10%,11900,80.0,12688.49516,241465,1,92,0
3,20000004,22419015,18000,60 months,16.29%,9 years,MORTGAGE,41000.0,,debt_consolidation,...,1,0.0,38.10%,7600,73.0,7908.799817,179757,1,235,1
4,20000005,22388614,12000,36 months,12.99%,10+ years,MORTGAGE,64000.0,,home_improvement,...,0,,57.90%,21000,,19378.56106,31953,1,157,0


In [64]:
# Persist the test set together with the predicted labels.
# index=False: the frame carries only a default RangeIndex, so writing it would
# add an unnamed leading column that the input CSV did not have.
test_df.to_csv(os.path.join(path_to_py,"Model Results","test_predictions.csv"), index=False)
print("Saved the predictions successfully!")

Saved the predictions successfully!
