In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import re

In [3]:
# Load the data dictionary: one row per loan field with its description.
lending = pd.read_csv('lending_club_info.csv')

In [4]:
# Preview the first rows of the data dictionary.
lending.head()

Unnamed: 0,LoanStatNew,Description
0,loan_amnt,The listed amount of the loan applied for by t...
1,term,The number of payments on the loan. Values are...
2,int_rate,Interest Rate on the loan
3,installment,The monthly payment owed by the borrower if th...
4,grade,LC assigned loan grade


In [5]:
# Count missing entries in each column of the data dictionary.
lending.isna().sum()

LoanStatNew    0
Description    0
dtype: int64

In [6]:
# Load the loan-level records that the rest of the notebook cleans and models.
loan = pd.read_csv('lending_club_loan_two.csv')

In [6]:
# Preview the first rows of the loan records.
loan.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\nGreggshire, VA 11650"


In [7]:
# Count missing values per column to decide which fields need imputing.
loan.isna().sum()

loan_amnt                   0
term                        0
int_rate                    0
installment                 0
grade                       0
sub_grade                   0
emp_title               22927
emp_length              18301
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
title                    1756
dti                         0
earliest_cr_line            0
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                276
total_acc                   0
initial_list_status         0
application_type            0
mort_acc                37795
pub_rec_bankruptcies      535
address                     0
dtype: int64

In [8]:
# Replace missing job titles with an explicit 'Unknown' label.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained inplace form warns on pandas 2.x and no longer
# updates `loan` once Copy-on-Write is active.
loan['emp_title'] = loan['emp_title'].fillna('Unknown')

In [9]:
# Replace missing loan titles with an explicit 'Unknown' label.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained inplace form warns on pandas 2.x and no longer
# updates `loan` once Copy-on-Write is active.
loan['title'] = loan['title'].fillna('Unknown')

In [10]:
# Function to convert emp_length to numeric values
def convert(emp_length):
    """Convert an ``emp_length`` label to a number of years.

    Parameters
    ----------
    emp_length : str, number, or missing value
        A label such as ``'10+ years'``, ``'< 1 year'`` or ``'4 years'``.
        Numeric input is accepted so that applying the function to an
        already-converted column is harmless.

    Returns
    -------
    int, float, or the original missing value
        * missing input is returned unchanged;
        * numeric input is returned as ``float``;
        * ``'10+ years'`` gives ``10`` and ``'< 1 year'`` gives ``0.5``;
        * ``'<N> year(s)'`` gives ``float(N)``;
        * any other string gives ``np.nan``.
    """
    if pd.isna(emp_length):
        return emp_length

    # Values that are already numeric have no text to parse. Without this
    # guard, .strip() below raises AttributeError when the conversion cell is
    # re-run on a column that was converted earlier.
    if isinstance(emp_length, (int, float, np.integer, np.floating)):
        return float(emp_length)

    emp_length = emp_length.strip()
    if emp_length == '10+ years':
        return 10
    elif emp_length == '< 1 year':
        # "Less than one year" is encoded as half a year.
        return 0.5
    else:
        # Matches both the singular '1 year' and the plural 'N years'.
        match = re.match(r'(\d+) year', emp_length)
        if match:
            return float(match.group(1))
        return np.nan

In [11]:
# Turn each employment-length label into a number of years, element by element.
loan['emp_length'] = loan['emp_length'].map(convert)

In [12]:
# Fill missing employment lengths with the column median.
# Assigned back rather than fillna(inplace=True) on the selected column, which
# warns on pandas 2.x and stops updating `loan` under Copy-on-Write.
# NOTE(review): the median is taken over every row, i.e. before the train/test
# split further down, so held-out rows influence the fill value.
loan['emp_length'] = loan['emp_length'].fillna(loan['emp_length'].median())

In [13]:
# Fill missing mort_acc values with the column median.
# Assigned back rather than fillna(inplace=True) on the selected column, which
# warns on pandas 2.x and stops updating `loan` under Copy-on-Write.
loan['mort_acc'] = loan['mort_acc'].fillna(loan['mort_acc'].median())

In [14]:
# Fill missing revol_util values with the column median.
# Assigned back rather than fillna(inplace=True) on the selected column, which
# warns on pandas 2.x and stops updating `loan` under Copy-on-Write.
loan['revol_util'] = loan['revol_util'].fillna(loan['revol_util'].median())

In [15]:
# Fill missing pub_rec_bankruptcies values with the column median.
# Assigned back rather than fillna(inplace=True) on the selected column, which
# warns on pandas 2.x and stops updating `loan` under Copy-on-Write.
loan['pub_rec_bankruptcies'] = loan['pub_rec_bankruptcies'].fillna(
    loan['pub_rec_bankruptcies'].median()
)

In [16]:
# Re-count missing values per column to confirm the imputation left none behind.
loan.isna().sum()

loan_amnt               0
term                    0
int_rate                0
installment             0
grade                   0
sub_grade               0
emp_title               0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
issue_d                 0
loan_status             0
purpose                 0
title                   0
dti                     0
earliest_cr_line        0
open_acc                0
pub_rec                 0
revol_bal               0
revol_util              0
total_acc               0
initial_list_status     0
application_type        0
mort_acc                0
pub_rec_bankruptcies    0
address                 0
dtype: int64

In [17]:
# Inspect column dtypes to separate numeric fields from object (string) fields.
loan.dtypes

loan_amnt               float64
term                     object
int_rate                float64
installment             float64
grade                    object
sub_grade                object
emp_title                object
emp_length              float64
home_ownership           object
annual_inc              float64
verification_status      object
issue_d                  object
loan_status              object
purpose                  object
title                    object
dti                     float64
earliest_cr_line         object
open_acc                float64
pub_rec                 float64
revol_bal               float64
revol_util              float64
total_acc               float64
initial_list_status      object
application_type         object
mort_acc                float64
pub_rec_bankruptcies    float64
address                  object
dtype: object

In [18]:
# Columns passed to the one-hot encoder.
# NOTE(review): 'address' holds full street addresses in the preview above, so
# it presumably produces a near-unique category per row; confirm it is wanted.
categorical = [
    'term',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'issue_d',
    'purpose',
    'initial_list_status',
    'application_type',
    'earliest_cr_line',
    'address',
]

In [19]:
# Columns passed to the scaler.
numerical = [
    'loan_amnt',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'total_acc',
    'mort_acc',
    'emp_length',
    'revol_util',
    'pub_rec_bankruptcies',
]

In [20]:
# Target is the loan outcome; every other column stays as a candidate feature.
y = loan['loan_status']
x = loan.drop('loan_status', axis=1)

In [21]:
# One-hot encode categorical columns; a category not seen during fit is encoded
# as all zeros instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Standardise numeric columns.
numerical_transformer = StandardScaler()

In [22]:
# Route each column group to its transformer. Columns of x that appear in
# neither list (emp_title and title, for example) are discarded;
# remainder='drop' is the library default, written out here for clarity.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical),
        ('cat', categorical_transformer, categorical),
    ],
    remainder='drop',
)

In [23]:
# Depth-limited decision tree. random_state is fixed because scikit-learn
# permutes the features at every split, so equally good splits can otherwise be
# resolved differently from one run to the next.
model = DecisionTreeClassifier(max_depth=5, random_state=42)

In [24]:
# Chain preprocessing and the classifier so both are fitted on the same data.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]
)

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Fit the preprocessing steps and the tree on the training split only.
pipeline.fit(x_train, y_train)

In [27]:
# Predicted class labels for the held-out rows.
y_pred = pipeline.predict(x_test)

In [29]:
# Predicted probability of the class in column 1 of predict_proba.
# NOTE(review): column order follows pipeline.classes_; confirm that index 1 is
# the label roc_auc_score treats as positive.
y_pred_proba = pipeline.predict_proba(x_test)[:, 1]

In [30]:
# Share of held-out rows whose predicted label matches the true label.
# NOTE(review): the confusion matrix printed at the end of the notebook shows
# about 80% of test rows in one class, so this figure sits close to the
# majority-class baseline.
accuracy = accuracy_score(y_test, y_pred)

In [31]:
# Area under the ROC curve, computed from the predicted probabilities rather
# than the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)

In [32]:
# Report both evaluation metrics, one per line.
for metric_label, metric_value in (('Accuracy:', accuracy), ('ROC-AUC:', roc_auc)):
    print(metric_label, metric_value)

Accuracy: 0.804257253238391
ROC-AUC: 0.6934806169034133


In [33]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, y_pred)

In [34]:
# Print a heading, then let the notebook display the matrix as the cell's value.
print('Confusion Matrix:')
conf_matrix

Confusion Matrix:


array([[  435, 15142],
       [  362, 63267]], dtype=int64)