### Given the dataset below, a model is to be created to determine whether a user returned his borrowed loan

In [2]:
#import the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#read the dataset into pandas dataframe and view the head of the data

df = pd.read_csv('loan_data')
df.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
#drop unnecessary columns

df.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
#confirm drop

df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [7]:
#check the summary of the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [8]:
#view the statistical summary of the dataset

df.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,0.80497,0.12264,319.089413,10.932117,12.606679,710.846314,4560.767197,16913.96,46.799236,1.577469,0.163708,0.062122,0.160054
std,0.396245,0.026847,207.071301,0.614813,6.88397,37.970537,2496.930377,33756.19,29.014417,2.200245,0.546215,0.262126,0.366676
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.77,10.558414,7.2125,682.0,2820.0,3187.0,22.6,0.0,0.0,0.0,0.0
50%,1.0,0.1221,268.95,10.928884,12.665,707.0,4139.958333,8596.0,46.3,1.0,0.0,0.0,0.0
75%,1.0,0.1407,432.7625,11.291293,17.95,737.0,5730.0,18249.5,70.9,2.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


In [9]:
#view the dataset columns

df.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

In [10]:
#create a list of the categorical features in the dataset

cat_feats = ['credit.policy', 'purpose','inq.last.6mths', 'delinq.2yrs', 'pub.rec']

In [11]:
#create dummies for the categorical features

final_data = pd.get_dummies(df,columns=cat_feats,drop_first=True)

In [12]:
#view the new dataframe with the dummies

final_data.head()

Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,not.fully.paid,credit.policy_1,...,delinq.2yrs_6,delinq.2yrs_7,delinq.2yrs_8,delinq.2yrs_11,delinq.2yrs_13,pub.rec_1,pub.rec_2,pub.rec_3,pub.rec_4,pub.rec_5
0,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#view the columns of the final dataframe

final_data.columns

Index(['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
       'days.with.cr.line', 'revol.bal', 'revol.util', 'not.fully.paid',
       'credit.policy_1', 'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business', 'inq.last.6mths_1',
       'inq.last.6mths_2', 'inq.last.6mths_3', 'inq.last.6mths_4',
       'inq.last.6mths_5', 'inq.last.6mths_6', 'inq.last.6mths_7',
       'inq.last.6mths_8', 'inq.last.6mths_9', 'inq.last.6mths_10',
       'inq.last.6mths_11', 'inq.last.6mths_12', 'inq.last.6mths_13',
       'inq.last.6mths_14', 'inq.last.6mths_15', 'inq.last.6mths_16',
       'inq.last.6mths_17', 'inq.last.6mths_18', 'inq.last.6mths_19',
       'inq.last.6mths_20', 'inq.last.6mths_24', 'inq.last.6mths_25',
       'inq.last.6mths_27', 'inq.last.6mths_28', 'inq.last.6mths_31',
       'inq.last.6mths_32', 'inq.last.6mths_33', 'delinq.2yrs_1',
       'delinq.2yrs_2', 'd

### Modelling

In [18]:
#import the train test split

from sklearn.model_selection import train_test_split 

In [20]:
#split the data set into the target variables and independent variables

X = final_data.drop('not.fully.paid',axis=1)
y = final_data['not.fully.paid']

In [35]:
#split the dataset into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #,random_state=42)

In [36]:
#import the Decision Tree Classifier model

from sklearn.tree import DecisionTreeClassifier

In [37]:
#instatiate the model

dtree = DecisionTreeClassifier()

In [38]:
#train the model with the training dataset

dtree.fit(X_train, y_train)

DecisionTreeClassifier()

In [39]:
#make predictions off the testing dataset

predictions = dtree.predict(X_test)

In [40]:
#import metrics to measaure the performance of the dataset

from sklearn.metrics import classification_report,confusion_matrix

In [41]:
#view the performance of the model using the classification report and confusion matrix

print(classification_report(y_test,predictions))
print('\n')
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      2411
           1       0.21      0.24      0.22       463

    accuracy                           0.73      2874
   macro avg       0.53      0.53      0.53      2874
weighted avg       0.75      0.73      0.74      2874



[[1995  416]
 [ 354  109]]


In [42]:
#import the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

In [59]:
#instatiate the model

rfc = RandomForestClassifier(n_estimators=500)

In [60]:
#train the model off the training dataset

rfc.fit(X_train,y_train)

RandomForestClassifier(n_estimators=500)

In [61]:
#make predictions with the model using the testing dataset

rfc_pred = rfc.predict(X_test)

In [62]:
#view the performance of the model using the classification report and confusion matrix

print(classification_report(y_test,rfc_pred))
print('\n')
print(confusion_matrix(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      2411
           1       0.57      0.02      0.03       463

    accuracy                           0.84      2874
   macro avg       0.71      0.51      0.47      2874
weighted avg       0.80      0.84      0.77      2874



[[2405    6]
 [ 455    8]]


##### comparing the performance of the two models, the RandomForestClassifier performed better than the DecisionTreeClassifier