In [1]:
%matplotlib inline

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer  #DT does not take strings as input for the model fit step....

In [5]:
credit_df = pd.read_csv("credit.csv")

In [6]:
credit_df.info()  # many columns are of type object i.e. strings. These need to be converted to ordinal type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


In [7]:
# Decision tree in Python can take only numerical / categorical colums. It cannot take string / obeject types. 
# The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for feature in credit_df.columns: # Loop through all columns in the dataframe
    if credit_df[feature].dtype == 'object': # Only apply for columns with categorical strings
        credit_df[feature] = pd.Categorical(credit_df[feature]).codes # Replace strings with an integer

In [8]:
credit_df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,1,6,0,4,1169,4,3,4,4,67,1,1,2,1,1,1,0
1,0,48,1,4,5951,2,0,2,2,22,1,1,1,1,1,0,1
2,3,12,0,3,2096,2,1,2,3,49,1,1,1,3,2,0,0
3,1,42,1,4,7882,2,1,2,4,45,1,0,1,1,2,0,0
4,1,24,3,1,4870,2,0,3,4,53,1,0,2,1,2,0,1


In [22]:
credit_df.info()
credit_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
checking_balance        1000 non-null int8
months_loan_duration    1000 non-null int64
credit_history          1000 non-null int8
purpose                 1000 non-null int8
amount                  1000 non-null int64
savings_balance         1000 non-null int8
employment_duration     1000 non-null int8
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null int8
housing                 1000 non-null int8
existing_loans_count    1000 non-null int64
job                     1000 non-null int8
dependents              1000 non-null int64
phone                   1000 non-null int8
dtypes: int64(7), int8(9)
memory usage: 63.6 KB


Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.582,20.903,1.07,2.54,3271.258,2.145,1.525,2.973,2.845,35.546,0.908,1.071,1.407,1.274,1.155,0.404
std,1.253334,12.058814,1.05935,1.607789,2822.736876,1.1114,1.344315,1.118715,1.103718,11.375469,0.421561,0.531264,0.577654,0.946478,0.362086,0.490943
min,0.0,4.0,0.0,0.0,250.0,0.0,0.0,1.0,1.0,19.0,0.0,0.0,1.0,0.0,1.0,0.0
25%,0.0,12.0,0.0,1.0,1365.5,2.0,0.0,2.0,2.0,27.0,1.0,1.0,1.0,1.0,1.0,0.0
50%,1.0,18.0,1.0,3.0,2319.5,2.0,1.0,3.0,3.0,33.0,1.0,1.0,1.0,1.0,1.0,0.0
75%,3.0,24.0,1.0,4.0,3972.25,2.0,3.0,4.0,4.0,42.0,1.0,1.0,2.0,1.0,1.0,1.0
max,3.0,72.0,4.0,5.0,18424.0,4.0,4.0,4.0,4.0,75.0,2.0,2.0,4.0,3.0,2.0,1.0


In [10]:
# splitting data into training and test set for independent attributes

train_set = credit_df.head(700) # Up to the last initial training set row
test_set = credit_df.tail(300) # Past the last initial training set row

# capture the target column ("default") into separate vectors for training set and test set
train_labels = train_set.pop("default")
test_labels = test_set.pop("default")


In [11]:
# invoking the decision tree classifier function. Using 'entropy' method of finding the split columns. Other option 
# could be gini index.  Restricting the depth of the tree to 5 (no particular reason for selecting this)

#dt_model = DecisionTreeClassifier(criterion = 'entropy' , max_depth = 5, random_state = 100)
                                  
dt_model = DecisionTreeClassifier(criterion = 'entropy' )

In [12]:
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
dt_model.score(test_set , test_labels)

0.6866666666666666

In [14]:
dt_model.score(train_set, train_labels)   #Overfit due to large complext tree. No training error , poor test performance

1.0

Ensemble Learning - Bagging

In [15]:
credit_labels = credit_df.pop("default")   #For ensemble, you do not need training and test data separately. 
                                           # bagging can use out of bag records for testing

In [21]:
# In the following lines, we call the bagging classifer with oob_score (out of bag score) set to true which false by default
# This makes the baggingclassifier use the 37% unused data for testing
# Compare the performance of the BGCL with regularized dt above. 
# Though not required, you can keep separate test data (outside the bootstrap sampling) on which we test the BGCL
# 

from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(n_estimators=50, max_samples=.7 , oob_score=True)

bgcl = bgcl.fit(credit_df, credit_labels)
print(bgcl.oob_score_)

bgcl=bgcl.fit(train_set,train_labels)
print(bgcl.score(test_set,test_labels))

0.756
0.74


Regularising the Decision Tree

In [19]:
dt_model = DecisionTreeClassifier(criterion = 'entropy',  max_depth = 5 )
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
print(dt_model.score(train_set , train_labels))

print(dt_model.score(test_set , test_labels))  # Relatively less overfit as training and test error are similar

0.7914285714285715
0.7133333333333334
