In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [3]:
credit_df = pd.read_csv("German Credit Dataset.csv")

In [4]:
credit_df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [5]:
credit_df.shape

(1000, 17)

In [6]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

In [12]:
#change the object type to numerical type
for feature in credit_df.columns:
    if credit_df[feature].dtype == "object":
        credit_df[feature] = pd.Categorical(credit_df[feature]).codes

In [13]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   checking_balance      1000 non-null   int8 
 1   months_loan_duration  1000 non-null   int64
 2   credit_history        1000 non-null   int8 
 3   purpose               1000 non-null   int8 
 4   amount                1000 non-null   int64
 5   savings_balance       1000 non-null   int8 
 6   employment_duration   1000 non-null   int8 
 7   percent_of_income     1000 non-null   int64
 8   years_at_residence    1000 non-null   int64
 9   age                   1000 non-null   int64
 10  other_credit          1000 non-null   int8 
 11  housing               1000 non-null   int8 
 12  existing_loans_count  1000 non-null   int64
 13  job                   1000 non-null   int8 
 14  dependents            1000 non-null   int64
 15  phone                 1000 non-null   int8 
 16  default

In [15]:
X = credit_df.drop("default" , axis=1)

y = credit_df.pop("default")

KeyError: "['default'] not found in axis"

In [16]:
from sklearn.model_selection import train_test_split

In [34]:
# splitting data into training and test set for independent attributes
X_train, X_test, train_labels, test_labels = train_test_split(X,y, test_size = .30, random_state=1)

In [36]:
list(X_train)

['checking_balance',
 'months_loan_duration',
 'credit_history',
 'purpose',
 'amount',
 'savings_balance',
 'employment_duration',
 'percent_of_income',
 'years_at_residence',
 'age',
 'other_credit',
 'housing',
 'existing_loans_count',
 'job',
 'dependents',
 'phone']

In [30]:
dt_model = DecisionTreeClassifier(criterion = "gini")

In [31]:
dt_model.fit(X_train, train_labels)

DecisionTreeClassifier()

In [32]:
from sklearn import tree

In [46]:
import os
train_char_label = ["Yes","No"]
credit_file = open("C:\\Users\\HP\Documents\\DataScience\\Data Mining\\Week2\\credit_file.dot","w")
dot_data = tree.export_graphviz(dt_model, out_file=credit_file, feature_names = list(X_train), class_names = list(train_char_label))
credit_file.close()

In [49]:
dt_model.feature_importances_

array([0.14737279, 0.11315173, 0.06534548, 0.06142514, 0.18226414,
       0.07362479, 0.07515496, 0.02964658, 0.0470321 , 0.11043665,
       0.02151994, 0.01133847, 0.013862  , 0.02345989, 0.01334856,
       0.01101678])

In [50]:
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))


                           Imp
checking_balance      0.147373
months_loan_duration  0.113152
credit_history        0.065345
purpose               0.061425
amount                0.182264
savings_balance       0.073625
employment_duration   0.075155
percent_of_income     0.029647
years_at_residence    0.047032
age                   0.110437
other_credit          0.021520
housing               0.011338
existing_loans_count  0.013862
job                   0.023460
dependents            0.013349
phone                 0.011017


In [51]:
y_predict = dt_model.predict(X_test)

In [53]:
print(y_predict)

[1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1
 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0
 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 1 0]
