## 1. Installing

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

#metrics and split
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.naive_bayes import GaussianNB


## 2. Data Preperation

In [3]:
df = pd.read_csv('german_credit_data.csv', index_col=0)
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [5]:
df.isnull().sum()


Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


### Missing Values Analysis

In [7]:
#mode of Saving Accounts attribute

mode_sav = df['Saving accounts'].mode()
mode_sav

0    little
Name: Saving accounts, dtype: object

In [8]:
df['Saving accounts'].fillna(mode_sav[0], inplace=True)

In [9]:
df['Saving accounts'].isnull().sum()

0

In [10]:
#mode of Checking Accouny
mode_check = df['Checking account'].mode()
mode_check

0    little
Name: Checking account, dtype: object

In [11]:
df['Checking account'].fillna(mode_check[0], inplace=True)

In [12]:
df['Checking account'].isnull().sum()

0

In [13]:
#labels
lab = df["Risk"].value_counts().keys().tolist()
#values
val = df["Risk"].value_counts().values.tolist()

trace = go.Pie(labels = lab, values = val, marker = dict(colors = [ 'royalblue','lime'], line = dict(color ="white",width =2)),
               rotation = 100,hoverinfo = "label+value+text", textinfo='label+percent',hole = .5)

layout = go.Layout(dict(title = "Risk Count",paper_bgcolor = "rgb(243,243,243)", plot_bgcolor  = "rgb(243,243,243)"))
                        
data = [trace]

fig = go.Figure(data = data,layout = layout)

py.iplot(fig)

### Label Encoding & One-Hot Encoding

In [14]:
df.Sex =  np.where(df.Sex =='male',1,0)

In [15]:
df.Dummies_Housing = pd.get_dummies(df['Housing'])
df.Dummies_Housing.head()


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,free,own,rent
0,False,True,False
1,False,True,False
2,False,True,False
3,True,False,False
4,True,False,False


In [16]:
df.Dummies_Saving = pd.get_dummies(df['Saving accounts'])
df.Dummies_Saving.head()


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,little,moderate,quite rich,rich
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,True,False,False,False
4,True,False,False,False


In [17]:
df.Dummies_Checking = pd.get_dummies(df['Checking account'])
df.Dummies_Checking.head()


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,little,moderate,rich
0,True,False,False
1,False,True,False
2,True,False,False
3,True,False,False
4,True,False,False


In [18]:
df.Dummies_Purpose = pd.get_dummies(df['Purpose'])
df.Dummies_Purpose.head()


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,business,car,domestic appliances,education,furniture/equipment,radio/TV,repairs,vacation/others
0,False,False,False,False,False,True,False,False
1,False,False,False,False,False,True,False,False
2,False,False,False,True,False,False,False,False
3,False,False,False,False,True,False,False,False
4,False,True,False,False,False,False,False,False


In [19]:
df = pd.concat([df,df.Dummies_Housing,df.Dummies_Saving,df.Dummies_Checking, df.Dummies_Purpose], axis=1)
df.drop(columns=['Housing', 'Saving accounts', 'Checking account', 'Purpose'], inplace=True)
df.head()

Unnamed: 0,Age,Sex,Job,Credit amount,Duration,Risk,free,own,rent,little,...,moderate,rich,business,car,domestic appliances,education,furniture/equipment,radio/TV,repairs,vacation/others
0,67,1,2,1169,6,good,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
1,22,0,2,5951,48,bad,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
2,49,1,1,2096,12,good,False,True,False,True,...,False,False,False,False,False,True,False,False,False,False
3,45,1,2,7882,42,good,True,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,53,1,2,4870,24,bad,True,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [20]:
df

Unnamed: 0,Age,Sex,Job,Credit amount,Duration,Risk,free,own,rent,little,...,moderate,rich,business,car,domestic appliances,education,furniture/equipment,radio/TV,repairs,vacation/others
0,67,1,2,1169,6,good,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
1,22,0,2,5951,48,bad,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
2,49,1,1,2096,12,good,False,True,False,True,...,False,False,False,False,False,True,False,False,False,False
3,45,1,2,7882,42,good,True,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,53,1,2,4870,24,bad,True,False,False,True,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,1736,12,good,False,True,False,True,...,False,False,False,False,False,False,True,False,False,False
996,40,1,3,3857,30,good,False,True,False,True,...,False,False,False,True,False,False,False,False,False,False
997,38,1,2,804,12,good,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
998,23,1,2,1845,45,bad,True,False,False,True,...,False,False,False,False,False,False,False,True,False,False


### Correlation and Heatmap Analysis

Để thấy rằng các features là độc lập có điều kiện với nhau (ĐK áp dụng Naive Bayes)

In [21]:
plt.figure(figsize=(20,13))
sns.heatmap(df.corr(),
            cmap='coolwarm',
            annot=True,
            fmt=".2f",
            annot_kws={'size':16},
            cbar=False)

ValueError: could not convert string to float: 'good'

<Figure size 2000x1300 with 0 Axes>

Nhận thấy mối tương quan của các feartures với nhau khá thấp -> thỏa mãn điều kiện

## 3. Modelling

In [None]:
y = df['Risk']

X = df.drop(columns=['Risk'])
X.head()

Unnamed: 0,Age,Sex,Job,Credit amount,Duration,free,own,rent,little,moderate,...,moderate.1,rich,business,car,domestic appliances,education,furniture/equipment,radio/TV,repairs,vacation/others
0,67,1,2,1169,6,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,22,0,2,5951,48,0,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,49,1,1,2096,12,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,45,1,2,7882,42,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,53,1,2,4870,24,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
X.columns

Index(['Age', 'Sex', 'Job', 'Credit amount', 'Duration', 'free', 'own', 'rent',
       'little', 'moderate', 'quite rich', 'rich', 'little', 'moderate',
       'rich', 'business', 'car', 'domestic appliances', 'education',
       'furniture/equipment', 'radio/TV', 'repairs', 'vacation/others'],
      dtype='object')

In [None]:
# Spliting X and y into train and test version
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [None]:
from sklearn.utils import resample
from sklearn.metrics import roc_curve

GNB = GaussianNB()

# Fitting with train data
model = GNB.fit(X_train, y_train)

print("Primitive error evaluation accuracy score: ", model.score(X_train, y_train))

y_pred = model.predict(X_test)

print("Test predict accuracy score: ", accuracy_score(y_test,y_pred),"\n")

print("Confussion Matrix: \n", confusion_matrix(y_test, y_pred),"\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_pred))

print ("Accuracy of KNN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Primitive error evaluation accuracy score:  0.672
Test predict accuracy score:  0.648 

Confussion Matrix: 
 [[ 29  43]
 [ 45 133]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

         bad       0.39      0.40      0.40        72
        good       0.76      0.75      0.75       178

    accuracy                           0.65       250
   macro avg       0.57      0.57      0.57       250
weighted avg       0.65      0.65      0.65       250

Accuracy of KNN: 64.80 %


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

KNN = KNeighborsClassifier(n_neighbors=10, p = 2)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print ("Accuracy of KNN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of KNN: 62.40 %


Nhận xét: Bộ dữ liệu đầu vào ít (1000 mẫu) và mất cân bằng tỉ lệ giữa good và bad của Risk nên độ chính xác chưa cao và chênh lệch lớn ở kết quả good-bad của các độ đo

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
model_dt = dtc.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)
print ("Accuracy of DCT: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of DCT: 62.00 %


In [None]:
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
model_nb = GNB.fit(X_train, y_train)
y_pred = model_nb.predict(X_test)
print ("Accuracy of DCT: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of DCT: 64.80 %


In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
model_lr = lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
print ("Accuracy of DCT: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of DCT: 69.20 %



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=None, max_features=10, n_estimators=15, random_state=2)

model_rf = rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
print ("Accuracy of DCT: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

NameError: name 'X_train' is not defined