## Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Read Data

In [2]:
data = pd.read_csv('german_credit_data.csv', index_col=0)

In [3]:
data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [5]:
data.describe(include='all')

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
count,1000.0,1000,1000.0,1000,817,606,1000.0,1000.0,1000,1000
unique,,2,,3,4,3,,,8,2
top,,male,,own,little,little,,,car,good
freq,,690,,713,603,274,,,337,700
mean,35.546,,1.904,,,,3271.258,20.903,,
std,11.375469,,0.653614,,,,2822.736876,12.058814,,
min,19.0,,0.0,,,,250.0,4.0,,
25%,27.0,,2.0,,,,1365.5,12.0,,
50%,33.0,,2.0,,,,2319.5,18.0,,
75%,42.0,,2.0,,,,3972.25,24.0,,


In [6]:
data['Job'].value_counts().to_frame()

Unnamed: 0_level_0,count
Job,Unnamed: 1_level_1
2,630
1,200
3,148
0,22


In [7]:
data['Risk'].value_counts().to_frame()

Unnamed: 0_level_0,count
Risk,Unnamed: 1_level_1
good,700
bad,300


In [8]:
data['Saving accounts'].value_counts().to_frame()

Unnamed: 0_level_0,count
Saving accounts,Unnamed: 1_level_1
little,603
moderate,103
quite rich,63
rich,48


In [9]:
data['Purpose'].value_counts().to_frame()

Unnamed: 0_level_0,count
Purpose,Unnamed: 1_level_1
car,337
radio/TV,280
furniture/equipment,181
business,97
education,59
repairs,22
domestic appliances,12
vacation/others,12


In [10]:
data['Checking account'].value_counts().to_frame()

Unnamed: 0_level_0,count
Checking account,Unnamed: 1_level_1
little,274
moderate,269
rich,63


In [11]:
data['Purpose'].value_counts().to_frame()

Unnamed: 0_level_0,count
Purpose,Unnamed: 1_level_1
car,337
radio/TV,280
furniture/equipment,181
business,97
education,59
repairs,22
domestic appliances,12
vacation/others,12


## Fill missing value

In [12]:
for col in data.columns:
  missing_data = data[col].isna().sum()
  missing_percent = missing_data/len(data) * 100
  print(f'Missing data of {col} is {missing_percent}"%' )

Missing data of Age is 0.0"%
Missing data of Sex is 0.0"%
Missing data of Job is 0.0"%
Missing data of Housing is 0.0"%
Missing data of Saving accounts is 18.3"%
Missing data of Checking account is 39.4"%
Missing data of Credit amount is 0.0"%
Missing data of Duration is 0.0"%
Missing data of Purpose is 0.0"%
Missing data of Risk is 0.0"%


In [13]:
def preprocess_missing_value(data):
    mode_saving_accounts = data['Saving accounts'].mode()
    data['Saving accounts'].fillna(mode_saving_accounts[0], inplace=True)

    mode_checking_account = data['Checking account'].mode()
    data['Checking account'].fillna(mode_checking_account[0], inplace=True)

In [14]:
preprocess_missing_value(data)

In [15]:
for col in data.columns:
  missing_data = data[col].isna().sum()
  missing_percent = missing_data/len(data) * 100
  print(f'Missing data of {col} is {missing_percent}"%' )

Missing data of Age is 0.0"%
Missing data of Sex is 0.0"%
Missing data of Job is 0.0"%
Missing data of Housing is 0.0"%
Missing data of Saving accounts is 0.0"%
Missing data of Checking account is 0.0"%
Missing data of Credit amount is 0.0"%
Missing data of Duration is 0.0"%
Missing data of Purpose is 0.0"%
Missing data of Risk is 0.0"%


## Change form data to numberic

In [16]:
def encoding(data):
    for column in ['Saving accounts', 'Checking account']:
        data[column] = data[column].fillna('nan')
    data['Saving accounts'] = data['Saving accounts'].fillna('nan')

    data['Sex'] = data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
    data['Risk'] = data['Risk'].apply(lambda x: 1 if x == 'good' else 0)
    data['Housing'] = data['Housing'].apply(lambda x: 0 if x == 'free' else (1 if x == 'own' else 2))
    data['Saving accounts'] = data['Saving accounts'].apply(lambda x: 0 if x == 'nan' 
                                                                        else(1 if x == 'little' 
                                                                        else(2 if x == 'moderate'
                                                                        else(3 if x == 'quite rich' 
                                                                        else 4))))
                                                                        
    data['Checking account'] = data['Checking account'].apply(lambda x: 0 if x == 'nan' 
                                                                        else(1 if x == 'little' 
                                                                        else(2 if x == 'moderate'
                                                                        else(3 if x == 'quite rich' 
                                                                        else 4))))
    data['Purpose'] = data['Purpose'].apply(lambda x: 5 if x == 'car' 
                                                        else (4 if x == 'radio/TV'
                                                        else (3 if x == 'furniture/equipment'
                                                        else (2 if x == 'business'
                                                        else (1 if x == 'education'
                                                        else 0)))))

In [17]:
encoding(data)


In [18]:
data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,1,1,1169,6,4,1
1,22,0,2,1,1,2,5951,48,4,0
2,49,1,1,1,1,1,2096,12,1,1
3,45,1,2,0,1,1,7882,42,3,1
4,53,1,2,0,1,1,4870,24,5,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,1,1,1,1736,12,3,1
996,40,1,3,1,1,1,3857,30,5,1
997,38,1,2,1,1,1,804,12,4,1
998,23,1,2,0,1,1,1845,45,4,0


## Split data

In [19]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values


In [20]:
X

array([[  67,    1,    2, ..., 1169,    6,    4],
       [  22,    0,    2, ..., 5951,   48,    4],
       [  49,    1,    1, ..., 2096,   12,    1],
       ...,
       [  38,    1,    2, ...,  804,   12,    4],
       [  23,    1,    2, ..., 1845,   45,    4],
       [  27,    1,    2, ..., 4576,   45,    5]], dtype=int64)

In [21]:
y

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [24]:
from sklearn.preprocessing import StandardScaler, Normalizer
nm = StandardScaler()
X_train = nm.fit_transform(X_train)

In [25]:
X_train

array([[ 2.4061873 ,  0.68205679,  0.10704816, ...,  1.19991221,
         3.29708155, -1.12341523],
       [-0.22436411,  0.68205679,  0.10704816, ..., -0.35962991,
        -0.00805096, -1.82173083],
       [ 1.26628169,  0.68205679,  0.10704816, ..., -0.73354714,
        -1.27925578,  0.97153159],
       ...,
       [-0.7504744 ,  0.68205679,  0.10704816, ...,  0.84360327,
         0.24619   ,  0.97153159],
       [-0.92584449,  0.68205679,  0.10704816, ..., -0.64792251,
        -0.77077385,  0.27321598],
       [-1.01352954, -1.46615357,  0.10704816, ..., -0.83850636,
        -1.27925578,  0.27321598]])

In [26]:
X_train.shape

(800, 9)

In [27]:
X_test = nm.fit_transform(X_test)

In [28]:
X_test.shape

(200, 9)

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

KNN = KNeighborsClassifier(n_neighbors=3, p = 2)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)

print("Confussion Matrix: \n", confusion_matrix(y_test, y_pred),"\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_pred))

print ("Accuracy of KNN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))


Confussion Matrix: 
 [[ 14  45]
 [ 23 118]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

           0       0.38      0.24      0.29        59
           1       0.72      0.84      0.78       141

    accuracy                           0.66       200
   macro avg       0.55      0.54      0.53       200
weighted avg       0.62      0.66      0.63       200

Accuracy of KNN: 66.00 %


## Hyperparameter Tuning

In [30]:
from sklearn.model_selection import GridSearchCV

neighbor = range (5, 52, 2)

grid_params = { 'n_neighbors' : neighbor,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)
g_res.best_score_


Fitting 3 folds for each of 144 candidates, totalling 432 fits


0.7199928660602818

In [31]:
g_res.best_params_


{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}

In [32]:
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'uniform',algorithm = 'brute',metric = 'manhattan')
knn.fit(X_train, y_train)

In [33]:
y_knn = knn.predict(X_test)

## Model Evaluation

In [34]:
print("Confussion Matrix: \n", confusion_matrix(y_test, y_knn),"\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_knn))

print ("Accuracy of KNN: %.2f %%" %(100*accuracy_score(y_test, y_knn)))

Confussion Matrix: 
 [[  9  50]
 [  9 132]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

           0       0.50      0.15      0.23        59
           1       0.73      0.94      0.82       141

    accuracy                           0.70       200
   macro avg       0.61      0.54      0.53       200
weighted avg       0.66      0.70      0.65       200

Accuracy of KNN: 70.50 %


In [35]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knn, X, y, cv =5)
print('Model accuracy: ',np.mean(scores))


Model accuracy:  0.6880000000000001


## SMOTE

In [36]:
from imblearn.over_sampling import SMOTE 

X_train, y_train = SMOTE(random_state = 42).fit_resample(X_train, y_train)

X_train.shape

(1118, 9)

In [37]:
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'uniform',algorithm = 'brute',metric = 'manhattan')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Confussion Matrix: \n", confusion_matrix(y_test, y_pred),"\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_pred))

print ("Accuracy of KNN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Confussion Matrix: 
 [[27 32]
 [49 92]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

           0       0.36      0.46      0.40        59
           1       0.74      0.65      0.69       141

    accuracy                           0.59       200
   macro avg       0.55      0.56      0.55       200
weighted avg       0.63      0.60      0.61       200

Accuracy of KNN: 59.50 %
