In [29]:
import pandas as pd
from pandas import get_dummies

import numpy as np

from imblearn.over_sampling import SMOTE 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [30]:
df = pd.read_csv('german_credit_data.csv')
train_labels = df['Risk']
train_data = df.drop('Risk',axis = 'columns')
train_data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [31]:
test_df = pd.read_csv('german_credit_data.csv')
test_labels = test_df['Risk']
test_data = test_df.drop('Risk', axis='columns')
test_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car
...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment
996,996,40,male,3,own,little,little,3857,30,car
997,997,38,male,2,own,little,,804,12,radio/TV
998,998,23,male,2,free,little,little,1845,45,radio/TV


In [32]:
test_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car
...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment
996,996,40,male,3,own,little,little,3857,30,car
997,997,38,male,2,own,little,,804,12,radio/TV
998,998,23,male,2,free,little,little,1845,45,radio/TV


In [33]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
dtypes: int64(5), object(5)
memory usage: 78.2+ KB


In [34]:
missing_value_cols = ['Saving accounts','Checking account']

In [35]:
train_data[missing_value_cols] = train_data[missing_value_cols].replace(np.nan,'NOT_FILL')
test_data[missing_value_cols] = test_data[missing_value_cols].replace(np.nan,'NOT_FILL')

In [36]:
train_data.dtypes

Unnamed: 0           int64
Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
dtype: object

In [37]:
# for value in train_data['Credit amount']:
#     if value > 10000: value = 10000


train_data['Credit amount'] = train_data['Credit amount'].map(lambda x: 10000 if x > 10000 else x)
train_data['Credit amount'] = np.log(train_data['Credit amount']+1)

print(train_data['Credit amount'].unique())

[7.06475903 8.69148258 7.64826303 8.97246382 8.49105453 9.1111828
 7.95014989 8.84635304 8.02617019 8.56312212 7.16703788 8.36846114
 7.3575562  7.09007684 7.24708058 7.15695636 7.7935868  8.99628044
 9.21044037 8.14060704 7.66622193 7.88155992 7.7151236  7.49831587
 7.63530389 7.22620901 6.05678401 6.01615716 7.78986856 8.83010432
 7.55695057 8.29928591 8.67709871 7.1428274  7.29641327 8.46526812
 8.7178457  7.6501687  7.11151212 6.12905021 7.75533881 7.05531284
 8.7331107  8.73036721 8.72323127 7.23993259 7.7406644  7.21007963
 8.88585599 7.63723439 8.69383197 7.14124512 8.12533509 7.70796153
 6.66440902 8.77477682 9.16607496 7.58171964 8.73713161 7.23849684
 7.33823815 7.57763383 8.06526521 8.55468164 7.68340368 6.91572345
 7.50659178 7.78113851 9.00380809 6.59441346 7.06047637 8.69198648
 7.58984151 7.33106031 8.28551331 8.47052078 9.15239341 8.25140307
 8.69013759 7.10167597 7.35819375 7.47079377 7.74759684 7.25347038
 7.7186855  7.01121399 6.42810527 7.25134498 6.6821086  8.19367

In [38]:
numeric_cols = ['Job','Credit amount','Duration']
categorical_cols = ['Sex','Housing','Saving accounts','Checking account','Purpose', 'Age']

In [39]:
def age_transform(value):
    if value > 70: value = "O3"
    elif value > 55: value = "O2"
    elif value > 35: value = 'O1'
    elif value > 25: value = 'O0'
    else:
        value = "YOUNG"
    return value

train_data['Age'] = train_data['Age'].map(age_transform)
test_data['Age'] = test_data['Age'].map(age_transform)


# def job_transform(value):
#     if value < 1 : value = 'No rent'
#     elif value < 2 : value = 'None skill - No rent'
#     elif value < 3: value = 'Skill - Rent'
#     elif value < 4 : value = 'High skill'
#     else: value = 'None'
#     return value

# train_data['Job'] = train_data['Job'].map(job_transform)
# test_data['Job'] = test_data['Job'].map(job_transform)


In [40]:
for col in categorical_cols:
    print(col,train_data[col].unique())

Sex ['male' 'female']
Housing ['own' 'free' 'rent']
Saving accounts ['NOT_FILL' 'little' 'quite rich' 'rich' 'moderate']
Checking account ['little' 'moderate' 'NOT_FILL' 'rich']
Purpose ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']
Age ['O2' 'YOUNG' 'O1' 'O0' 'O3']


In [41]:
train_data['Housing'] = train_data['Housing'].apply(lambda x: 0 if x == 'free' else (2 if x == 'own' else 1))
train_data['Saving accounts'] = train_data['Saving accounts'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))
                                                                    
train_data['Checking account'] = train_data['Checking account'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))

In [42]:
test_data['Housing'] = test_data['Housing'].apply(lambda x: 0 if x == 'free' else (1 if x == 'own' else 2))
test_data['Saving accounts'] = test_data['Saving accounts'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))
                                                                    
test_data['Checking account'] = test_data['Checking account'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))

In [43]:
numeric_cols = ['Job','Credit amount','Duration','Housing','Saving accounts','Checking account']
categorical_cols = ['Age','Sex','Purpose']

In [44]:
scaler = MinMaxScaler()
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
train_data = get_dummies(train_data, categorical_cols)
train_data, train_labels = SMOTE().fit_resample(train_data, train_labels)

In [45]:
for col in train_data.columns:
    print(col,train_data[col].unique())

Unnamed: 0 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243

In [46]:
for value in test_data['Credit amount']:
    if value > 10000: value = 10000
test_data['Credit amount'] = np.log(test_data['Credit amount']+1)
test_data[numeric_cols] = scaler.fit_transform(test_data[numeric_cols])
test_data = get_dummies(test_data, categorical_cols)


In [47]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_data,train_labels)

In [48]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7, p = 2)
knn.fit(train_data, train_labels)


In [49]:
pred1 = gnb.predict(test_data)

from sklearn.metrics import classification_report
print(classification_report(y_pred=pred1, y_true = test_labels))

              precision    recall  f1-score   support

         bad       0.41      0.73      0.52       300
        good       0.82      0.55      0.66       700

    accuracy                           0.60      1000
   macro avg       0.62      0.64      0.59      1000
weighted avg       0.70      0.60      0.62      1000



In [50]:
pred = knn.predict(test_data)

from sklearn.metrics import classification_report
print(classification_report(y_pred=pred, y_true = test_labels))

              precision    recall  f1-score   support

         bad       0.47      0.65      0.54       300
        good       0.82      0.68      0.75       700

    accuracy                           0.67      1000
   macro avg       0.64      0.67      0.64      1000
weighted avg       0.71      0.67      0.68      1000

