<h1 align="center"> Task: German Credit Classification</h1>


In [4]:
# Import modules
import pandas as pd
from patsy import dmatrices
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import metrics
import os
import sys

## 1. Data

In [5]:
# Load Data
os.getcwd()
dat = pd.read_csv("credit-g.csv")

In [6]:
# View data head
dat.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


In [7]:
# Summary Statistics
dat.describe()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0


## 2. Data Preprocessing

### 2.1 Data Transformation

In [14]:
# The two-level class attribute is transformed from good/bad to 1/0 int type.
dat['Class'] = dat['class']
dat['Class'] = dat['Class'].str.replace('good','1')
dat['Class'] = dat['Class'].str.replace('bad','0')
dat['Class'] = dat['Class'].astype('int')
dat.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class,Class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,67,none,own,2,skilled,1,yes,yes,good,1
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,22,none,own,1,skilled,1,none,yes,bad,0
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,49,none,own,1,'unskilled resident',2,none,yes,good,1
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,45,none,'for free',1,skilled,2,none,yes,good,1
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,53,none,'for free',2,skilled,2,none,yes,bad,0


### 2.2 Creating Design Matrices

In [15]:
# Creating design matrices. 0 is used to exclude intercept info . Note that y is the container of the response attribute, 
# whereas X holds all the predictors.
y, X = dmatrices('Class ~ 0 + checking_status + duration+credit_history + savings_status + employment'+
                 '+ installment_commitment + personal_status + other_parties + residence_since + property_magnitude'+
                 '+ age + other_payment_plans + housing + existing_credits + job+num_dependents + own_telephone' +
                 '+ foreign_worker',
                 data=dat, 
                 return_type='dataframe')

In [18]:
X.head()

Unnamed: 0,checking_status['no checking'],checking_status[0<=X<200],checking_status[<0],checking_status[>=200],credit_history[T.'critical/other existing credit'],credit_history[T.'delayed previously'],credit_history[T.'existing paid'],credit_history[T.'no credits/all paid'],savings_status[T.100<=X<500],savings_status[T.500<=X<1000],...,job[T.'unskilled resident'],job[T.skilled],own_telephone[T.yes],foreign_worker[T.yes],duration,installment_commitment,residence_since,age,existing_credits,num_dependents
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,6.0,4.0,4.0,67.0,2.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,48.0,2.0,2.0,22.0,1.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,12.0,2.0,3.0,49.0,1.0,2.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,42.0,2.0,4.0,45.0,1.0,2.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,24.0,3.0,4.0,53.0,2.0,2.0


In [10]:
y.head()

Unnamed: 0,Class
0,1.0
1,0.0
2,1.0
3,1.0
4,0.0


### 2.3 Normalize Data

We now normalize the data to make sure the variables are analyzed in the same scale. This is also needed in order to implement
some of the models we intend using. In this case, we normalize the data into the scale range [0, 1]


In [13]:
# Normalize columns in X to range [0,1]
X_scale = preprocessing.minmax_scale(X, feature_range=(0, 1), axis=0)
X_scale = pd.DataFrame(X_scale)
X_scale.columns = X.columns
X_scale.head()

Unnamed: 0,checking_status['no checking'],checking_status[0<=X<200],checking_status[<0],checking_status[>=200],credit_history[T.'critical/other existing credit'],credit_history[T.'delayed previously'],credit_history[T.'existing paid'],credit_history[T.'no credits/all paid'],savings_status[T.100<=X<500],savings_status[T.500<=X<1000],...,job[T.'unskilled resident'],job[T.skilled],own_telephone[T.yes],foreign_worker[T.yes],duration,installment_commitment,residence_since,age,existing_credits,num_dependents
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.029412,1.0,1.0,0.857143,0.333333,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.647059,0.333333,0.333333,0.053571,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.117647,0.333333,0.666667,0.535714,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.558824,0.333333,1.0,0.464286,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.294118,0.666667,1.0,0.607143,0.333333,1.0


In [10]:
X_scale.describe()

Unnamed: 0,checking_status['no checking'],checking_status[0<=X<200],checking_status[<0],checking_status[>=200],credit_history[T.'critical/other existing credit'],credit_history[T.'delayed previously'],credit_history[T.'existing paid'],credit_history[T.'no credits/all paid'],savings_status[T.100<=X<500],savings_status[T.500<=X<1000],...,job[T.'unskilled resident'],job[T.skilled],own_telephone[T.yes],foreign_worker[T.yes],duration,installment_commitment,residence_since,age,existing_credits,num_dependents
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.394,0.269,0.274,0.063,0.293,0.088,0.53,0.04,0.103,0.063,...,0.2,0.63,0.404,0.963,0.248574,0.657667,0.615,0.295464,0.135667,0.155
std,0.488879,0.443662,0.446232,0.243085,0.455366,0.283437,0.499349,0.196057,0.304111,0.243085,...,0.4002,0.483046,0.490943,0.188856,0.177336,0.372905,0.367906,0.203133,0.192551,0.362086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.117647,0.333333,0.333333,0.142857,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.205882,0.666667,0.666667,0.25,0.0,0.0
75%,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.294118,1.0,1.0,0.410714,0.333333,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. k-NN Using Simple Split

### 3.1. Data Partition

We use the sklearn.model_selection.train_test_split() method to split the dataset into test and training sets. 

In [22]:
# 30-70% simple split
# To make the result reproducible, set the random_state
train_y,test_y,train_X,test_X = model_selection.train_test_split(y, X_scale,
                                                                 test_size=0.3,
                                                                 random_state=123)

In [23]:
train_y.shape

(700, 1)

In [24]:
test_y.shape

(300, 1)

In [25]:
train_X.shape

(700, 39)

In [26]:
test_X.shape

(300, 39)

### 3.2. Tune the k-NN Classifier 

The choice of the paramter value k has impact on the performance of the k-NN algorithm.
In the following, we tune the k parameter based on accuracy.

In [33]:
for k in range(20):
    k = k + 1
    knn = neighbors.KNeighborsClassifier(n_neighbors = k, 
                                         weights='uniform', 
                                         algorithm='auto')
    knn.fit(train_X, train_y.Class)
    pred_y = knn.predict(test_X)
    print("Accuracy is ", round(metrics.accuracy_score(test_y, pred_y)*100,2),"% for k =",k)

Accuracy is  69.67 % for k = 1
Accuracy is  65.67 % for k = 2
Accuracy is  71.33 % for k = 3
Accuracy is  70.0 % for k = 4
Accuracy is  74.0 % for k = 5
Accuracy is  73.33 % for k = 6
Accuracy is  70.67 % for k = 7
Accuracy is  75.0 % for k = 8
Accuracy is  73.67 % for k = 9
Accuracy is  75.0 % for k = 10
Accuracy is  73.33 % for k = 11
Accuracy is  73.33 % for k = 12
Accuracy is  73.67 % for k = 13
Accuracy is  71.67 % for k = 14
Accuracy is  73.67 % for k = 15
Accuracy is  75.0 % for k = 16
Accuracy is  74.33 % for k = 17
Accuracy is  74.67 % for k = 18
Accuracy is  74.67 % for k = 19
Accuracy is  75.33 % for k = 20


From the above parameter tuning, it seems the best k-NN classifier for this dataset is k = 24. 

### 3.3 Train a k-NN Classifier 


In [34]:
# KNN: K=8, default measure of distance (euclidean)
knn8 = neighbors.KNeighborsClassifier(n_neighbors=8, 
                                      weights='uniform', 
                                      algorithm='auto')

In [35]:
knn8.fit(train_X, train_y.Class)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [36]:
pred_y = knn8.predict(test_X)

In [37]:
# Print confusion matrix
print(metrics.confusion_matrix(test_y, pred_y))

[[ 55  45]
 [ 30 170]]


In [38]:
# Calculate classification accuracy
metrics.accuracy_score(test_y, pred_y)

0.75

In [40]:
# Manually calculate classification accuracy
(55+170)/(55+45+30+170)

0.75

In [39]:
# Calculate AUC
metrics.roc_auc_score(test_y, pred_y)

0.70000000000000007

In [41]:
# Calculate Cohen's Kappa
metrics.cohen_kappa_score(test_y, pred_y)

0.41558441558441561

In [42]:
# Print classification report
print(metrics.classification_report(test_y, pred_y))

             precision    recall  f1-score   support

        0.0       0.65      0.55      0.59       100
        1.0       0.79      0.85      0.82       200

avg / total       0.74      0.75      0.74       300

