# Libraries

In [2]:
# Imports
# numpy,pandas,scipy, math, matplotlib
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

import seaborn as sns

from matplotlib.colors import ListedColormap

# for KNN
from sklearn import neighbors, datasets

# for class variables
from sklearn.preprocessing import OneHotEncoder

# Model metrics
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

# Cross validation
from sklearn.model_selection import train_test_split

# Random
import random

# Import Data

In [3]:
# Load the two input tables (paths are relative to the notebook's working directory)

# cleaned credit data set
credit_csv = 'Data/creditClean.csv'
# dummy-encoded cluster columns
cluster_dummies_csv = 'Data/clusterDummies.csv'

credit = pd.read_csv(credit_csv)
dummies = pd.read_csv(cluster_dummies_csv)



In [4]:
# Put the cluster dummy columns next to the credit columns.
# Only columns at positions 1..7 of `dummies` are kept.
# NOTE(review): column 0 is skipped -- presumably an index column written with the CSV; confirm.
cluster_columns = dummies.iloc[:, 1:8]

# axis=1 joins column-wise; rows are matched on their index labels, so both frames
# are assumed to share the same row index -- TODO confirm they have equal length.
creditCluster = pd.concat(
    [credit, cluster_columns],
    axis=1,
    sort=False,
)

# Prepare Model

In [5]:
# Seed the global generators of both `random` and NumPy with the same value
# so that a fresh top-to-bottom run is repeatable.
SEED = 123
for seed_generator in (random.seed, np.random.seed):
    seed_generator(SEED)

In [6]:
# Split the data into features / target and hold out 25% of the rows for testing.

# Independent variables (features): every column except the target
indepVars = creditCluster.drop(['Default'], axis = 1)

# Dependent variable (target); kept as a one-column DataFrame because
# later cells index it with ['Default']
depVar = creditCluster.loc[: , ['Default']]

# Pin the shuffle seed on the call itself instead of relying only on the global
# NumPy seed: any cell inserted in between that draws random numbers would
# otherwise silently change the partition. 123 is the same value used for the
# global seed, so a fresh Restart & Run All should yield the same split as before.
x_train, x_test, y_train, y_test = train_test_split(
    indepVars, depVar, test_size = 0.25, random_state = 123)

print('training set looks like: {0}, and the testing set like: {1}'.format(
    x_train.shape, x_test.shape))

training set looks like: (22500, 27), and the testing set like: (7500, 27)


In [7]:
# KNN Model

# The classifier wants a 1-D label vector, so pull the single target column
# out of the one-column y_train frame.
train_labels = y_train['Default']

# k-nearest-neighbours classifier that looks at the 10 closest training rows
# NOTE(review): KNN is distance-based and no feature scaling is visible in this
# notebook section -- confirm the inputs were scaled upstream.
knn_model = neighbors.KNeighborsClassifier(n_neighbors = 10)

# fit() hands back the fitted estimator, so credit_knn refers to the trained model
credit_knn = knn_model.fit(x_train, train_labels)

In [8]:
# Predict the class of every held-out row
predictions = credit_knn.predict(x_test)

# Check performance
# scikit-learn metrics take the ground truth first and the predictions second.
# Accuracy and Cohen's kappa are symmetric, so the reported numbers are the same
# either way, but keeping the conventional order avoids mistakes when an
# asymmetric metric (precision, recall, ...) is added later.
# The 1-D 'Default' column is used, matching how the labels were passed to fit().
y_test_labels = y_test['Default']
accuracy = accuracy_score(y_test_labels, predictions)
kappa = cohen_kappa_score(y_test_labels, predictions)

# Both scores are printed as percentages with two decimals
print("Accuracy : {0:.2%}".format(accuracy))
print("Kappa : {0:.2%}".format(kappa))

Accuracy : 77.87%
Kappa : 7.59%


In [None]:
# Confusion Matrix

# First, change 0/1 into 'Pay'/'Default'
# for df y_test:
default_int_to_object = {0: 'Pay', 1: 'Default'}
y_test_obj = y_test.replace(default_int_to_object)
# for array predictions (any value other than 1 is labelled 'Pay')
predictions_obj = ['Default' if x == 1 else 'Pay' for x in predictions]

# Labels: this list fixes the row / column order of the matrix
labels = ['Pay', 'Default']

# Pass the 1-D 'Default' column rather than the one-column DataFrame
cm = confusion_matrix(y_true = y_test_obj['Default'],
                      y_pred = predictions_obj,
                      labels = labels)
print(cm)

# Plot the matrix as a colour-coded grid
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('KNN Confusion Matrix')
fig.colorbar(cax)

# Fix the tick positions to the matrix cells before labelling them. Padding the
# label list with '' only works when the automatic locator happens to put an
# extra tick in front of the first cell, which is not guaranteed.
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

# Rows of the matrix are the true classes, columns the predicted ones
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()