In [63]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier


In [64]:
# import dataset
dataset = pd.read_csv('csv/cleaned_train_all.csv')
dataset.head()
dataset.shape

(43508, 17)

In [65]:
x = dataset.drop(['credit_card_default'], axis = 1)
y = dataset['credit_card_default'].values
print(x.shape, y.shape)

(43508, 16) (43508,)


In [66]:
x.head()

Unnamed: 0,customer_id,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
0,CST_115179,46,0,0.0,1,0.0,107934.04,612.0,1.0,1.0,33070.28,18690.93,73,544,2,1
1,CST_121920,29,1,0.0,1,0.0,109862.62,2771.0,2.0,0.0,15329.53,37745.19,52,857,0,0
2,CST_109330,37,1,0.0,1,0.0,230153.17,204.0,2.0,0.0,48416.6,41598.36,43,650,0,0
3,CST_128288,39,0,0.0,1,0.0,122325.82,11941.0,2.0,0.0,22574.36,32627.76,20,754,0,0
4,CST_151355,46,1,1.0,1,0.0,387286.0,1459.0,1.0,0.0,38282.95,52950.64,75,927,0,0


In [67]:
print(y)

[1 0 0 ... 0 0 0]


In [68]:
scalar =  preprocessing.StandardScaler()

# copy of datasets
X_train = x.copy()

# numerical features

#for all 
num_cols = ['net_yearly_income','no_of_days_employed','yearly_debt_payments','credit_limit']

#for less
#num_cols = ["credit_limit_used(%)", "credit_score"]

# apply standardization on numerical features
for i in num_cols:
    
    # fit on training data column
    scale = scalar.fit(X_train[[i]])
    
    # transform the training data column
    X_train[i] = scale.transform(X_train[[i]])


In [69]:
X_train = X_train.set_index('customer_id')
X_train.head()

Unnamed: 0_level_0,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CST_115179,46,0,0.0,1,0.0,107934.04,612.0,1.0,1.0,33070.28,18690.93,73,544,2,1
CST_121920,29,1,0.0,1,0.0,109862.62,2771.0,2.0,0.0,15329.53,37745.19,52,857,0,0
CST_109330,37,1,0.0,1,0.0,230153.17,204.0,2.0,0.0,48416.6,41598.36,43,650,0,0
CST_128288,39,0,0.0,1,0.0,122325.82,11941.0,2.0,0.0,22574.36,32627.76,20,754,0,0
CST_151355,46,1,1.0,1,0.0,387286.0,1459.0,1.0,0.0,38282.95,52950.64,75,927,0,0


In [70]:
# Splitting Dataset into Training and Test Set
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2  , random_state=2)

In [71]:
#Parameter tuning with GridSearchCV 

estimator_KNN = KNeighborsClassifier(algorithm='auto')
parameters_KNN = { 
    'n_neighbors': (1,10, 1),
    'leaf_size': (20,40,1),
    'p': (1,2),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')}
                   
# with GridSearch
grid_search_KNN = GridSearchCV(
     estimator=estimator_KNN,
     param_grid=parameters_KNN,
     scoring = 'accuracy',
     n_jobs = -1,
     cv = 5)

In [72]:
KNN = grid_search_KNN.fit(X_train, y_train)
y_pred_KNN =KNN.predict(X_test)

#Parameter setting that gave the best results on the hold out data.
print(grid_search_KNN.best_params_ ) 
#Mean cross-validated score of the best_estimator
print('Best Score - KNN:', grid_search_KNN.best_score_ )



{'leaf_size': 20, 'metric': 'chebyshev', 'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
Best Score - KNN: 0.9184336039577998


In [73]:
print('Accuracy Score - KNN ', metrics.accuracy_score(y_test, y_pred_KNN)) 


Accuracy Score - KNN  0.9204780510227534


In [74]:
print('F1 Score - KNN:', metrics.f1_score(y_test, y_pred_KNN)) 


F1 Score - KNN: 0.0
