In [1]:
import warnings
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
# NOTE(review): this blanket filter silences every warning raised from here on,
# including scikit-learn and pandas diagnostics; consider narrowing it to the
# specific warning categories that are known to be harmless.
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [2]:
# Load the loan dataset (presumably already z-score normalised, judging by the
# file name -- TODO confirm) from the parent directory.
data = pd.read_csv(filepath_or_buffer='../z_norm.csv')

### Imbalanced dataset

In [3]:
# Number of rows per target class; shown as the cell output.
loan_status_counts = data['Loan Status'].value_counts()
loan_status_counts

Fully Paid     51276
Charged Off    16214
Name: Loan Status, dtype: int64

### Split data into X and Y

In [12]:
# Work on a copy so the loaded frame `data` stays untouched.
data1 = data.copy()
# Target column first, then every remaining column as the feature matrix.
Y = data1.loc[:, 'Loan Status']
X = data1.drop(columns=['Loan Status'])


### One-hot encoding

In [13]:
# One-hot encode the categorical feature columns; drop_first removes one level
# per feature to avoid a redundant (perfectly collinear) dummy column.
X = pd.get_dummies(X, drop_first=True)

# Encode the target as a 1-D integer Series instead of a single-column frame:
# scikit-learn estimators expect y of shape (n_samples,), and integer dummies
# keep the class labels as 0/1 on pandas versions that default to bool dummies.
# With drop_first=True the first level is dropped, so 1 marks the remaining one.
Y = pd.get_dummies(Y, drop_first=True, dtype=int).iloc[:, 0]

### Split data into train and test sets

In [14]:
# 80/20 hold-out split with a fixed seed; stratifying on Y keeps the class
# ratio the same in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=123,
    stratify=Y,
)


### ---KNN---

In [15]:
# Tune the KNN classifier with an exhaustive grid search on the training set.
knn = KNeighborsClassifier()

# Hyperparameter tuning: 5 neighbour counts x 3 metrics x 5 folds = 75 fits,
# plus one final refit of the best configuration on the whole training set.
# 'minkowski' is not listed: with the default p=2 it is the same distance as
# 'euclidean', so it would only repeat those fits.
params = {
    'n_neighbors': [3, 4, 5, 6, 7],
    'metric': ['euclidean', 'manhattan', 'hamming'],
}

# cv is set explicitly so the number of folds does not depend on the installed
# scikit-learn version's default.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# scikit-learn expects a 1-D target; flatten in case Y_train is a one-column frame.
grid_search = grid_search.fit(X_train, Y_train.values.ravel())

# Predict the hold-out set with the refitted best model.
y_predict = grid_search.best_estimator_.predict(X_test)

In [16]:
## Evaluate the tuned KNN model on the hold-out set
knn_confusion = confusion_matrix(Y_test, y_predict)
knn_report = classification_report(Y_test, y_predict)

print("----------")
print('best parameters: ', grid_search.best_params_)
print("----------")
print(knn_confusion)
print("----------")
print(knn_report)
print("----------")
# best_score_ is the mean cross-validated score of the best grid point,
# not the accuracy on the test set.
print('best accuracy: ', grid_search.best_score_)
print("----------")

----------
best parameters:  {'metric': 'manhattan', 'n_neighbors': 7}
----------
[[1073 2170]
 [ 500 9755]]
----------
              precision    recall  f1-score   support

           0       0.68      0.33      0.45      3243
           1       0.82      0.95      0.88     10255

    accuracy                           0.80     13498
   macro avg       0.75      0.64      0.66     13498
weighted avg       0.79      0.80      0.78     13498

----------
best accuracy:  0.804248777596681
----------


### SVM

In [17]:
## Train an SVM classifier with scikit-learn's default hyper-parameters
svc = SVC()
# scikit-learn expects a 1-D target; flatten in case Y_train is a one-column frame.
svc.fit(X_train, Y_train.values.ravel())
# Predictions on the hold-out set, used by the evaluation cell.
y_predict = svc.predict(X_test)

In [18]:
## Evaluate the SVM model on the hold-out set
print(confusion_matrix(Y_test,y_predict))
print("----------")
print(classification_report(Y_test, y_predict))
print("----------")
# Reuse the predictions already computed instead of svc.score(), which would
# run a second full prediction pass over X_test to produce the same accuracy.
print("Score: ", accuracy_score(Y_test, y_predict))

[[  864  2379]
 [    0 10255]]
----------
              precision    recall  f1-score   support

           0       1.00      0.27      0.42      3243
           1       0.81      1.00      0.90     10255

    accuracy                           0.82     13498
   macro avg       0.91      0.63      0.66     13498
weighted avg       0.86      0.82      0.78     13498

----------
Score:  0.8237516669136168


## Cross Validation (K-Fold)

In [22]:
# 10-fold cross-validation of the SVM on the training set (cross_val_score
# clones the estimator, so the already-fitted `svc` is not modified).
# The target is flattened because scikit-learn expects a 1-D y.
scores2 = cross_val_score(estimator=svc, X=X_train, y=Y_train.values.ravel(), cv=10)

In [27]:
# Rebuild the KNN model from the grid-search result rather than hand-copied
# values, so this cell cannot drift from what the search actually selected.
# NOTE(review): these folds come from the same training data the parameters
# were tuned on, so the scores are somewhat optimistic.
knn = KNeighborsClassifier(**grid_search.best_params_)
# 10-fold cross-validation on the training set; the target is flattened
# because scikit-learn expects a 1-D y.
scores1 = cross_val_score(estimator=knn, X=X_train, y=Y_train.values.ravel(), cv=10)

In [28]:
def _print_cv_summary(label, scores):
    """Print per-fold scores, their mean and standard deviation for one model.

    Parameters
    ----------
    label : str
        Heading printed above the summary.
    scores : numpy.ndarray
        Per-fold scores as returned by ``cross_val_score``.

    Returns
    -------
    list of float
        The per-fold scores rounded to three decimals.
    """
    # Convert to plain floats so the printed list stays readable on NumPy
    # versions whose scalar repr is verbose (e.g. "np.float64(0.813)").
    rounded = [round(float(score), 3) for score in scores]
    print(label)
    print("---------------")
    print("scores: ", rounded)
    print("Mean score: ", round(float(scores.mean()), 3))
    print("standard deviation: ", round(float(scores.std()), 3))
    return rounded


scores1_list = _print_cv_summary("KNN: ", scores1)
print("")
scores2_list = _print_cv_summary("SVC: ", scores2)

KNN: 
---------------
scores:  [0.813, 0.807, 0.808, 0.804, 0.802, 0.806, 0.793, 0.804, 0.806, 0.798]
Mean score:  0.804
standart deviation:  0.005

SVC: 
---------------
scores:  [0.829, 0.819, 0.828, 0.828, 0.818, 0.823, 0.816, 0.823, 0.826, 0.821]
Mean score:  0.823
standart deviation:  0.004
