In [5]:
%matplotlib notebook

import scipy.stats
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from collections import Counter

import matplotlib.pyplot as plt

# Process the data

In [6]:
# Read the data
data = pd.read_csv('C:/Users/vabalagon/Desktop/Meta/New Workflow/data/2 data for modeling (With PCA).csv')

# Get the features and target variable from the dataframe
X = data.drop(['Survey ID', 'Response Date', 'Likelihood to Recommend'], axis=1).to_numpy()
y = data['Likelihood to Recommend'].to_numpy()

# Split the data into test and train sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, test_size = 0.25, shuffle=True, random_state=42) #, stratify=y_smote

# Apply SMOTE oversampling to the TRAINING SET ONLY
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 8116, 2: 8116, 1: 8116})


# kNN Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

In [8]:
# Create a kNN instance
kNN_clf = KNeighborsClassifier() 

# Dictionary of possible parameter values per parameter
k_range = list(range(1, 20))
param_grid = dict(n_neighbors=k_range) 

# Grid search step
kNN_model = GridSearchCV(kNN_clf, param_grid, cv=5, scoring='balanced_accuracy', return_train_score=False, verbose=5)
kNN_model.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", kNN_model.best_params_)
print("Training set cross-validation balanced accuracy score:", kNN_model.best_score_) # average of all cv folds for a single combination of the parameters we specify
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, kNN_model.predict(X_test)))

Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.780 total time=   0.0s
[CV 2/5] END .....................n_neighbors=1;, score=0.770 total time=   0.1s
[CV 3/5] END .....................n_neighbors=1;, score=0.771 total time=   0.1s
[CV 4/5] END .....................n_neighbors=1;, score=0.794 total time=   0.1s
[CV 5/5] END .....................n_neighbors=1;, score=0.783 total time=   0.1s
[CV 1/5] END .....................n_neighbors=2;, score=0.739 total time=   0.1s
[CV 2/5] END .....................n_neighbors=2;, score=0.730 total time=   0.1s
[CV 3/5] END .....................n_neighbors=2;, score=0.736 total time=   0.1s
[CV 4/5] END .....................n_neighbors=2;, score=0.763 total time=   0.1s
[CV 5/5] END .....................n_neighbors=2;, score=0.747 total time=   0.1s
[CV 1/5] END .....................n_neighbors=3;, score=0.728 total time=   0.1s
[CV 2/5] END .....................n_neighbors=3;

##### Accuracy per class

In [9]:
y_pred = kNN_model.predict(X_test)

for y_i in np.unique(y_test)[::-1]:
    print('class: ', y_i)
    
    # Find the indices of y_i in the true labels
    indices_i = np.where(y_test == y_i)
    
    # Computes the accuracy
    print('Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

class:  2
Accuracy:  0.68
class:  1
Accuracy:  0.31
class:  0
Accuracy:  0.588
