In [1]:
%matplotlib notebook

import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split

# Process the data

In [2]:
# Location of the processed churn dataset. Set the CHURN_DATA_PATH environment
# variable to run the notebook on another machine; the original location is
# kept as the fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'CHURN_DATA_PATH',
    'C:/Users/vabalagon/Desktop/Machine Learning Projects/Applied-Machine-Learning-Projects/Customer Churn Prediction/data/processed.csv',
)

# Read the data
df = pd.read_csv(DATA_PATH)

# Get the features and target variable from the dataframe
X = df.drop(['state', 'area_code', 'churn'], axis=1).to_numpy()
y = df['churn'].to_numpy()

# Split into training and testing parts. The split is stratified on y and
# seeded, so re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42,
                                                    stratify=y)

# Apply SMOTE oversampling technique to the training set only; the test set
# is left untouched so it keeps the original class distribution.
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 2739, 1: 2739})


In [3]:
def accuracy_per_class(classifier, X_test, y_test):
    """Print, for every class in ``y_test``, the accuracy reached on that class.

    For each distinct label the function looks only at the samples whose true
    label is that class and reports the fraction of them that ``classifier``
    predicted correctly, rounded to three decimals.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict(X)``.
    X_test : array-like
        Samples handed to ``classifier.predict``.
    y_test : numpy.ndarray
        True labels, aligned with the predictions.

    Returns
    -------
    None
        Results are printed, one blank line followed by one line per class,
        visiting the labels in descending order.
    """
    predictions = classifier.predict(X_test)

    # np.unique returns the labels sorted ascending; reverse to go high -> low
    for label in np.unique(y_test)[::-1]:
        print()

        # Boolean mask selecting the samples that truly belong to this class
        is_label = y_test == label

        # Correct predictions among those samples, and how many there are
        n_correct = np.sum(y_test[is_label] == predictions[is_label])
        n_members = np.count_nonzero(is_label)

        print('class', label, 'Accuracy: ', str(round(n_correct / n_members, 3)))

# kNN without SMOTE

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

In [5]:
# Base estimator whose number of neighbours is tuned below
kNN_clf = KNeighborsClassifier()

# Candidate values for the hyper-parameter: k = 1, 2, ..., 19
k_range = list(range(1, 20))
param_grid = {'n_neighbors': k_range}

# 5-fold cross-validated grid search, ranking candidates by balanced accuracy
kNN_model = GridSearchCV(
    estimator=kNN_clf,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    return_train_score=False,
    verbose=5,
)
kNN_model.fit(X_train, y_train)

# Report the selected k, its mean cross-validation score (average over the
# folds for that single parameter setting) and the score on the test split
print("\nBest parameter:", kNN_model.best_params_)
print("Training set cross-validation balanced accuracy score:", kNN_model.best_score_)
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, kNN_model.predict(X_test)))

Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.656 total time=   0.0s
[CV 2/5] END .....................n_neighbors=1;, score=0.641 total time=   0.0s
[CV 3/5] END .....................n_neighbors=1;, score=0.664 total time=   0.0s
[CV 4/5] END .....................n_neighbors=1;, score=0.616 total time=   0.0s
[CV 5/5] END .....................n_neighbors=1;, score=0.578 total time=   0.0s
[CV 1/5] END .....................n_neighbors=2;, score=0.608 total time=   0.0s
[CV 2/5] END .....................n_neighbors=2;, score=0.598 total time=   0.0s
[CV 3/5] END .....................n_neighbors=2;, score=0.608 total time=   0.0s
[CV 4/5] END .....................n_neighbors=2;, score=0.596 total time=   0.0s
[CV 5/5] END .....................n_neighbors=2;, score=0.558 total time=   0.0s
[CV 1/5] END .....................n_neighbors=3;, score=0.629 total time=   0.0s
[CV 2/5] END .....................n_neighbors=3;

##### Accuracy per class

In [6]:
# Per-class accuracy on the test split for the grid-searched kNN model that was fitted on the original (non-SMOTE) training data
accuracy_per_class(kNN_model, X_test, y_test)


class 1 Accuracy:  0.313

class 0 Accuracy:  0.917


# kNN with SMOTE oversampling

In [8]:
# Create a kNN instance
kNN_clf = KNeighborsClassifier() 

# Dictionary of possible parameter values per parameter
k_range = list(range(1, 20))
param_grid = dict(n_neighbors=k_range) 

# Grid search step
kNN_model_smote = GridSearchCV(kNN_clf, param_grid, cv=5, scoring='balanced_accuracy', return_train_score=False, verbose=5)
kNN_model_smote.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", kNN_model_smote.best_params_)
print("Training set cross-validation balanced accuracy score:", kNN_model_smote.best_score_) # average of all cv folds for a single combination of the parameters we specify
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, kNN_model_smote.predict(X_test)))

Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.883 total time=   0.0s
[CV 2/5] END .....................n_neighbors=1;, score=0.880 total time=   0.0s
[CV 3/5] END .....................n_neighbors=1;, score=0.885 total time=   0.0s
[CV 4/5] END .....................n_neighbors=1;, score=0.859 total time=   0.0s
[CV 5/5] END .....................n_neighbors=1;, score=0.879 total time=   0.0s
[CV 1/5] END .....................n_neighbors=2;, score=0.887 total time=   0.0s
[CV 2/5] END .....................n_neighbors=2;, score=0.887 total time=   0.0s
[CV 3/5] END .....................n_neighbors=2;, score=0.886 total time=   0.0s
[CV 4/5] END .....................n_neighbors=2;, score=0.859 total time=   0.0s
[CV 5/5] END .....................n_neighbors=2;, score=0.882 total time=   0.0s
[CV 1/5] END .....................n_neighbors=3;, score=0.851 total time=   0.0s
[CV 2/5] END .....................n_neighbors=3;