In [1]:
%matplotlib notebook

import scipy.stats
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from collections import Counter

import matplotlib.pyplot as plt

# Pre-processing

In [2]:
def accuracy_per_class(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)

    for y_i in np.unique(y_test)[::-1]:
        print()

        # Find the indices of y_i in the true labels
        indices_i = np.where(y_test == y_i)

        # Computes the accuracy
        print('class', y_i, 'Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

In [3]:
# Read the data
df = pd.read_csv('C:/Users/vabalagon/Desktop/Machine Learning Projects/Applied-Machine-Learning-Projects/Customer Churn Prediction/data/processed.csv')

# Get the features and target variable from the dataframe
X = df.drop(['state', 'area_code', 'churn'], axis=1).to_numpy()
y = df['churn'].to_numpy()

# Split into training and testing parts
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, 
                                                    test_size = 0.25, 
                                                    shuffle=True, 
                                                    random_state=42, 
                                                    stratify=y) #, stratify=y_smote
# Apply SMOTE oversampling technique to the training set
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 2739, 1: 2739})


# SVM Model

In [4]:
from sklearn.svm import LinearSVC # much faster than SVC(kernel='linear') for linear SVM
from sklearn.svm import SVC # for polynomial and RBF kernel SVM
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

### Linear SVM

In [5]:
svm_linear_clf = LinearSVC(penalty='l2', max_iter=10000, dual=False, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.abs(np.random.normal(1, 5, 20))   # np.logspace(-5,5,11)
param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
svm_linear = GridSearchCV(svm_linear_clf, param_grid, cv=3, scoring='balanced_accuracy', return_train_score=True, verbose=3)
svm_linear.fit(X_train, y_train)

# Print the best parameter and the best score
print("\nBest parameter:", svm_linear.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_linear.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_linear.predict(X_test)))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END C=4.518992789436781;, score=(train=0.772, test=0.796) total time=   0.0s
[CV 2/3] END C=4.518992789436781;, score=(train=0.790, test=0.770) total time=   0.0s
[CV 3/3] END C=4.518992789436781;, score=(train=0.794, test=0.772) total time=   0.0s
[CV 1/3] END C=1.6169126140397219;, score=(train=0.771, test=0.796) total time=   0.0s
[CV 2/3] END C=1.6169126140397219;, score=(train=0.790, test=0.770) total time=   0.0s
[CV 3/3] END C=1.6169126140397219;, score=(train=0.796, test=0.771) total time=   0.0s
[CV 1/3] END C=0.9644139702449539;, score=(train=0.769, test=0.796) total time=   0.0s
[CV 2/3] END C=0.9644139702449539;, score=(train=0.790, test=0.770) total time=   0.0s
[CV 3/3] END C=0.9644139702449539;, score=(train=0.800, test=0.772) total time=   0.0s
[CV 1/3] END C=8.140031480052066;, score=(train=0.770, test=0.796) total time=   0.0s
[CV 2/3] END C=8.140031480052066;, score=(train=0.790, test=0.770) total 

##### Accuracy score per class

In [6]:
accuracy_per_class(svm_linear, X_test, y_test)


class 1 Accuracy:  0.747

class 0 Accuracy:  0.756


### Linear SVM with SMOTE oversampling

In [7]:
svm_linear_clf = LinearSVC(penalty='l2', max_iter=10000, dual=False, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.abs(np.random.normal(1, 5, 20))   # np.logspace(-5,5,11)
param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
svm_linear_smote = GridSearchCV(svm_linear_clf, 
                                param_grid, 
                                cv=3, 
                                scoring='balanced_accuracy', 
                                return_train_score=True, 
                                verbose=3)
svm_linear_smote.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", svm_linear.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_linear_smote.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_linear.predict(X_test)))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END C=3.5985782995558218;, score=(train=0.799, test=0.791) total time=   0.0s
[CV 2/3] END C=3.5985782995558218;, score=(train=0.796, test=0.802) total time=   0.0s
[CV 3/3] END C=3.5985782995558218;, score=(train=0.799, test=0.803) total time=   0.0s
[CV 1/3] END C=6.5567760223995;, score=(train=0.800, test=0.791) total time=   0.0s
[CV 2/3] END C=6.5567760223995;, score=(train=0.795, test=0.802) total time=   0.0s
[CV 3/3] END C=6.5567760223995;, score=(train=0.799, test=0.803) total time=   0.0s
[CV 1/3] END C=9.39000624199762;, score=(train=0.799, test=0.791) total time=   0.0s
[CV 2/3] END C=9.39000624199762;, score=(train=0.795, test=0.803) total time=   0.0s
[CV 3/3] END C=9.39000624199762;, score=(train=0.799, test=0.803) total time=   0.0s
[CV 1/3] END C=3.820504901370491;, score=(train=0.798, test=0.791) total time=   0.0s
[CV 2/3] END C=3.820504901370491;, score=(train=0.795, test=0.802) total time=   0.0s

In [8]:
accuracy_per_class(svm_linear_smote, X_test, y_test)


class 1 Accuracy:  0.74

class 0 Accuracy:  0.76


### Polynomial SVM without SMOTE

In [10]:
svm_poly_clf2 = SVC(kernel='poly', degree=2, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-4,4,9).tolist() #np.abs(np.random.normal(0.1, .01, 20)) # np.logspace(-3,3,7).tolist() #[0.02, 0.03, 0.04] # [0.005]
param_grid = dict(C=C_range) #degree=degree_range, 

# Grid search step
svm_poly2 = GridSearchCV(svm_poly_clf2, 
                         param_grid, 
                         cv=3, 
                         scoring='balanced_accuracy', 
                         return_train_score=True, 
                         verbose=3, 
                         n_jobs=-1)
svm_poly2.fit(X_train, y_train)

# Print the best parameter and the best score
print("\nBest parameter:", svm_poly2.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_poly2.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_poly2.predict(X_test)))

Fitting 3 folds for each of 9 candidates, totalling 27 fits

Best parameter: {'C': 10000.0}
Training set cross-validation balanced accuracy score: 0.8100647699604601
Test set balanced accuracy score: 0.7958525009127418


##### Accuracy score per class

In [11]:
accuracy_per_class(svm_poly2, X_test, y_test)


class 1 Accuracy:  0.787

class 0 Accuracy:  0.805


### Polynomial SVM with SMOTE oversampling

In [12]:
svm_poly_clf2 = SVC(kernel='poly', degree=2, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-4,4,9).tolist() #np.abs(np.random.normal(0.1, .01, 20)) # np.logspace(-3,3,7).tolist() #[0.02, 0.03, 0.04] # [0.005]
param_grid = dict(C=C_range) #degree=degree_range, 

# Grid search step
svm_poly2_smote = GridSearchCV(svm_poly_clf2, 
                         param_grid, 
                         cv=3, 
                         scoring='balanced_accuracy', 
                         return_train_score=True, 
                         verbose=3, 
                         n_jobs=-1)
svm_poly2_smote.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", svm_poly2_smote.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_poly2_smote.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_poly2_smote.predict(X_test)))

Fitting 3 folds for each of 9 candidates, totalling 27 fits

Best parameter: {'C': 10000.0}
Training set cross-validation balanced accuracy score: 0.8346111719605696
Test set balanced accuracy score: 0.7853048557867836


##### Accuracy per class

In [20]:
accuracy_per_class(svm_poly2_smote, X_test, y_test)


class 1 Accuracy:  0.767

class 0 Accuracy:  0.804


### RBF Kernel without SMOTE oversampling

In [14]:
from scipy.stats import randint

In [19]:
svm_RBF = SVC(kernel='rbf', class_weight='balanced')

params = {
    "C": np.logspace(-5,5,11).tolist(), # default 100
    "gamma": [10**-8, 10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2],
}

svm_RBF_model = GridSearchCV(svm_RBF, 
                            param_grid=params, 
                            cv=3, 
                            verbose=3, 
                            n_jobs=-1, 
                            return_train_score=True,
                            scoring='balanced_accuracy')

svm_RBF_model.fit(X_train, y_train)
 
print("\nBest parameter:", svm_RBF_model.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_RBF_model.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_RBF_model.predict(X_test)))

Fitting 3 folds for each of 77 candidates, totalling 231 fits

Best parameter: {'C': 100000.0, 'gamma': 1e-06}
Training set cross-validation balanced accuracy score: 0.8503631119964911
Test set balanced accuracy score: 0.8415443592552025


In [21]:
accuracy_per_class(svm_RBF_model, X_test, y_test)


class 1 Accuracy:  0.82

class 0 Accuracy:  0.863


### RBF Kernel with SMOTE

In [22]:
svm_RBF_clf = SVC(kernel='rbf', class_weight='balanced')

params = {
    "C": np.logspace(-5,5,11).tolist(), # default 100
    "gamma": [10**-8, 10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2],
}

svm_RBF_model_smote = GridSearchCV(svm_RBF_clf, 
                            param_grid=params, 
                            cv=3, 
                            verbose=3, 
                            n_jobs=-1, 
                            return_train_score=True,
                            scoring='balanced_accuracy')

svm_RBF_model_smote.fit(X_train_smote, y_train_smote)
 
print("\nBest parameter:", svm_RBF_model_smote.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_RBF_model_smote.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_RBF_model_smote.predict(X_test)))

Fitting 3 folds for each of 77 candidates, totalling 231 fits

Best parameter: {'C': 10000.0, 'gamma': 0.0001}
Training set cross-validation balanced accuracy score: 0.916575392479007
Test set balanced accuracy score: 0.7961883899233297


##### Accuracy per class

In [23]:
accuracy_per_class(svm_RBF_model_smote, X_test, y_test)


class 1 Accuracy:  0.68

class 0 Accuracy:  0.912
