In [1]:
%matplotlib notebook

import scipy.stats
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from collections import Counter

import matplotlib.pyplot as plt

# Pre-processing

In [2]:
# Read the data
data = pd.read_csv('C:/Users/vabalagon/Desktop/Meta/New Workflow/data/2 data for modeling (With PCA).csv')

# Get the features and target variable from the dataframe
X = data.drop(['Survey ID', 'Response Date', 'Likelihood to Recommend'], axis=1).to_numpy()
y = data['Likelihood to Recommend'].to_numpy()

# Split the data into test and train sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, test_size = 0.25, shuffle=True, random_state=42) #, stratify=y_smote

# Apply SMOTE oversampling to the TRAINING SET ONLY
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 8116, 2: 8116, 1: 8116})


# SVM Model

In [3]:
from sklearn.svm import LinearSVC # much faster than SVC(kernel='linear') for linear SVM
from sklearn.svm import SVC # for polynomial and RBF kernel SVM
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

### Linear SVM

In [8]:
svm_linear_clf = LinearSVC(penalty='l2', max_iter=10000, dual=False, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.abs(np.random.normal(13, 1, 20)) #np.logspace(-5,5,11)  np.abs(np.random.normal(0.1, .01, 20))
param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
svm_linear = GridSearchCV(svm_linear_clf, param_grid, cv=3, scoring='balanced_accuracy', return_train_score=True, verbose=3)
svm_linear.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", svm_linear.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_linear.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_linear.predict(X_test)))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END C=13.793518095980012;, score=(train=0.737, test=0.722) total time=   0.0s
[CV 2/3] END C=13.793518095980012;, score=(train=0.726, test=0.733) total time=   0.0s
[CV 3/3] END C=13.793518095980012;, score=(train=0.740, test=0.745) total time=   0.0s
[CV 1/3] END C=13.924460430182977;, score=(train=0.712, test=0.701) total time=   0.0s
[CV 2/3] END C=13.924460430182977;, score=(train=0.726, test=0.733) total time=   0.0s
[CV 3/3] END C=13.924460430182977;, score=(train=0.740, test=0.745) total time=   0.0s
[CV 1/3] END C=12.301351691273194;, score=(train=0.732, test=0.720) total time=   0.0s
[CV 2/3] END C=12.301351691273194;, score=(train=0.726, test=0.733) total time=   0.0s
[CV 3/3] END C=12.301351691273194;, score=(train=0.740, test=0.745) total time=   0.0s
[CV 1/3] END C=12.909604061836536;, score=(train=0.707, test=0.695) total time=   0.0s
[CV 2/3] END C=12.909604061836536;, score=(train=0.726, test=0.733) t

##### Accuracy score per class

In [9]:
y_pred = svm_linear.predict(X_test)

for y_i in np.unique(y_test)[::-1]:
    print('class: ', y_i)
    
    # Find the indices of y_i in the true labels
    indices_i = np.where(y_test == y_i)
    
    # Computes the accuracy
    print('Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

class:  2
Accuracy:  0.905
class:  1
Accuracy:  0.367
class:  0
Accuracy:  0.867


### Polynomial SVM

In [11]:
svm_poly_clf2 = SVC(kernel='poly', degree=2, class_weight='balanced')

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-5,5,11).tolist() #np.abs(np.random.normal(0.1, .01, 20)) # np.logspace(-3,3,7).tolist() #[0.02, 0.03, 0.04] # [0.005]
param_grid = dict(C=C_range) #degree=degree_range, 

# Grid search step
svm_poly2 = GridSearchCV(svm_poly_clf2, param_grid, cv=3, scoring='balanced_accuracy', return_train_score=True, verbose=3, n_jobs=-1)
svm_poly2.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", svm_poly2.best_params_)
print("Training set cross-validation balanced accuracy score:", svm_poly2.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, svm_poly2.predict(X_test)))

Fitting 3 folds for each of 11 candidates, totalling 33 fits


KeyboardInterrupt: 

##### Accuracy score per class

In [None]:
y_pred = svm_poly2.predict(X_test)

for y_i in np.unique(y_test)[::-1]:
    print('class: ', y_i)
    
    # Find the indices of y_i in the true labels
    indices_i = np.where(y_test == y_i)
    
    # Computes the accuracy
    print('Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

### RBF Kernel