In [1]:
%matplotlib notebook

from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Pre-processing

In [2]:
# Read the data
data = pd.read_csv('C:/Users/vabalagon/Desktop/Meta/New Workflow/data/2 data for modeling (With PCA).csv')

# Get the features and target variable from the dataframe
X = data.drop(['Survey ID', 'Response Date', 'Likelihood to Recommend'], axis=1).to_numpy()
y = data['Likelihood to Recommend'].to_numpy()

# Split the data into test and train sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, test_size = 0.25, shuffle=True, random_state=42) #, stratify=y_smote

# Apply SMOTE oversampling to the TRAINING SET ONLY
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 8116, 2: 8116, 1: 8116})


# Logistic Regression Model

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

##### Plain logistic regression model with L2 regularization

In [None]:
logreg_plain_clf = LogisticRegression(penalty="l2", 
                                      max_iter=5000, 
                                      class_weight='balanced')

# Note that C is the inverse of the regularization parameter. The smaller C is, the stronger the regularization effect
C_range = np.logspace(-3, 3, 7) #np.abs(np.random.normal(1, 3, 20))   #np.logspace(-3, 3, 7)  np.abs(np.random.normal(.3, .5, 20))
param_grid = dict(C=C_range) 

# Grid search step
logreg_plain = GridSearchCV(logreg_plain_clf, param_grid, cv=5, scoring='balanced_accuracy', error_score="raise", return_train_score=True, verbose=5)
logreg_plain.fit(X_train_smote, y_train_smote) #roc_auc_score, balanced_accuracy

# Print the best parameter and the best score
print("\nBest parameter:", logreg_plain.best_params_)
print("Training set cross-validation balanced accuracy score:", logreg_plain.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, logreg_plain.predict(X_test)))

##### Accuracy per class

In [None]:
y_pred = logreg_plain.predict(X_test)

for y_i in np.unique(y_test)[::-1]:
    print('class: ', y_i)
    
    # Find the indices of y_i in the true labels
    indices_i = np.where(y_test == y_i)
    
    # Computes the accuracy
    print('Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

##### Logistic Regression with Polynomial features and L2 regularization

Steps:
1. Fit the data X to Polynomial Features
2. Split training and testing examples
3. Apply SMOTE oversampling

In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
# X_poly_logreg_smote = StandardScaler().fit_transform(X_poly_logreg_smote)

In [7]:
# Go back to X first and apply Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly_log_reg = PolynomialFeatures(2)
X_poly_logreg = poly_log_reg.fit_transform(X)

# Apply standard scaling on the dataset
X_poly_logreg = StandardScaler().fit_transform(X_poly_logreg)

# Test-train split
X_train_poly_logreg, X_test_poly_logreg, y_train_poly_logreg, y_test_poly_logreg = train_test_split(X_poly_logreg, 
                                                                                                    y, 
                                                                                                    test_size = 0.25)

# Apply SMOTE oversampling algorithm
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_poly_logreg, y_train_poly_logreg)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({1: 8108, 2: 8108, 0: 8108})


##### Model

In [8]:
logreg_poly_clf = LogisticRegression(penalty="l2", max_iter=3000) #, class_weight='balanced'

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-3, 3, 20) #np.abs(np.random.normal(0.1, .01, 20))
#[0.0004, 0.0005, 0.0006] # np.random.normal(11, .3, 5).tolist() np.logspace(-2, 1, 4).tolist(), [0.02, 0.03, 0.04, 0.05, 0.06]

param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
logreg_poly = GridSearchCV(logreg_poly_clf, param_grid, cv=5, 
                           scoring='balanced_accuracy', 
                           return_train_score=True, 
                           verbose=5)
logreg_poly.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", logreg_poly.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .......C=0.001;, score=(train=0.795, test=0.790) total time=   0.0s
[CV 2/5] END .......C=0.001;, score=(train=0.793, test=0.792) total time=   0.0s
[CV 3/5] END .......C=0.001;, score=(train=0.793, test=0.792) total time=   0.0s
[CV 4/5] END .......C=0.001;, score=(train=0.792, test=0.797) total time=   0.0s
[CV 5/5] END .......C=0.001;, score=(train=0.794, test=0.790) total time=   0.0s
[CV 1/5] END C=0.00206913808111479;, score=(train=0.798, test=0.792) total time=   0.1s
[CV 2/5] END C=0.00206913808111479;, score=(train=0.796, test=0.796) total time=   0.1s
[CV 3/5] END C=0.00206913808111479;, score=(train=0.795, test=0.796) total time=   0.1s
[CV 4/5] END C=0.00206913808111479;, score=(train=0.795, test=0.800) total time=   0.1s
[CV 5/5] END C=0.00206913808111479;, score=(train=0.796, test=0.793) total time=   0.1s
[CV 1/5] END C=0.004281332398719396;, score=(train=0.799, test=0.793) total time=   0.1s
[CV 

[CV 5/5] END C=483.2930238571752;, score=(train=0.801, test=0.799) total time=   1.5s
[CV 1/5] END ......C=1000.0;, score=(train=0.805, test=0.793) total time=   1.5s
[CV 2/5] END ......C=1000.0;, score=(train=0.802, test=0.802) total time=   1.4s
[CV 3/5] END ......C=1000.0;, score=(train=0.802, test=0.801) total time=   1.4s
[CV 4/5] END ......C=1000.0;, score=(train=0.802, test=0.806) total time=   1.6s
[CV 5/5] END ......C=1000.0;, score=(train=0.801, test=0.799) total time=   1.4s

Best parameter: {'C': 0.6951927961775606}


In [9]:
y_pred = logreg_poly.predict(X_test_poly_logreg)

for y_i in np.unique(y_test_poly_logreg)[::-1]:
    print('class: ', y_i)
    
    # Find the indices of y_i in the true labels
    indices_i = np.where(y_test_poly_logreg == y_i)
    
    # Computes the accuracy
    print('Accuracy: ', str(round(np.sum(y_test_poly_logreg[indices_i] == y_pred[indices_i])/ len(np.where(y_test_poly_logreg==y_i)[0]), 3)))

class:  2
Accuracy:  0.879
class:  1
Accuracy:  0.663
class:  0
Accuracy:  0.874


##### Balanced accuracy score

In [10]:
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test_poly_logreg, logreg_poly.predict(X_test_poly_logreg)))

Test set balanced accuracy score: 0.8055270745965964
