In [1]:
%matplotlib notebook

from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Pre-processing

In [2]:
# Read the data
df = pd.read_csv('C:/Users/vabalagon/Desktop/Machine Learning Projects/Applied-Machine-Learning-Projects/Customer Churn Prediction/data/processed.csv')

# Get the features and target variable from the dataframe
X = df.drop(['state', 'area_code', 'churn'], axis=1).to_numpy()
y = df['churn'].to_numpy()

# Split into training and testing parts
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, 
                                                    test_size = 0.25, 
                                                    shuffle=True, 
                                                    random_state=42, 
                                                    stratify=y) #, stratify=y_smote
# Apply SMOTE oversampling technique to the training set
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))

Resampled dataset shape Counter({0: 2739, 1: 2739})


# Logistic Regression Model

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

In [4]:
def accuracy_per_class(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)

    for y_i in np.unique(y_test)[::-1]:
        print()

        # Find the indices of y_i in the true labels
        indices_i = np.where(y_test == y_i)

        # Computes the accuracy
        print('class', y_i, 'Accuracy: ', str(round(np.sum(y_test[indices_i] == y_pred[indices_i])/ len(np.where(y_test==y_i)[0]), 3)))

### Plain logistic regression model with L2 regularization

In [5]:
logreg_plain_clf_l2 = LogisticRegression(penalty="l2", 
                                      max_iter=5000, 
                                      class_weight='balanced')


# The task is to find the best regularization parameter. Note that C is the inverse of the regularization parameter. 
# The smaller C is, the stronger the regularization effect
C_range = np.logspace(-5, 5, 11)  # np.abs(np.random.normal(8.5, 3, 10)) 
param_grid = dict(C=C_range) 


# Grid search step
logreg_plain_l2 = GridSearchCV(logreg_plain_clf_l2, param_grid, cv=5, 
                               scoring='balanced_accuracy', error_score="raise", 
                               return_train_score=True, verbose=5)

logreg_plain_l2.fit(X_train, y_train)

# Print the best parameter and the best score
print("\nBest parameter:", logreg_plain_l2.best_params_)
print("Training set cross-validation balanced accuracy score:", logreg_plain_l2.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, logreg_plain_l2.predict(X_test)))

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END .......C=1e-05;, score=(train=0.584, test=0.644) total time=   0.0s
[CV 2/5] END .......C=1e-05;, score=(train=0.590, test=0.629) total time=   0.0s
[CV 3/5] END .......C=1e-05;, score=(train=0.595, test=0.614) total time=   0.0s
[CV 4/5] END .......C=1e-05;, score=(train=0.609, test=0.559) total time=   0.0s
[CV 5/5] END .......C=1e-05;, score=(train=0.616, test=0.550) total time=   0.0s
[CV 1/5] END ......C=0.0001;, score=(train=0.606, test=0.667) total time=   0.0s
[CV 2/5] END ......C=0.0001;, score=(train=0.618, test=0.625) total time=   0.0s
[CV 3/5] END ......C=0.0001;, score=(train=0.607, test=0.639) total time=   0.0s
[CV 4/5] END ......C=0.0001;, score=(train=0.627, test=0.563) total time=   0.0s
[CV 5/5] END ......C=0.0001;, score=(train=0.622, test=0.563) total time=   0.1s
[CV 1/5] END .......C=0.001;, score=(train=0.703, test=0.769) total time=   0.1s
[CV 2/5] END .......C=0.001;, score=(train=0.710

##### Accuracy per class

In [6]:
accuracy_per_class(logreg_plain_l2, X_test, y_test)


class 1 Accuracy:  0.753

class 0 Accuracy:  0.756


### Plain LogReg with SMOTE oversampling

In [7]:
logreg_plain_clf_l2 = LogisticRegression(penalty="l2", 
                                      max_iter=5000, 
                                      class_weight='balanced')


# The task is to find the best regularization parameter. Note that C is the inverse of the regularization parameter. 
# The smaller C is, the stronger the regularization effect
C_range = np.logspace(-5, 5, 11)  # np.abs(np.random.normal(8.5, 3, 10)) 
param_grid = dict(C=C_range) 


# Grid search step
logreg_plain_l2_smote = GridSearchCV(logreg_plain_clf_l2, param_grid, cv=5, 
                               scoring='balanced_accuracy', error_score="raise", 
                               return_train_score=True, verbose=5)

logreg_plain_l2_smote.fit(X_train_smote, y_train_smote)

# Print the best parameter and the best score
print("\nBest parameter:", logreg_plain_l2.best_params_)
print("Training set cross-validation balanced accuracy score:", logreg_plain_l2_smote.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, logreg_plain_l2_smote.predict(X_test)))

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END .......C=1e-05;, score=(train=0.594, test=0.611) total time=   0.0s
[CV 2/5] END .......C=1e-05;, score=(train=0.593, test=0.606) total time=   0.0s
[CV 3/5] END .......C=1e-05;, score=(train=0.597, test=0.585) total time=   0.0s
[CV 4/5] END .......C=1e-05;, score=(train=0.599, test=0.589) total time=   0.0s
[CV 5/5] END .......C=1e-05;, score=(train=0.601, test=0.590) total time=   0.0s
[CV 1/5] END ......C=0.0001;, score=(train=0.636, test=0.661) total time=   0.1s
[CV 2/5] END ......C=0.0001;, score=(train=0.634, test=0.653) total time=   0.1s
[CV 3/5] END ......C=0.0001;, score=(train=0.643, test=0.617) total time=   0.0s
[CV 4/5] END ......C=0.0001;, score=(train=0.645, test=0.611) total time=   0.1s
[CV 5/5] END ......C=0.0001;, score=(train=0.635, test=0.637) total time=   0.1s
[CV 1/5] END .......C=0.001;, score=(train=0.731, test=0.737) total time=   0.4s
[CV 2/5] END .......C=0.001;, score=(train=0.729

In [8]:
accuracy_per_class(logreg_plain_l2_smote, X_test, y_test)


class 1 Accuracy:  0.733

class 0 Accuracy:  0.759


### Logistic Regression with Polynomial features and L2 regularization (no SMOTE)

Steps:
1. Fit the data X to Polynomial Features
2. Split training and testing examples

In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [10]:
# Transform the add polynomial features to the original dataset
poly_log_reg = PolynomialFeatures(2)
X_poly_logreg = poly_log_reg.fit_transform(X.copy())

# Apply standard scaling on the dataset
X_poly_logreg = StandardScaler().fit_transform(X_poly_logreg)

# Test-train split
X_train_poly_logreg, X_test_poly_logreg, y_train_poly_logreg, y_test_poly_logreg = train_test_split(X_poly_logreg, 
                                                                                                    y, 
                                                                                                    test_size = 0.25,
                                                                                                    stratify=y)

# Apply SMOTE oversampling algorithm to the training set
sm = SMOTE(random_state=42)
X_train_smote_poly, y_train_smote_poly = sm.fit_resample(X_train_poly_logreg, y_train_poly_logreg)
print('Resampled dataset shape %s' % Counter(y_train_smote_poly))

Resampled dataset shape Counter({0: 2739, 1: 2739})


##### Model

In [11]:
# Model
logreg_poly_clf = LogisticRegression(penalty="l2", 
                                     max_iter=10000, 
                                     class_weight='balanced') #

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-4, 4, 9) #np.abs(np.random.normal(0.1, .01, 20))
param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
logreg_poly = GridSearchCV(logreg_poly_clf, param_grid, cv=5, 
                           scoring='balanced_accuracy', 
                           return_train_score=True, 
                           verbose=5)
logreg_poly.fit(X_train_poly_logreg, y_train_poly_logreg)

# Print the best parameter and the best score
print("\nBest parameter:", logreg_poly.best_params_)
print("Training set cross-validation balanced accuracy score:", logreg_poly.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test_poly_logreg, logreg_poly.predict(X_test_poly_logreg)))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ......C=0.0001;, score=(train=0.771, test=0.781) total time=   0.0s
[CV 2/5] END ......C=0.0001;, score=(train=0.771, test=0.800) total time=   0.0s
[CV 3/5] END ......C=0.0001;, score=(train=0.766, test=0.751) total time=   0.0s
[CV 4/5] END ......C=0.0001;, score=(train=0.780, test=0.751) total time=   0.0s
[CV 5/5] END ......C=0.0001;, score=(train=0.771, test=0.769) total time=   0.0s
[CV 1/5] END .......C=0.001;, score=(train=0.791, test=0.799) total time=   0.0s
[CV 2/5] END .......C=0.001;, score=(train=0.786, test=0.800) total time=   0.0s
[CV 3/5] END .......C=0.001;, score=(train=0.792, test=0.767) total time=   0.0s
[CV 4/5] END .......C=0.001;, score=(train=0.790, test=0.770) total time=   0.0s
[CV 5/5] END .......C=0.001;, score=(train=0.790, test=0.793) total time=   0.0s
[CV 1/5] END ........C=0.01;, score=(train=0.816, test=0.818) total time=   0.0s
[CV 2/5] END ........C=0.01;, score=(train=0.809,

In [12]:
accuracy_per_class(logreg_poly, X_test_poly_logreg, y_test_poly_logreg)


class 1 Accuracy:  0.853

class 0 Accuracy:  0.844


### LogReg with Polynomial Features with l2 regularization and SMOTE

In [13]:
# Model
logreg_poly_clf = LogisticRegression(penalty="l2", 
                                     max_iter=10000, 
                                     class_weight='balanced') #

# Dictionary of possible parameter values per parameter
C_range = np.logspace(-3, 3, 7) #np.abs(np.random.normal(0.1, .01, 20))
param_grid = dict(C=C_range) # Note that C is the inverse of the regularization parameter

# Grid search step
logreg_poly = GridSearchCV(logreg_poly_clf, param_grid, cv=5, 
                           scoring='balanced_accuracy', 
                           return_train_score=True, 
                           verbose=5)
logreg_poly.fit(X_train_smote_poly, y_train_smote_poly)

# Print the best parameter and the best score
print("\nBest parameter:", logreg_poly.best_params_)
print("Training set cross-validation balanced accuracy score:", logreg_poly.best_score_) 
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test_poly_logreg, logreg_poly.predict(X_test_poly_logreg)))

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END .......C=0.001;, score=(train=0.805, test=0.814) total time=   0.0s
[CV 2/5] END .......C=0.001;, score=(train=0.805, test=0.820) total time=   0.0s
[CV 3/5] END .......C=0.001;, score=(train=0.807, test=0.807) total time=   0.0s
[CV 4/5] END .......C=0.001;, score=(train=0.807, test=0.798) total time=   0.0s
[CV 5/5] END .......C=0.001;, score=(train=0.810, test=0.795) total time=   0.0s
[CV 1/5] END ........C=0.01;, score=(train=0.834, test=0.840) total time=   0.0s
[CV 2/5] END ........C=0.01;, score=(train=0.835, test=0.842) total time=   0.0s
[CV 3/5] END ........C=0.01;, score=(train=0.835, test=0.840) total time=   0.0s
[CV 4/5] END ........C=0.01;, score=(train=0.837, test=0.826) total time=   0.0s
[CV 5/5] END ........C=0.01;, score=(train=0.839, test=0.824) total time=   0.0s
[CV 1/5] END .........C=0.1;, score=(train=0.863, test=0.869) total time=   0.0s
[CV 2/5] END .........C=0.1;, score=(train=0.862,

In [14]:
accuracy_per_class(logreg_poly, X_test_poly_logreg, y_test_poly_logreg)


class 1 Accuracy:  0.82

class 0 Accuracy:  0.876
