In [None]:
%matplotlib notebook

from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Pre-processing

In [None]:
from sklearn.model_selection import train_test_split

# Load the dataset from processed.csv
df = pd.read_csv('C:/Users/vabalagon/Desktop/Machine Learning Projects/Applied-Machine-Learning-Projects/Customer Churn Prediction/data/processed.csv')

# Feature matrix: every column except 'state', 'area_code' and the target 'churn'
X = df.drop(columns=['state', 'area_code', 'churn']).to_numpy()

# Target vector
y = df['churn'].to_numpy()

# Hold out 25% of the rows for testing; the split is shuffled, seeded and
# stratified on the target so both parts keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(), y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=y,
)

# Oversample with SMOTE on the training part only (the test part is untouched)
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# Report the class counts after resampling
print('Resampled dataset shape %s' % Counter(y_train_smote))

In [None]:
def accuracy_per_class(classifier, X_test, y_test):
    """Print the accuracy of ``classifier`` separately for each class in ``y_test``.

    For every distinct true label (visited in descending order) a blank line
    is printed, followed by the fraction of the samples carrying that true
    label that were predicted correctly, rounded to three decimals.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict(X_test)``.
    X_test : array-like
        Feature matrix, passed unchanged to ``classifier.predict``.
    y_test : array-like
        True labels, aligned position-by-position with the predictions.
        Lists, pandas Series and NumPy arrays are all accepted.

    Returns
    -------
    None
        The per-class accuracies are only printed.
    """
    # Coerce to NumPy arrays so that boolean-mask indexing below works for
    # lists and pandas Series as well (positional, not label-based, indexing).
    y_true = np.asarray(y_test)
    y_pred = np.asarray(classifier.predict(X_test))

    for y_i in np.unique(y_true)[::-1]:
        print()

        # Samples whose true label is y_i; never empty because y_i comes
        # from np.unique(y_true), so the division below is safe.
        is_class_i = y_true == y_i

        # Share of those samples that were predicted correctly
        n_correct = np.sum(y_true[is_class_i] == y_pred[is_class_i])
        accuracy = n_correct / np.count_nonzero(is_class_i)

        print('class', y_i, 'Accuracy: ', str(round(accuracy, 3)))

# Random Forest without SMOTE oversampling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# --- Hyper-parameter search space -------------------------------------------

# Number of trees in the forest: 10 evenly spaced values from 2 to 2000
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt']

# Maximum number of levels in a tree: 1..19, plus None (no depth limit)
max_depth = list(range(1, 20)) + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# --- Randomized search --------------------------------------------------------

# Random Forest Classifier. The forest is seeded so that, together with the
# seeded search below, re-running this cell reproduces the same scores and
# the same best parameters.
rf = RandomForestClassifier(class_weight="balanced_subsample", random_state=42)

# Random search of parameters, using 3 fold cross validation,
# search across 200 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 200,
                               cv = 3,
                               verbose=3,
                               random_state=0,
                               n_jobs = -1,
                               scoring='balanced_accuracy',
                               return_train_score=True)

# Fit the random search model on the original (not oversampled) training set
rf_random.fit(X_train, y_train)

# Print the best parameter and the best score
print("\nBest parameter:", rf_random.best_params_)
print("Training set cross-validation balanced accuracy score:", rf_random.best_score_)
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, rf_random.predict(X_test)))

In [None]:
# Display the hyper-parameter combination selected by the randomized search
rf_random.best_params_

# Best parameters

* n_estimators: 668,
* min_samples_split: 2,
* min_samples_leaf: 1,
* max_features: 'sqrt',
* max_depth: None,
* bootstrap: True

In [None]:
# Final random forest, trained on the original (not oversampled) training set.
# The forest is seeded so the scores and the feature importances derived from
# this model are reproducible across runs.
random_forest_clf = RandomForestClassifier(n_estimators= 668,
                                            min_samples_split= 2,
                                            min_samples_leaf= 1,
                                            max_features= 'sqrt',
                                            max_depth= None,
                                            bootstrap= False,
                                            class_weight="balanced_subsample",
                                            random_state=42)
random_forest_clf.fit(X_train, y_train)

# Predict once per data set and reuse the result for both metrics
# (each predict call has to evaluate all 668 trees).
y_train_pred = random_forest_clf.predict(X_train)
y_test_pred = random_forest_clf.predict(X_test)

print("Accuracy")
print("Training set accuracy score:", accuracy_score(y_train, y_train_pred))
print("Test set accuracy score:", accuracy_score(y_test, y_test_pred))

print( )
print("Balanced Accuracy")
print("Training set balanced accuracy score:", balanced_accuracy_score(y_train, y_train_pred))
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, y_test_pred))

##### Accuracy per class

In [None]:
# Print the held-out test accuracy of the fitted forest separately for each class
accuracy_per_class(random_forest_clf, X_test, y_test)

# Feature importance

In [None]:
# Importance score of every feature, as reported by the fitted forest
importances = random_forest_clf.feature_importances_

# Positions of the features, ordered from most to least important
indices = np.argsort(importances)[::-1]

# Feature names in that same order (same column selection as used to build X)
_feature_columns = df.drop(columns=['state', 'area_code', 'churn']).columns
cols_feature_importance = _feature_columns[indices].to_numpy()
print(cols_feature_importance)

# Horizontal bar chart; both sequences are reversed so the bars are passed
# to barh from least to most important
plt.figure(figsize=(8,5))
plt.barh(cols_feature_importance[::-1], importances[indices][::-1])
plt.title('Feature Importance, Random Forest Model')
plt.tight_layout()

# Random forest classifier with SMOTE

In [None]:
# Sampler-aware pipeline from imbalanced-learn (already a dependency of this
# notebook through SMOTE); imported here because only this cell needs it.
from imblearn.pipeline import Pipeline as ImbPipeline

# --- Hyper-parameter search space -------------------------------------------

# Number of trees in the forest: 10 evenly spaced values from 2 to 2000
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt']

# Maximum number of levels in a tree: 1..19, plus None (no depth limit)
max_depth = list(range(1, 20)) + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# --- Randomized search with SMOTE inside cross-validation ---------------------

# Random Forest Classifier (seeded so the search is reproducible)
rf = RandomForestClassifier(class_weight="balanced_subsample", random_state=42)

# Oversampling must happen inside each cross-validation split. Fitting the
# search on an already oversampled training set lets synthetic points that
# were interpolated from validation-fold samples end up in the training
# folds, which inflates the cross-validation score. Wrapping SMOTE and the
# forest in an imbalanced-learn pipeline resamples only the training part of
# every split; the sampler is skipped when predicting.
smote_rf = ImbPipeline(steps=[('smote', SMOTE(random_state=42)),
                              ('rf', rf)])

# Same search space as random_grid, addressed to the 'rf' step of the pipeline.
# NOTE: keys of rf_random_smote.best_params_ therefore carry the 'rf__' prefix.
smote_random_grid = {'rf__' + name: values for name, values in random_grid.items()}

# Random search of parameters, using 3 fold cross validation,
# search across 200 different combinations, and use all available cores
rf_random_smote = RandomizedSearchCV(estimator = smote_rf,
                               param_distributions = smote_random_grid,
                               n_iter = 200,
                               cv = 3,
                               verbose=3,
                               random_state=0,
                               n_jobs = -1,
                               scoring='balanced_accuracy',
                               return_train_score=True)

# Fit the random search model on the original training set; SMOTE is applied
# by the pipeline to the training folds only
rf_random_smote.fit(X_train, y_train)

# Print the best parameter and the best score
print("\nBest parameter:", rf_random_smote.best_params_)
print("Training set cross-validation balanced accuracy score:", rf_random_smote.best_score_)
print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, rf_random_smote.predict(X_test)))