In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from scipy.stats import uniform, randint, loguniform, norm

# Pre-processing

In [2]:
from sklearn.model_selection import train_test_split

# Load the pre-processed dataset.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
csv_path = 'C:/Users/vabalagon/Desktop/Machine Learning Projects/Applied-Machine-Learning-Projects/Customer Churn Prediction/data/processed.csv'
df = pd.read_csv(csv_path)

# Features: every column except 'state', 'area_code' and the target 'churn'.
non_feature_columns = ['state', 'area_code', 'churn']
X = df.drop(columns=non_feature_columns).to_numpy()

# Target: the 'churn' column.
y = df['churn'].to_numpy()

# Shuffled 70/30 hold-out split, stratified on the target and seeded so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(),
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [3]:
def accuracy_per_class(classifier, X_test, y_test):
    """Print, for each class in ``y_test``, the share of its samples predicted correctly.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict(X)``.
    X_test : array-like
        Samples passed unchanged to ``classifier.predict``.
    y_test : array-like
        True labels aligned with ``X_test``. Lists and other sequences are
        accepted; they are converted with ``np.asarray``.

    Returns
    -------
    None
        Results are printed, one line per class preceded by a blank line,
        in descending order of the class label.
    """
    # Coerce both label vectors to arrays so that comparison and masking are
    # element-wise regardless of the container type the caller passed in.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(classifier.predict(X_test))

    for y_i in np.unique(y_true)[::-1]:
        print()

        # Samples whose true label is y_i (never empty: y_i comes from y_true)
        is_class = y_true == y_i

        # Fraction of those samples that the classifier also labelled y_i
        class_accuracy = np.mean(y_pred[is_class] == y_i)
        print('class', y_i, 'Accuracy: ', str(round(class_accuracy, 3)))

# XGBoost

Parameter guide: https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

In [4]:
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [5]:
from sklearn.utils import class_weight

# One weight per training sample, computed with scikit-learn's 'balanced' mode.
# NOTE(review): no active code in the visible cells consumes `classes_weights`;
# it only appears in a commented-out `sample_weight` hint further down.
classes_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

In [21]:
# XGBoost binary classifier with hand-set hyperparameters, fitted on the training split.
model_cv = xgb.XGBClassifier(eval_metric='logloss',
                            objective="binary:logistic",
                            use_label_encoder=False,
                            n_estimators=300,
                            max_depth=6,
                            min_child_weight=7,
                            subsample=.6,
                            colsample_bytree=.5,
                            reg_lambda=6, #[0-20]
                            reg_alpha=4,
                            gamma=.45,
                            seed=42,
                            learning_rate=0.3,
                            scale_pos_weight=.9)
model_cv.fit(X_train, y_train)

# Predict once per split and reuse the result for both metrics
y_train_pred = model_cv.predict(X_train)
y_test_pred = model_cv.predict(X_test)

print( )
print('Accuracy')
print("Train accuracy score: ", accuracy_score(y_train, y_train_pred))
print("Test accuracy score: ", accuracy_score(y_test, y_test_pred))
print( )
print('Balanced Accuracy')
print("Train balanced accuracy score: ", balanced_accuracy_score(y_train, y_train_pred))
# Label fixed: this line reports balanced accuracy, not plain accuracy
print("Test balanced accuracy score: ", balanced_accuracy_score(y_test, y_test_pred))


Accuracy
Train accuracy score:  0.9774789915966386
Test accuracy score:  0.9733333333333334

Balanced Accuracy
Train balanced accuracy score:  0.9240385297731764
Test accuracy score:  0.9190504832198345


##### Accuracy per class

In [22]:
# Print the accuracy of model_cv on the test split, one line per class
accuracy_per_class(model_cv, X_test, y_test)


class 1 Accuracy:  0.844

class 0 Accuracy:  0.995


# RandomizedSearchCV to find the best parameters

In [None]:
# NOTE(review): the randomized hyper-parameter search below is entirely commented
# out, so nothing in this cell executes. Presumably it was disabled because of its
# cost (n_iter=200 candidates x cv=5 folds) — confirm before re-enabling.

# model_test_clf = xgb.XGBClassifier(eval_metric='logloss',
#                             objective="binary:logistic",
#                             use_label_encoder=False)

# params = {
#         "n_estimators": [100, 300, 500],
#         "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
#         "max_depth" : [2, 3, 4, 5, 6, 8, 10],
#         "min_child_weight" : [1, 3, 5, 7],
#         "gamma": [0.0, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9],
#         "colsample_bytree" : [0.3, 0.4, 0.5, 0.7],
#         "reg_lambda": [1,2,3,4,5,6,7,8,9,10],
#         "reg_alpha": [1,2,3,4,5,6,7,8,9,10]
#         }

# xgb_model = RandomizedSearchCV(model_test_clf,
#                                 param_distributions=params,
#                                 n_iter=200,
#                                 scoring='balanced_accuracy',
#                                 n_jobs=-1,
#                                 cv=5,
#                                 verbose=5)

# xgb_model.fit(X_train, y_train) #, sample_weight=classes_weights

# print("\nBest parameter:", xgb_model.best_params_)
# print("Training set cross-validation balanced accuracy score:", xgb_model.best_score_) 
# print("Test set balanced accuracy score:", balanced_accuracy_score(y_test, xgb_model.predict(X_test)))