## IMPORTING THE LIBRARIES AND MODELS

In [None]:
# local modules
import model_helper
import preprocess

# external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


## LOADING THE DATASET

In [None]:
def load_telecom_customer_data(filename):
    """Load the telecom customer churn dataset from a CSV file.

    Args:
        filename: Path to the CSV file to read.

    Returns:
        A pandas DataFrame holding the contents of the file.

    Raises:
        FileNotFoundError: If ``filename`` does not refer to an existing file.
    """
    # Fail loudly here: returning an error string would only surface later as
    # a confusing AttributeError when DataFrame methods are called on it.
    if not os.path.isfile(filename):
        raise FileNotFoundError(
            "Invalid file name, make sure the filename is correct and is in the same package"
        )
    return pd.read_csv(filename)

In [None]:
# Read the churn CSV; the relative path is resolved against the current working directory.
TelecomCustomerData = load_telecom_customer_data('TelcoCustomerChurn.csv')

In [None]:
# Preview the first rows of the loaded dataset.
TelecomCustomerData.head()

In [None]:
# Preview the last rows of the loaded dataset.
TelecomCustomerData.tail()

## EXPLORE AND VISUALISE THE DATASET

In [None]:
# Print a summary of the columns, their dtypes and non-null counts.
TelecomCustomerData.info()

In [None]:
# Pairwise relationship plots for the raw dataset, coloured by the 'Churn' column.
sns.pairplot(data =TelecomCustomerData, hue="Churn")

In [None]:
# NOTE(review): no x/y columns are given, so seaborn treats the frame as
# wide-form data; consider naming explicit x and y columns for a more
# informative plot.
sns.scatterplot(data=TelecomCustomerData)

#### Exploring and visualising the training data set

In [None]:
# Produce train / validate / test copies of the raw data for exploration,
# using the project's preprocess helper.
train_copy, validate_copy, test_copy = preprocess.process_unencoded_data(data=TelecomCustomerData)

In [None]:
# Display the exploration copy of the training split.
train_copy

In [None]:
# Apply seaborn's default theme to the plots that follow.
sns.set_theme()

In [None]:
# Pairwise relationship plots for the training split, coloured by 'Churn'.
sns.pairplot(data=train_copy, hue='Churn')
plt.show()

In [None]:
# Box plot of monthly charges grouped by churn outcome (training split).
sns.boxplot(
    data=train_copy,
    x='Churn',
    y='MonthlyCharges',
    color='lightblue',
).set_title("Comparing monthly charges of customers that churn and those that do not churn")
plt.show()

In [None]:
# Horizontal box plot: total charges on the x axis, churn outcome on the y axis.
sns.boxplot(
    data=train_copy,
    x='TotalCharges',
    y='Churn',
    color='green',
).set_title("Comparing Total charges of customers that churn and those that do not churn")
plt.show()

In [None]:
# Boxen (letter-value) plot of total charges grouped by churn outcome.
sns.boxenplot(
    data=train_copy,
    x='Churn',
    y='TotalCharges',
).set_title("Comparing Total charges of customers that churn and those that do not churn")
plt.show()

In [None]:
# Boxen (letter-value) plot of monthly charges grouped by churn outcome.
sns.boxenplot(
    data=train_copy,
    x='Churn',
    y='MonthlyCharges',
).set_title("Comparing monthly charges of customers that churn and those that do not churn")
plt.show()

## PREPROCESS THE DATA
Here, I used the function process_unencoded_data in the preprocess file to split data into train, validate, and testing data.
Afterwards, I cleaned the data and prepared it for training, validating and testing by performing the following operations:
* Stripped all leading and trailing whitespaces from each categorical column.
* Dropped rows where tenure was zero.
* Transformed 'TotalCharges' from object data type to float data type.
* Dropped duplicates.


In [None]:
# Use the functions defined in preprocess.py to prepare the data for
# training, validating and testing.
train_data, validate_data, test_data = preprocess.process_clean_data(data=TelecomCustomerData)

In [None]:
# Preview the first rows of the cleaned training split.
train_data.head()


In [None]:
# (rows, columns) of the training split.
train_data.shape

In [None]:
# (rows, columns) of the test split.
test_data.shape

In [None]:
# (rows, columns) of the validation split.
validate_data.shape

### Getting the control model score

In [None]:
# Separate the training split into features (x) and the 'Churn' target (y).
x_train_data = train_data.drop('Churn', axis=1)
y_train_data = train_data.Churn

In [None]:
# Separate the validation split into features (x) and the 'Churn' target (y).
x_validate_data = validate_data.drop('Churn', axis=1)
y_validate_data = validate_data.Churn

In [None]:
# Separate the test split into features (x) and the 'Churn' target (y).
x_test_data = test_data.drop('Churn', axis=1)
y_test_data = test_data.Churn

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only (so no information from the
# other splits leaks into it), then apply that same transformation to every
# split. The validation features must be scaled too: they are later scored
# by models that were trained on the scaled training features.
sc = StandardScaler()
x_train_data = sc.fit_transform(x_train_data)
x_validate_data = sc.transform(x_validate_data)
x_test_data = sc.transform(x_test_data)

In [None]:
# Getting the control model score as the baseline using the get_control_score function in the model_helper.py file
# I used the DummyClassifier model in the sklearn library for this
score = model_helper.get_control_score(x_train_data, y_train_data)
# Use the builtin round(): it works whether the helper returns a plain Python
# float (which has no .round() method) or a NumPy scalar.
print("The accuracy for the control model is: ", round(score * 100, 2), '%')


## TRAINING AND COMPARING MODELS

* Here I train models using the Random Forest algorithm from the sklearn library.
* The main procedures and computations have been delegated to the functions in the model_helper.py module
* I will be using these functions to train and compare different models with the Random Forest classifier by using different random_state, max_depth and min_samples_leaf for each model
#### NB: More details in the docstrings of each function in the model_helper module

In [None]:
# Train and compare the candidate models using the project's model_helper module.
random_forest_classifiers = model_helper.compare_models(
    x_train=x_train_data,
    y_train=y_train_data,
    x_validate=x_validate_data,
    y_validate=y_validate_data,
)

In [None]:
# Keep only the rows whose 'Validate Acc Score' exceeds 80
# (presumably a percentage; confirm against model_helper.compare_models).
random_forest_classifiers[random_forest_classifiers['Validate Acc Score'] > 80]

## TESTING AND EVALUATING THE CLASSIFIER WITH THE BEST FEATURE SELECTION AND MAX_DEPTH

* Here I make predictions with the classifier model that appears to perform better after evaluating a number of models with the random forest classifier.
* None of the models from the results of the cell above (using different values for max_depth and min_samples_leaf in evaluating the models) were greatly overfit
* Now, after the evaluation, the model with min_samples_leaf = 5 and max_depth appears to have the best performance since it has the best recall score and a slightly higher accuracy score

In [None]:
# Select and test the chosen model via the project's test_predict helper.
from test_predict import test_classifier

model, dataframe = test_classifier(
    x_train_data, y_train_data,
    x_validate_data, y_validate_data,
    x_test_data, y_test_data,
)

dataframe

In [None]:
# Predicted labels of the selected model for the test features.
predictions = model.predict(x_test_data)

In [None]:
# Create a 10x5 inch figure with a single axes.
# NOTE(review): neither metrics_fig nor ax_f is referenced by the cells
# visible in this notebook; confirm whether this cell is still needed.
metrics_fig, ax_f = plt.subplots(figsize=(10, 5))

In [None]:
from sklearn.metrics import RocCurveDisplay

# RocCurveDisplay.from_estimator computes the ROC curve AND draws it on the
# given axes, so no separate .plot() call is needed; calling it again would
# draw the same curve (and its legend entry) a second time.
ax = plt.gca()
rfc_disp = RocCurveDisplay.from_estimator(model, x_test_data, y_test_data, ax=ax, alpha=0.8)
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the selected model's test predictions, with rows and
# columns ordered by the classes the model was fitted on.
# NOTE(review): the name `display` shadows IPython's display() helper for the
# rest of the notebook session.
cm_1 = confusion_matrix(y_test_data, predictions, labels=model.classes_)
display = ConfusionMatrixDisplay(
    confusion_matrix=cm_1,
    display_labels=model.classes_,
)
display.plot()
plt.show()

In [None]:
# Score of the selected model on the held-out test split
# (mean accuracy for scikit-learn classifiers).
model.score(x_test_data, y_test_data)

In [None]:
from test_predict import compute_predictions_dataframe

# Per the original author's note, the helper's parameters are
# (explore_data, classifier, X_test).
predict_dataframe = compute_predictions_dataframe(
    test_copy,
    model,
    x_test_data,
)

In [None]:
# Hand the predictions dataframe to the project's write_to_csv helper
# (the output location is decided inside test_predict; not visible here).
from test_predict import write_to_csv
write_to_csv(predict_dataframe)

## BENCHMARKING WITH KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours benchmark (5 neighbours) fitted on the training split.
bench = KNeighborsClassifier(n_neighbors=5)
bench.fit(x_train_data, y_train_data)

In [None]:
# Mean accuracy of the KNN benchmark on the test split.
bench.score(x_test_data, y_test_data)

In [None]:
# Predicted labels of the KNN benchmark for the test features.
pred = bench.predict(x_test_data)

In [None]:
# Create a 10x5 inch figure with a single axes.
# NOTE(review): the later disp.plot() call in this notebook is not given this
# `ax`, so it is not drawn on this figure; confirm whether this cell is needed.
fig, ax = plt.subplots(figsize=(10, 5))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the KNN benchmark's test predictions, with rows and
# columns ordered by the classes the benchmark was fitted on.
cm = confusion_matrix(y_test_data, pred, labels=bench.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=bench.classes_,
)


In [None]:
# Draw the KNN benchmark's confusion matrix and render it.
disp.plot()
plt.show()