#### Loading Packages

In [None]:
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns

# Utilities
import sys
import os

# Statsmodels
import statsmodels.api as sm
from statsmodels.tools import add_constant

# Sklearn modules
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Resolve the directory one level above the current working directory;
# this is treated as the project root.
project_root = os.path.abspath(os.pardir)

# Make the project root importable, taking care not to append a duplicate
# entry when this cell is executed more than once.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.data_loader import telco_data_loader
from src.data_processor import telcoDataCleaner

#### Loading and cleaning data

In [None]:
# running the loader
# (project helper imported from src.data_loader; the type/schema of what it
# returns is not visible from this notebook)
raw_telco_data = telco_data_loader()

# Project cleaner imported from src.data_processor
cleaner = telcoDataCleaner()

# NOTE(review): later cells index the result with the keys 'predictors',
# 'target' and 'masks', so clean_data presumably returns a dict-like object
# holding those three entries - confirm against telcoDataCleaner.
cleaned_data = cleaner.clean_data(raw_telco_data)

## Testing Models

Data prep

In [None]:
# Identifying the predictor columns that contain no NAs.
# A column is kept only when none of its values is missing.
non_na_cols = [
    col
    for col in cleaned_data['predictors']
    if not cleaned_data['predictors'][col].isna().any()
]

# Building the regression design matrix: a copy of ALL predictors (including
# those with NAs) with an intercept column placed first. has_constant='raise'
# makes add_constant fail loudly if a constant column is already present.
reg_predictors = add_constant(
    cleaned_data['predictors'].copy(),
    prepend=True,
    has_constant='raise',
)

##### Statistical Modeling

Fitting statistical (statsmodels) logistic regressions first, to understand the impact of each predictor before switching to ML models.

In [None]:
# Logit model on the rows selected by cleaned_data["masks"], using every column
# of the design matrix (intercept included).
# NOTE(review): the mask presumably flags customers with internet service -
# confirm against telcoDataCleaner.
online_log_reg_model = sm.Logit(
    cleaned_data['target'][cleaned_data["masks"]],
    reg_predictors.loc[cleaned_data["masks"]],
).fit()

In [None]:
# Text summary of the fitted Logit model (coefficients, standard errors, fit statistics)
print(online_log_reg_model.summary())

In [None]:
# Columns used for the model on the complementary (mask == False) rows.
# Despite its name, `cols_to_exclude` holds the columns that are KEPT: every
# NA-free predictor except "InternetService".
# NOTE(review): "InternetService" is presumably constant within this subset,
# which would make it collinear with the intercept - confirm.
# Filtering (rather than list.remove) avoids a ValueError if the column is
# not in non_na_cols.
cols_to_exclude = [col for col in non_na_cols if col != "InternetService"]

# non_na_cols was built from the predictors BEFORE add_constant ran, so it
# never contains the intercept column. Add it back explicitly; otherwise this
# model is silently fitted without an intercept, unlike the internet model.
if "const" in reg_predictors.columns and "const" not in cols_to_exclude:
    cols_to_exclude.insert(0, "const")

# Complement of the row mask, computed once and reused for target and predictors.
no_internet_mask = ~np.array(cleaned_data["masks"])

nol_log_reg_model = sm.Logit(endog = cleaned_data['target'][no_internet_mask],
                             exog = reg_predictors.loc[no_internet_mask, cols_to_exclude]).fit()

In [None]:
# Text summary of the Logit model fitted on the mask == False rows
print(nol_log_reg_model.summary())

## Switching to ML

### Internet Customers Model

In [None]:
# Restrict the design matrix and the target to the rows selected by the mask.
# NOTE(review): the mask presumably flags internet customers - confirm.
# internet_preds still carries the 'const' column added by add_constant.
internet_preds = reg_predictors.loc[cleaned_data["masks"]]
internet_target = cleaned_data['target'][cleaned_data["masks"]]

Writing a custom loss function to reflect the business trade-off between losing a customer to churn and paying for retention outreach.

In [None]:
# Custom loss function
def churn_loss(y_true, y_pred, fn_weight=3, fp_weight=1):
    '''Custom scoring function to represent the business cost of churned customers.

       Weights are dependent on the cost of customer acquisition.
       Assumption is a churned customer is worth ~6 months of average revenue,
       while interventions cost 2 months of average revenue. Therefore the
       default relative payoff weights are 3:1 (missed churner : wasted outreach).

       Parameters
       ----------
       y_true : array-like of true labels. Assumes a 0/1 (or boolean) encoding
           in which 1/True marks a churned customer.
       y_pred : array-like of predicted labels, same length and encoding as y_true.
       fn_weight : cost of one false negative (a churner that was not flagged).
       fp_weight : cost of one false positive (an intervention on a non-churner).

       Returns
       -------
       The negated total cost, so that a larger value means a better model.

       Raises
       ------
       ValueError if y_true and y_pred do not have the same number of elements.
    '''

    # Plain arrays avoid pandas index alignment between y_true and y_pred.
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )

    actual_churn = actual == 1
    predicted_churn = predicted == 1

    # Counting directly (instead of unpacking a confusion matrix) also works
    # when a fold contains a single class, where the matrix would be 1x1.
    fn = int(np.count_nonzero(actual_churn & ~predicted_churn))
    fp = int(np.count_nonzero(~actual_churn & predicted_churn))

    cost = (fn * fn_weight) + (fp * fp_weight)
    return -cost

# Wrapping the loss as a scikit-learn scorer. Scorers are maximized, which is
# why churn_loss returns the negated cost.
churn_loss_scorer = make_scorer(score_func=churn_loss, greater_is_better=True)
        

In [None]:
# Scaling Data: standardize every predictor to zero mean / unit variance so the
# regularization penalty treats all features on the same footing.
# The 'const' column has zero variance and is mapped to zeros; the estimator
# fits its own intercept by default.
scaler = StandardScaler()

internet_preds_for_model = scaler.fit_transform(internet_preds)

# Initiating Class of LogisticRegressionCV: 5-fold CV over the default grid of
# regularization strengths, selected with the custom churn-loss scorer.
log_reg_cv = LogisticRegressionCV(cv = 5,
                                  max_iter = 10000,
                                  random_state=42,
                                  scoring = churn_loss_scorer)

# Fitting the regression on the SCALED predictors. The original cell fitted on
# the unscaled `internet_preds`, leaving `internet_preds_for_model` unused.
log_reg_cv.fit(X = internet_preds_for_model,
               y = internet_target)

### Grid Searching

Logistic Regression

In [None]:
# Setting up the parameter grid: 10 inverse-regularization strengths (0.1 ... 1.0)
# crossed with both penalty types = 20 candidates.
log_reg_param_grid = {'C': np.linspace(start = 0.1, stop = 1, num = 10),
                      'penalty' : ['l1', 'l2']}



# Initializing the grid search around a LogisticRegression estimator.
# - liblinear supports both the l1 and l2 penalties in the grid.
# - liblinear is stochastic, so random_state is pinned (to the same seed as
#   log_reg_cv) to keep the search reproducible across runs.
# - Several metrics are recorded, but the model is selected and refit on the
#   custom churn loss.
log_reg_gs = GridSearchCV(LogisticRegression(max_iter = 10000,
                                             solver = 'liblinear',
                                             random_state = 42),
                          scoring= {"churn_loss": churn_loss_scorer,
                                    "accuracy": 'accuracy',
                                    "recall": 'recall',
                                    "roc_auc": 'roc_auc'},
                          param_grid= log_reg_param_grid,
                          cv = 5,
                          refit= "churn_loss")

In [None]:
# Fitting the grid search: cross-validates every C/penalty candidate, then
# refits the best one (by the "churn_loss" metric) on all of the rows.
# NOTE(review): this uses the unscaled `internet_preds`, not the standardized
# `internet_preds_for_model` built earlier. The l1/l2 penalties are sensitive
# to feature scale - confirm this is intended.
log_reg_gs.fit(internet_preds, internet_target)

In [None]:
# Keeping the full per-candidate cross-validation table for later inspection
gs_cv_results_df = pd.DataFrame(data=log_reg_gs.cv_results_)

# Reporting the winning candidate; the score is the mean CV value of the
# refit metric (the negated churn cost).
print(f"Best parameters: {log_reg_gs.best_params_}")
print(f"Best score: {log_reg_gs.best_score_}")

Testing Random Forest

In [None]:
# Creating a random forest parameter grid:
# 13 forest sizes (14, 16, ..., 38) x 9 tree depths (2, 4, ..., 18) = 117 candidates.
rf_param_grid = {'n_estimators': np.arange(14, 40, 2),
                 'max_depth': np.arange(2, 20, 2)}

# Setting up the grid search (5-fold CV) around the random forest.
# - random_state is pinned so the bootstrap samples / feature draws, and hence
#   the selected hyper-parameters, are reproducible across runs.
# - max_depth is not set on the base estimator: the grid supplies it for every
#   candidate, so a value given here would always be overridden.
# - Several metrics are recorded, but the model is selected and refit on the
#   custom churn loss.
rf_gs = GridSearchCV(RandomForestClassifier(criterion='entropy',
                                            random_state=42),
                     param_grid= rf_param_grid,
                     cv = 5,
                     scoring= {"churn_loss": churn_loss_scorer,
                               "accuracy": 'accuracy',
                               "recall": 'recall',
                               "roc_auc": 'roc_auc'},
                     refit= 'churn_loss')

In [None]:
# Fitting the Random Forest Grid: cross-validates every
# (n_estimators, max_depth) candidate, then refits the best one
# (by the "churn_loss" metric) on all of the rows.
rf_gs.fit(internet_preds, internet_target)

In [None]:
# Winning random-forest candidate and its mean CV score on the refit metric
# ("churn_loss", i.e. the negated business cost - closer to 0 is better).
print(f"Best parameters: {rf_gs.best_params_}")
print(f"Best score: {rf_gs.best_score_}")

In [None]:
# Per-candidate cross-validation results of the random-forest grid search, as a table
rf_cv_results = pd.DataFrame(rf_gs.cv_results_)

# Displaying the available columns (parameters, per-fold and mean scores for each metric)
rf_cv_results.columns

In [None]:
# 3D scatter of the mean cross-validated churn loss over the random-forest grid.
rf_fig = plt.figure()
rf_ax = rf_fig.add_subplot(projection='3d')

# NOTE(review): elev=0 / azim=0 looks straight along the Max Depth axis (a YZ
# view), so the depth dimension is collapsed in the rendered figure - confirm
# this camera angle is intended.
rf_ax.view_init(elev=0, azim=0)

# cv_results_ can store the param_* columns with object dtype; cast them to
# float so matplotlib receives plain numeric arrays.
rf_ax.scatter3D(rf_cv_results['param_max_depth'].astype(float),
                rf_cv_results['param_n_estimators'].astype(float),
                rf_cv_results['mean_test_churn_loss'])

# Label every axis on the Axes object itself so the figure stands alone.
rf_ax.set_xlabel('Max Depth')
rf_ax.set_ylabel('N estimators')
rf_ax.set_zlabel('Mean CV churn loss')
rf_ax.set_title('Random forest grid search: churn loss')
plt.show()