# Telco Customer Churn - Model Training

Prepare the Telco Customer Churn dataset, then research and train the best model for predicting customer churn.

# Setup Notebook

## Import

In [1]:
# Import Standard Libraries
import pandas as pd
import mlflow
import numpy as np

import os
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from colorama import Style, Fore

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Import Package Modules
from src.general_utils.general_utils import read_configuration
from src.data_preparation.data_preparation import CustomerChurnDataPreparation
from src.model_training.model_training import ModelTrainer

## Setup Plots Characteristics

In [2]:
# Define Seaborn theme parameters
theme_parameters = {
    'axes.spines.right': False,
    'axes.spines.top': False,
    'grid.alpha': 0.3,
    'figure.figsize': (16, 6),
    # 'Andale Mono' is not installed on every system: list the generic
    # 'monospace' family as a fallback so matplotlib can still resolve a
    # monospaced font when the preferred one is missing
    'font.family': ['Andale Mono', 'monospace'],
    'axes.titlesize': 24,
    'figure.facecolor': '#E5E8E8',
    'axes.facecolor': '#E5E8E8'
}

# Set the theme (the rc dictionary overrides the defaults of the chosen style)
sns.set_theme(style='whitegrid',
              palette=sns.color_palette('deep'),
              rc=theme_parameters)

In [3]:
# Bright console colour prefixes, plus the sequence that resets the styling
black, magenta, red, blue = (
    Style.BRIGHT + foreground
    for foreground in (Fore.BLACK, Fore.MAGENTA, Fore.RED, Fore.BLUE)
)
reset_colors = Style.RESET_ALL

## Define Configuration

In [4]:
# The root path is the parent of the current working directory
root_path = Path.cwd().parent

# Load the configuration file stored under the root path
config_file = root_path / 'configuration' / 'config.yaml'
config = read_configuration(config_file)

# Pull out the configuration sections used throughout the notebook
dataset_config, data_pipeline_config, model_training_config = (
    config[section]
    for section in ('dataset', 'data_pipeline_config', 'model_training_config')
)

[05/28/2024 11:55:39 - general_utils] INFO - read_configuration - Start
[05/28/2024 11:55:39 - general_utils] INFO - read_configuration - Reading /Users/s.porreca/Projects/customer_churn_predictor/configuration/config.yaml
[05/28/2024 11:55:39 - general_utils] INFO - read_configuration - Configuration file /Users/s.porreca/Projects/customer_churn_predictor/configuration/config.yaml read successfully
[05/28/2024 11:55:39 - general_utils] INFO - read_configuration - End


# Read Data

In [5]:
# Read data
# Join the configured dataset path onto the root path with pathlib (the same
# way the configuration file path is built) instead of concatenating strings;
# pandas accepts path objects directly
data = pd.read_csv(root_path / dataset_config['path'])

# Data Preparation

## Define Features and Label

In [6]:
# Define the features to include: numerical names first, then categorical ones
feature_groups = data_pipeline_config['features']
features = feature_groups['numerical'] + feature_groups['categorical']

# Define the label to include
label = data_pipeline_config['labels']

# Print a 1-based numbered list of the features; a plain loop is used because
# a list comprehension here would only build a throwaway list of None values
print('Features:')
for position, feature in enumerate(features, start=1):
    print(f'{position}. {feature}')
print()
print(f'Labels: {label}')

Features:
1. tenure
2. MonthlyCharges
3. TotalCharges
4. gender
5. SeniorCitizen
6. Partner
7. Dependents
8. PhoneService
9. MultipleLines
10. InternetService
11. OnlineSecurity
12. OnlineBackup
13. DeviceProtection
14. TechSupport
15. StreamingTV
16. StreamingMovies
17. Contract
18. PaperlessBilling
19. PaymentMethod

Labels: ['Churn']


## Define Data Preparation Pipeline 

In [7]:
# Create the data preparation object from the configured transformations and features
transformations_config = data_pipeline_config['data_transformations']
features_config = data_pipeline_config['features']
data_preparation = CustomerChurnDataPreparation(transformations_config, features_config)

[05/28/2024 11:55:39 - CustomerChurnDataPreparation] INFO - __init__ - Initialise object attributes


In [8]:
# Get the training data preparation pipeline from the data preparation object;
# this pipeline is later handed to the ModelTrainer as its `data_pipeline`
data_preparation_pipeline = data_preparation.build_training_data_preparation_pipeline()

[05/28/2024 11:55:39 - CustomerChurnDataPreparation] INFO - build_training_data_preparation_pipeline - Start
[05/28/2024 11:55:39 - CustomerChurnDataPreparation] INFO - build_training_data_preparation_pipeline - Build the Numerical Data Pipeline
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Start
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Building steps
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Skipping Feature Engineering step
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Adding SimpleImputer Imputation step
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Adding MinMaxScaler Standardisation step
[05/28/2024 11:55:39 - data_preparation_utils] INFO - build_numerical_data_pipeline_steps - Skipping Normalization step
[05/28/2024 11:55:39 - data_preparation_u

In [9]:
# Show the pipeline as the cell output to inspect its steps
data_preparation_pipeline

## Clean TotalCharges

`TotalCharges` contains single-space strings instead of numbers for new customers → replace those values with 0 and cast the column to float

In [10]:
# Map the single-space placeholder to the string '0' so the column can be cast later
placeholder_mapping = {' ': '0'}
data['TotalCharges'] = data['TotalCharges'].replace(placeholder_mapping)

In [11]:
# Cast the cleaned column to floating point
total_charges = data['TotalCharges']
data['TotalCharges'] = total_charges.astype(float)

## Label Encoder

In [12]:
# Flatten the label column(s) to a 1-D array and fit the encoder on it
label_encoder = LabelEncoder()
label_values = data[label].to_numpy().ravel()
encoded_labels = label_encoder.fit_transform(label_values)

## Train & Test Split

In [13]:
# Feature columns and encoded target used for the train/test split
X, y = data[features], encoded_labels

In [14]:
# Read the split parameters from the data pipeline configuration
split_config = data_pipeline_config['train_test_split']
test_size = split_config['test_size']
random_state = split_config['random_state']

In [15]:
# Hold out a test set from the data using the configured size and seed
split_arguments = {'test_size': test_size, 'random_state': random_state}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_arguments)

# Model Training

## Setup Training

In [16]:
# Look up the MLflow experiment name in the configuration
mlflow_experiment_name = model_training_config['mlflow']['experiment_name']

# Make it the active MLflow experiment
mlflow.set_experiment(experiment_name=mlflow_experiment_name)

<Experiment: artifact_location='file:///Users/s.porreca/Projects/customer_churn_predictor/notebooks/mlruns/887185653513794297', creation_time=1716888584313, experiment_id='887185653513794297', last_update_time=1716888584313, lifecycle_stage='active', name='Version 1.0.3', tags={}>

In [17]:
# Container for the trained pipelines
models = dict()

# Empty performance table whose columns are the configured metrics
metric_names = model_training_config['metrics']
performance = pd.DataFrame(columns=metric_names)

## Logistic Regression

In [18]:
# Logistic regression estimator created with its default arguments
model_lr = LogisticRegression()

# Wrap the estimator and the data preparation pipeline in a ModelTrainer
lr_model_name = model_training_config['logistic_regression']['model_name']
model_trainer_lr = ModelTrainer(model_name=lr_model_name,
                                model=model_lr,
                                data_pipeline=data_preparation_pipeline)

[05/28/2024 11:55:39 - ModelTrainer] INFO - __init__ - Initialise object attributes


In [19]:
# Configuration entries of the logistic regression model
lr_config = model_training_config['logistic_regression']

# Fit, evaluate and log the pipeline inside a single MLflow run
with mlflow.start_run(run_name=lr_config['mlflow_run_name']) as run:

    print(f'MLflow Run ID: {run.info.run_id}\n')

    # Fit the bundled pipeline on the training split
    model_trainer_lr.bundle_and_fit_pipeline(X_train, y_train)

    # Score the pipeline on the held-out split with the configured metrics
    evaluation = model_trainer_lr.evaluate_pipeline(X_test, y_test, model_training_config['metrics'])

    # Log the evaluation metrics taken from the 'Value' column
    metric_values = evaluation.to_dict()['Value']
    mlflow.log_metrics(metric_values)

    # Log the features, label and data transformations used for this run;
    # no hyperparameters are recorded for this model
    run_parameters = {
        'Features': features,
        'Label': label,
        'Data Transformations': data_pipeline_config['data_transformations'],
        'Model Initial Hyperparameters': None,
        'Model Optimised Hyperparameters': None,
    }
    mlflow.log_params(run_parameters)

    # Log the pipeline as a model artifact and register it
    model_info = mlflow.sklearn.log_model(
        sk_model=model_trainer_lr.pipeline,
        artifact_path='model_artifacts',
        registered_model_name=lr_config['mlflow_run_name'],
    )

MLflow Run ID: 16c504d8a91949d8bd4a06c4cb9b0eb4

[05/28/2024 11:55:39 - ModelTrainer] INFO - bundle_and_fit_pipeline - Start
[05/28/2024 11:55:39 - ModelTrainer] INFO - bundle_and_fit_pipeline - Bundle the pipeline
[05/28/2024 11:55:39 - ModelTrainer] INFO - bundle_and_fit_pipeline - Fit the pipeline
[05/28/2024 11:55:39 - ModelTrainer] INFO - bundle_and_fit_pipeline - End
[05/28/2024 11:55:39 - ModelTrainer] INFO - evaluate_pipeline - Start
[05/28/2024 11:55:39 - ModelTrainer] INFO - evaluate_pipeline - Compute predictions
[05/28/2024 11:55:39 - ModelTrainer] INFO - evaluate_pipeline - Evaluate pipeline
[05/28/2024 11:55:39 - model_training_utils] INFO - compute_classification_metrics - Start
[05/28/2024 11:55:39 - model_training_utils] INFO - compute_classification_metrics - Computed metrics
[05/28/2024 11:55:39 - model_training_utils] INFO -            Value
Accuracy    0.79
Precision   0.64
Recall      0.51
F1 Score    0.57
ROC AUC     0.83
[05/28/2024 11:55:39 - model_training_uti

Successfully registered model 'Logistic Regression'.
Created version '1' of model 'Logistic Regression'.


In [21]:
# Configuration entries of the logistic regression model
lr_config = model_training_config['logistic_regression']

# Store the evaluation values as this model's row of the performance dataframe
performance.loc[lr_config['mlflow_run_name']] = evaluation['Value'].values

# Store the pipeline in the models dictionary for Model Explainability
models[lr_config['model_name']] = model_trainer_lr.pipeline