In [2]:
import pandas as pd

# modelling
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# model registry
import mlflow
import mlflow.sklearn

# Import Prepared Data

In [3]:
# Load the prepared modelling pickles in one pass; the tuple order below
# matches the unpacking order on the left-hand side.
# NOTE(review): unpickling can execute arbitrary code - only load trusted artifacts.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './../data/modelling/X_train.pkl',
        './../data/modelling/X_test.pkl',
        './../data/modelling/y_train.pkl',
        './../data/modelling/y_test.pkl',
    )
)

# Modelling
**Note:**
- Model comparison and hyperparameter tuning are skipped because the focus here is on deploying a basic model.
- Improved modelling can be added at a later stage and incorporated into the ML model lifecycle.

In [5]:
# Show the column labels of the training features (rendered as the cell output).
X_train.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Male',
       'SeniorCitizen_1', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',

In [13]:
# Baseline classifier; the fixed random_state keeps the forest reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Cross-validate on the training data only, collecting accuracy and F1 for
# every fold. Results land in scores['test_accuracy'] / scores['test_f1'].
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=['accuracy', 'f1'],
    cv=cv,
    n_jobs=-1,  # run the folds in parallel on all available cores
    verbose=1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 22 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished


In [14]:
# Per-fold test scores produced by cross_validate.
f1_per_fold = scores['test_f1']
accuracy_per_fold = scores['test_accuracy']

# Summarise each metric as (mean, standard deviation) across the folds.
mean_f1, std_f1 = f1_per_fold.mean(), f1_per_fold.std()
mean_accuracy, std_accuracy = accuracy_per_fold.mean(), accuracy_per_fold.std()

print(f'Mean F1 Score: {mean_f1:.4f} ± {std_f1:.4f}')
print(f'Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}')

Mean F1 Score: 0.5330 ± 0.0136
Mean Accuracy: 0.7819 ± 0.0043


# Model Registration

In [None]:
# Train the final model and register it in the MLflow Model Registry,
# recording everything needed to trace this version back to its configuration
# and cross-validation results.
with mlflow.start_run():
    # Log the estimator's hyperparameters so each registered version can be
    # reproduced and compared with later ones.
    mlflow.log_params(model.get_params())

    # Log all cross-validation summaries computed earlier. Accuracy was
    # previously computed but never recorded alongside F1.
    mlflow.log_metrics({
        'cv_test_f1_mean': mean_f1,
        'cv_test_f1_std': std_f1,
        'cv_test_accuracy_mean': mean_accuracy,
        'cv_test_accuracy_std': std_accuracy,
    })

    # Final model training on the full training set.
    model.fit(X_train, y_train)

    # Infer the input/output schema from the training features and the
    # fitted model's predictions, so the logged model carries a signature.
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))

    # Log the fitted model and register it as a new version under the
    # registry name.
    mlflow.sklearn.log_model(
        sk_model=model,
        name='telco_churn_classifier',
        registered_model_name='telco_churn_classifier',
        signature=signature,
    )

Registered model 'telco_churn_classifier' already exists. Creating a new version of this model...
Created version '2' of model 'telco_churn_classifier'.
