In [50]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the project files are reachable
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# Make the project folder on the mounted Drive the current working directory
%cd "/content/drive/MyDrive/Resume Project/01 Production ML Pipeline with Drift Monitoring, Auto-Retraining & Scalable Deployment"

/content/drive/MyDrive/Resume Project/01 Production ML Pipeline with Drift Monitoring, Auto-Retraining & Scalable Deployment


In [53]:
#Importing libraries
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

In [54]:
#Importing models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoost

In [55]:
#importing dependencies
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

In [56]:
# Read the processed Telco dataset from the project folder on Drive
processed_csv_path = "/content/drive/MyDrive/Resume Project/01 Production ML Pipeline with Drift Monitoring, Auto-Retraining & Scalable Deployment/dataset/processed/telcodata_02_processed.csv"
dataset = pd.read_csv(processed_csv_path)

In [57]:
# Work on an independent copy so the loaded frame is never overwritten by accident
telco_dataset = dataset.copy(deep=True)

### Step1) Splitting data as training and testing data

**Split the data before applying any encoding or scaling, so that nothing learned from the test set leaks into training (data leakage).**

In [58]:
# Separate the target column from the input features
target_name = "Churn"
X = telco_dataset.drop(columns=[target_name])   # input features
y = telco_dataset[target_name]                  # target feature

In [59]:
# Stratified, shuffled train-test split (20% held out) with a fixed seed
split_options = dict(test_size=0.2, shuffle=True, random_state=7, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [60]:
# Report the shape of every split (label spelling fixed: "feature", not "feautre")
split_shapes = [
    ("training input feature", X_train),
    ("training target feature", y_train),
    ("testing input feature", X_test),
    ("testing target feature", y_test),
]
for split_label, split_data in split_shapes:
    print(f"Shape of {split_label} data : {split_data.shape}")

Shape of training input feautre data : (5625, 18)
Shape of training target feautre data : (5625,)
Shape of testing input feautre data : (1407, 18)
Shape of testing target feautre data : (1407,)


### Step2) Label Encoding for target feature

In [61]:
# Learn the Churn label mapping from the training target only, then encode both splits
le_churn = LabelEncoder().fit(y_train)
y_train_encoded = le_churn.transform(y_train)
y_test_encoded = le_churn.transform(y_test)

In [62]:
# Show the class order learned by the encoder and the encoded class counts in the training target
print(le_churn.classes_)
encoded_class_counts = pd.Series(y_train_encoded).value_counts()
print(encoded_class_counts)

['No' 'Yes']
0    4130
1    1495
Name: count, dtype: int64


* No --> 0
* Yes --> 1

### Step3) Creating Preprocessing Pipeline

**OneHot Encoding for train + test dataset categorical columns and standard scaling for numerical columns (For input features only)**

In [63]:
# Separating categorical and numerical columns.
# Both groups are derived from the training split only, so the preprocessing
# configuration never depends on the held-out test data.
categorical_column = X_train.select_dtypes(include = "object").columns
numerical_column = X_train.select_dtypes(exclude= "object").columns

In [64]:
# Preprocessor: standard-scale the numerical columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time)
numeric_step = ("scaler", StandardScaler(), numerical_column)
categorical_step = ("oh_encoder", OneHotEncoder(handle_unknown="ignore"), categorical_column)
transformer_instance = ColumnTransformer(transformers=[numeric_step, categorical_step])

### Step4) Creating a function for metrics evaluation and Roc-auc curve plotting

In [65]:
def eval_metrics(true, pred, prob,  dataset_type):
  """Print the classification metrics for one dataset split and return them.

  Args:
    true: Ground-truth labels.
    pred: Predicted labels.
    prob: Scores passed to ``roc_auc_score`` together with ``true``; pass
      ``None`` to skip the ROC-AUC computation.
    dataset_type: Name of the split, used only in the printed messages.

  Returns:
    dict with the keys ``accuracy``, ``recall``, ``f1``, ``precision``,
    ``roc_auc`` (``None`` when ``prob`` is ``None``) and ``report``.
  """
  acc = accuracy_score(true, pred)
  recall = recall_score(true, pred)
  f1 = f1_score(true, pred)
  prec = precision_score(true, pred)
  # ROC-AUC is only computed when scores are supplied. It used to be computed
  # unconditionally, so prob=None failed before the guard below was reached.
  roc_auc = roc_auc_score(true, prob) if prob is not None else None
  report= classification_report(true, pred)

  print(f"Accuracy score for {dataset_type} is : {acc}")
  print(f"Recall score for {dataset_type} is : {recall}")
  print(f"f1 score for {dataset_type} is : {f1}")
  print(f"Precision score for {dataset_type} is : {prec}")
  if roc_auc is not None:
    print(f"Roc-auc score for {dataset_type} is : {roc_auc}")
  print(f"Classification report for {dataset_type} is : \n {report}")
  print("\n")
  print("==" * 50)
  print("\n")

  # Callers assign the result, so hand the numbers back instead of None.
  return {
      "accuracy": acc,
      "recall": recall,
      "f1": f1,
      "precision": prec,
      "roc_auc": roc_auc,
      "report": report,
  }




In [66]:
def roc_auc_curve(y_true, y_prob, dataset_type):
    """Plot the ROC curve for one dataset split and show it.

    Args:
        y_true: Ground-truth labels, passed to ``roc_curve``.
        y_prob: Scores passed to ``roc_curve`` alongside ``y_true``.
        dataset_type: Name of the split, shown in the plot title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=f"ROC Curve (AUC = {area_under_curve:.3f})")
    # Diagonal reference line labelled as the random classifier
    ax.plot([0, 1], [0, 1], linestyle="--", label="Random Classifier")

    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC Curve – {dataset_type}")
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()

### Step 5 **Main Step** Pipeline creation for different models

<h3>1. Logistic Regression</h3>

In [67]:
# Creating a pipeline for the logistic regression model.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing transformer_instance
# between pipelines would let a later fit overwrite this one's preprocessing.
log_reg_pipeline = Pipeline(
    steps = [
        ("preprocessor", clone(transformer_instance)),
        ("log_reg_model",LogisticRegression(class_weight="balanced"))
    ]
)
log_reg_pipeline.fit(X_train, y_train_encoded)

In [68]:
# Logistic regression: class predictions and probability of the encoded class 1 for both splits
y_train_pred, y_test_pred = (
    log_reg_pipeline.predict(X_train),
    log_reg_pipeline.predict(X_test),
)
y_train_prob, y_test_prob = (
    log_reg_pipeline.predict_proba(X_train)[:, 1],
    log_reg_pipeline.predict_proba(X_test)[:, 1],
)


In [70]:
# Logistic regression: print train/test metrics, then keep the test-set recall and ROC-AUC
training_log_reg = eval_metrics(
    true=y_train_encoded, pred=y_train_pred, prob=y_train_prob, dataset_type="Training Data"
)
testing_log_reg = eval_metrics(
    true=y_test_encoded, pred=y_test_pred, prob=y_test_prob, dataset_type="Testing data"
)
recall_log = recall_score(y_test_encoded, y_test_pred)
roc_auc_log = roc_auc_score(y_test_encoded, y_test_prob)

Accuracy score for Training Data is : 0.7450666666666667
Recall score for Training Data is : 0.808695652173913
f1 score for Training Data is : 0.6277258566978193
Precision score for Training Data is : 0.5129401781926177
Roc-auc score for Training Data is : 0.8457505648367845
Classification report for Training Data is : 
               precision    recall  f1-score   support

           0       0.91      0.72      0.81      4130
           1       0.51      0.81      0.63      1495

    accuracy                           0.75      5625
   macro avg       0.71      0.77      0.72      5625
weighted avg       0.81      0.75      0.76      5625





Accuracy score for Testing data is : 0.7583511016346838
Recall score for Testing data is : 0.820855614973262
f1 score for Testing data is : 0.6436058700209644
Precision score for Testing data is : 0.5293103448275862
Roc-auc score for Testing data is : 0.8550416470381164
Classification report for Testing data is : 
               precision    re

<h3>2. Naive Bayes</h3>

In [71]:
# Creating a pipeline for the Naive Bayes model.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing transformer_instance
# between pipelines would let a later fit overwrite this one's preprocessing.
naive_bayes_pipeline = Pipeline(
    steps = [
        ("preprocessor", clone(transformer_instance)),
        ("naive_bayes_model",BernoulliNB())
    ]
)
naive_bayes_pipeline.fit(X_train, y_train_encoded)

In [72]:
# Naive Bayes: class predictions and probability of the encoded class 1 for both splits
y_train_pred, y_test_pred = (
    naive_bayes_pipeline.predict(X_train),
    naive_bayes_pipeline.predict(X_test),
)
y_train_prob, y_test_prob = (
    naive_bayes_pipeline.predict_proba(X_train)[:, 1],
    naive_bayes_pipeline.predict_proba(X_test)[:, 1],
)

In [74]:
# Naive Bayes: print train/test metrics, then keep the test-set recall and ROC-AUC
training_naive_bayes = eval_metrics(
    true=y_train_encoded, pred=y_train_pred, prob=y_train_prob, dataset_type="Training Data"
)
testing_naive_bayes = eval_metrics(
    true=y_test_encoded, pred=y_test_pred, prob=y_test_prob, dataset_type="Testing data"
)
recall_naive = recall_score(y_test_encoded, y_test_pred)
roc_auc_nb = roc_auc_score(y_test_encoded, y_test_prob)

Accuracy score for Training Data is : 0.7219555555555556
Recall score for Training Data is : 0.7986622073578595
f1 score for Training Data is : 0.604251012145749
Precision score for Training Data is : 0.48595848595848595
Roc-auc score for Training Data is : 0.815455149125009
Classification report for Training Data is : 
               precision    recall  f1-score   support

           0       0.90      0.69      0.79      4130
           1       0.49      0.80      0.60      1495

    accuracy                           0.72      5625
   macro avg       0.70      0.75      0.69      5625
weighted avg       0.79      0.72      0.74      5625





Accuracy score for Testing data is : 0.7370291400142146
Recall score for Testing data is : 0.8181818181818182
f1 score for Testing data is : 0.6232179226069247
Precision score for Testing data is : 0.5032894736842105
Roc-auc score for Testing data is : 0.8289339497129486
Classification report for Testing data is : 
               precision    r

<h3>3. Random Forest Classifier</h3>

In [75]:
# Creating a pipeline for the Random Forest model.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing transformer_instance
# between pipelines would let a later fit overwrite this one's preprocessing.
random_forest_pipeline = Pipeline(
    steps = [
        ("preprocessor", clone(transformer_instance)),
        ("random_forest_model",RandomForestClassifier(n_estimators=200, random_state= 42, class_weight="balanced"))
    ]
)
random_forest_pipeline.fit(X_train, y_train_encoded)

In [76]:
# Random Forest: class predictions and probability of the encoded class 1 for both splits
y_train_pred, y_test_pred = (
    random_forest_pipeline.predict(X_train),
    random_forest_pipeline.predict(X_test),
)
y_train_prob, y_test_prob = (
    random_forest_pipeline.predict_proba(X_train)[:, 1],
    random_forest_pipeline.predict_proba(X_test)[:, 1],
)

In [77]:
# Random Forest: print train/test metrics, then keep the test-set recall and ROC-AUC
training_random_forest = eval_metrics(
    true=y_train_encoded, pred=y_train_pred, prob=y_train_prob, dataset_type="Training Data"
)
testing_random_forest = eval_metrics(
    true=y_test_encoded, pred=y_test_pred, prob=y_test_prob, dataset_type="Testing data"
)
recall_rf = recall_score(y_test_encoded, y_test_pred)
roc_auc_rf = roc_auc_score(y_test_encoded, y_test_prob)

Accuracy score for Training Data is : 0.9976888888888888
Recall score for Training Data is : 0.9973244147157191
f1 score for Training Data is : 0.9956594323873121
Precision score for Training Data is : 0.994
Roc-auc score for Training Data is : 0.9997751990088025
Classification report for Training Data is : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4130
           1       0.99      1.00      1.00      1495

    accuracy                           1.00      5625
   macro avg       1.00      1.00      1.00      5625
weighted avg       1.00      1.00      1.00      5625





Accuracy score for Testing data is : 0.7910447761194029
Recall score for Testing data is : 0.48128342245989303
f1 score for Testing data is : 0.5504587155963303
Precision score for Testing data is : 0.6428571428571429
Roc-auc score for Testing data is : 0.8380010457056183
Classification report for Testing data is : 
               precision    recall  f1-s

<h3>4. XGBoost Classifier</h3>

In [78]:
# Creating a pipeline for the XGBoost model.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing transformer_instance
# between pipelines would let a later fit overwrite this one's preprocessing.
xgboost_pipeline = Pipeline(
    steps = [
        ("preprocessor", clone(transformer_instance)),
        ("xgboost_model",XGBClassifier())
    ]
)
xgboost_pipeline.fit(X_train, y_train_encoded)

In [79]:
# XGBoost: class predictions and probability of the encoded class 1 for both splits
y_train_pred, y_test_pred = (
    xgboost_pipeline.predict(X_train),
    xgboost_pipeline.predict(X_test),
)
y_train_prob, y_test_prob = (
    xgboost_pipeline.predict_proba(X_train)[:, 1],
    xgboost_pipeline.predict_proba(X_test)[:, 1],
)

In [80]:
# Evaluating the XGBoost pipeline.
# The results get XGBoost-specific names; they were previously assigned to
# training_random_forest / testing_random_forest, which overwrote the
# Random Forest evaluation variables.
training_xgboost = eval_metrics(y_train_encoded, y_train_pred, y_train_prob, dataset_type="Training Data")
testing_xgboost = eval_metrics(y_test_encoded, y_test_pred, y_test_prob, dataset_type = "Testing data")
recall_xg = recall_score(y_test_encoded, y_test_pred)
roc_auc_xg= roc_auc_score(y_test_encoded, y_test_prob)

Accuracy score for Training Data is : 0.9290666666666667
Recall score for Training Data is : 0.82876254180602
f1 score for Training Data is : 0.8613138686131386
Precision score for Training Data is : 0.8965267727930536
Roc-auc score for Training Data is : 0.9804147805032108
Classification report for Training Data is : 
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      4130
           1       0.90      0.83      0.86      1495

    accuracy                           0.93      5625
   macro avg       0.92      0.90      0.91      5625
weighted avg       0.93      0.93      0.93      5625





Accuracy score for Testing data is : 0.7853589196872779
Recall score for Testing data is : 0.5401069518716578
f1 score for Testing data is : 0.5722379603399433
Precision score for Testing data is : 0.608433734939759
Roc-auc score for Testing data is : 0.8383194164755577
Classification report for Testing data is : 
               precision    rec

### Step6) Final Result

In [83]:
# Comparison table: test-set recall and ROC-AUC for every model, one row per model
model_scores = [
    ("Logistic Regression", recall_log, roc_auc_log),
    ("Naive Bayes", recall_naive, roc_auc_nb),
    ("Random Forest Classifier", recall_rf, roc_auc_rf),
    ("Xgboost", recall_xg, roc_auc_xg),
]
result = pd.DataFrame(model_scores, columns=["Model", "Recall Score", "ROC-AUC Score"])
result

Unnamed: 0,Model,Recall Score,ROC-AUC Score
0,Logistic Regression,0.820856,0.855042
1,Naive Bayes,0.818182,0.828934
2,Random Forest Classifier,0.481283,0.838001
3,Xgboost,0.540107,0.838319


<h3>Final Findings :</h3>

* Because the classes are imbalanced (about 73% "No" vs 27% "Yes" in the training set), we compare models on Recall and ROC-AUC instead of accuracy.
* Logistic Regression performs best of all the models tried, with the highest test-set Recall and ROC-AUC.
* Tree-based models showed signs of overfitting without hyperparameter tuning.