In [2]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np


In [3]:
# Step 2: Load the dataset from the data folder
df = pd.read_csv('../data/Telco_Customer_Churn_Dataset.csv')

# Show the first 5 rows
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Step 3: Basic dataset info
print("Shape of dataset:", df.shape)
print("\nColumn data types and non-null counts:\n")
df.info()


Shape of dataset: (7043, 21)

Column data types and non-null counts:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7

In [5]:
# Step 4: Convert 'TotalCharges' to numeric, force errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check how many missing values were created
print("Missing values in 'TotalCharges':", df['TotalCharges'].isna().sum())


Missing values in 'TotalCharges': 11


In [6]:
# Step 5: Drop rows with missing 'TotalCharges'
df = df.dropna(subset=['TotalCharges'])

# Confirm they're gone
print("Missing values after dropping:", df['TotalCharges'].isna().sum())
print("New dataset shape:", df.shape)


Missing values after dropping: 0
New dataset shape: (7032, 21)


In [7]:
# Step 6: Check categorical columns
cat_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical Columns:\n", cat_cols)


Categorical Columns:
 ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [8]:
# Step 7: Encode Categorical Variables

from sklearn.preprocessing import LabelEncoder

# List of binary categorical columns for Label Encoding
label_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'Churn']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Apply label encoding to binary categorical columns
for col in label_cols:
    df[col] = encoder.fit_transform(df[col])

print("Label Encoding done for binary columns.")

# One-hot encode nominal categorical columns
df = pd.get_dummies(df, columns=['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                                  'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 
                                  'PaymentMethod'], drop_first=True)

print("One-Hot Encoding done for nominal columns.")


Label Encoding done for binary columns.
One-Hot Encoding done for nominal columns.


In [9]:
# Step 8: Train-Test Split

from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('Churn', axis=1)  # All columns except 'Churn'
y = df['Churn']  # Target variable

# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train-Test Split done.")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


Train-Test Split done.
Training data shape: (5625, 30)
Test data shape: (1407, 30)


In [11]:
# Step: Check column names after encoding
print("Columns after encoding:\n", X_train.columns.tolist())


Columns after encoding:
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [12]:
# Step 9: Feature Selection

# Choose key features manually (with updated column names)
selected_features = ['tenure', 'MonthlyCharges', 'Contract_One year', 'Contract_Two year', 
                     'InternetService_Fiber optic', 'PaymentMethod_Electronic check']

# Update X_train and X_test to include only the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print("Feature Selection done.")
print(f"Training data shape after feature selection: {X_train_selected.shape}")
print(f"Test data shape after feature selection: {X_test_selected.shape}")


Feature Selection done.
Training data shape after feature selection: (5625, 6)
Test data shape after feature selection: (1407, 6)


In [13]:
# Step 10: Model Selection

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize the logistic regression model
model = LogisticRegression(random_state=42)

# Train the model on the selected training data
model.fit(X_train_selected, y_train)

# Predict on the test data
y_pred = model.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


Model Evaluation Metrics:
Accuracy: 0.7861
Precision: 0.6281
Recall: 0.4786
F1-Score: 0.5432
ROC-AUC: 0.6880


In [23]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)

# Plot confusion matrix and save it
cm_display.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Logistic Regression")
plt.savefig('C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/outputs/graphs/logistic_regression_confusion_matrix.png')
plt.close()

# Generate ROC curve and save it
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - Logistic Regression')
plt.legend(loc='lower right')
plt.savefig('C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/outputs/graphs/logistic_regression_roc_curve.png')
plt.close()


In [22]:
import joblib

# Define the path where you want to save the logistic regression model
logreg_model_path = 'C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/models/logistic_regression_model.pkl'

# Save the logistic regression model
joblib.dump(model, logreg_model_path)

print(f"Logistic Regression Model saved at: {logreg_model_path}")


Logistic Regression Model saved at: C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/models/logistic_regression_model.pkl


In [14]:
# Step 11: Model Selection - Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree model
dt_model.fit(X_train_selected, y_train)

# Predict on the test data
y_pred_dt = dt_model.predict(X_test_selected)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)

# Print the evaluation metrics for the Decision Tree model
print("Decision Tree Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall: {recall_dt:.4f}")
print(f"F1-Score: {f1_dt:.4f}")
print(f"ROC-AUC: {roc_auc_dt:.4f}")


Decision Tree Model Evaluation Metrics:
Accuracy: 0.7342
Precision: 0.5000
Recall: 0.5187
F1-Score: 0.5092
ROC-AUC: 0.6655


In [21]:
import os
import joblib

# Define the target directory
model_dir = r'C:\Users\heave\OneDrive\Desktop\customer-churn-prediction\models'

# Ensure the directory exists
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the trained Decision Tree model to a file
dt_model_path = os.path.join(model_dir, 'decision_tree_model.pkl')
joblib.dump(dt_model, dt_model_path)
print(f"Decision Tree Model saved at: {dt_model_path}")


Decision Tree Model saved at: C:\Users\heave\OneDrive\Desktop\customer-churn-prediction\models\decision_tree_model.pkl


In [15]:
# Step 12: Model Selection - Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest model
rf_model.fit(X_train_selected, y_train)

# Predict on the test data
y_pred_rf = rf_model.predict(X_test_selected)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

# Print the evaluation metrics for the Random Forest model
print("Random Forest Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")


Random Forest Model Evaluation Metrics:
Accuracy: 0.7562
Precision: 0.5492
Recall: 0.4626
F1-Score: 0.5022
ROC-AUC: 0.6626


In [24]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# Generate confusion matrix for Random Forest model
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_display_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=rf_model.classes_)

# Plot confusion matrix and save it
cm_display_rf.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Random Forest")
plt.savefig('C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/outputs/graphs/random_forest_confusion_matrix.png')
plt.close()

# Generate ROC curve for Random Forest model and save it
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_pred_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

plt.figure()
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_rf)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - Random Forest')
plt.legend(loc='lower right')
plt.savefig('C:/Users/heave/OneDrive/Desktop/customer-churn-prediction/outputs/graphs/random_forest_roc_curve.png')
plt.close()


In [19]:
import joblib

# Save the trained RandomForest model
model_path = '../models/random_forest_model.pkl'  # going one level up from notebooks/
joblib.dump(rf_model, model_path)

print(f"Model saved at: {model_path}")


Model saved at: ../models/random_forest_model.pkl


In [17]:
# Step 13: Hyperparameter Tuning - Logistic Regression

# Importing required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Step 13.1: Scale the data
# Scaling features to ensure better performance of Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Step 13.2: Define the hyperparameters for tuning
param_grid = {
    'solver': ['lbfgs', 'newton-cg', 'saga'],  # Different solvers for better convergence
    'max_iter': [500, 1000],  # Increase the number of iterations for convergence
    'C': [0.1, 1, 10]  # Regularization strength
}

# Step 13.3: Initialize Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Step 13.4: Perform Grid Search with Cross Validation
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train_scaled, y_train)

# Step 13.5: Display the best parameters from Grid Search
print("Best Parameters: ", grid_search_lr.best_params_)

# Step 13.6: Evaluate the model on Test Data
y_pred_lr = grid_search_lr.predict(X_test_scaled)

# Step 13.7: Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)
roc_auc = roc_auc_score(y_test, y_pred_lr)

# Step 13.8: Print the evaluation metrics for the Logistic Regression model
print(f"Logistic Regression Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


Best Parameters:  {'C': 1, 'max_iter': 500, 'solver': 'newton-cg'}
Logistic Regression Model Evaluation Metrics:
Accuracy: 0.7861
Precision: 0.6281
Recall: 0.4786
F1-Score: 0.5432
ROC-AUC: 0.6880


In [18]:
# Step 14: Hyperparameter Tuning - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model to the training data
grid_search.fit(X_train_selected, y_train)

# Get the best parameters and the best model
best_params_rf = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Output the best parameters
print("Best Parameters:", best_params_rf)

# Evaluate the tuned Random Forest model
y_pred_rf = best_rf_model.predict(X_test_selected)

# Metrics for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print("\nRandom Forest Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

Random Forest Model Evaluation Metrics:
Accuracy: 0.7811
Precision: 0.6086
Recall: 0.4947
F1-Score: 0.5457
ROC-AUC: 0.6897
