In [122]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sb
import sklearn
import tensorflow

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,  roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import Callback, EarlyStopping

import os
import shutil
import warnings
warnings.filterwarnings('ignore')
import kagglehub

In [123]:
# Download latest version
downloaded_path = kagglehub.dataset_download("mnassrib/telecom-churn-datasets")
# print("Path to dataset files:", downloaded_path)

#get current working directory to save dataset
current_dir = os.getcwd()
# print(current_dir)

#get train and test data paths
train_file_path = os.path.join(downloaded_path, 'churn-bigml-80.csv')
test_file_path = os.path.join(downloaded_path, 'churn-bigml-20.csv')

destination_train_path = os.path.join(current_dir, "churn_train.csv")
destination_test_path = os.path.join(current_dir, "churn_test.csv")

#save train, test data files
train_data = pd.read_csv(shutil.copy(train_file_path, destination_train_path))
test_data = pd.read_csv(shutil.copy(test_file_path, destination_test_path))

In [124]:
#select the relevant columns

train_data = train_data[['Account length', 'International plan', 'Voice mail plan', 'Number vmail messages', 
                         'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes',
                         'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 
                         'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 
                         'Customer service calls', 'Churn']]

#convert categorical into dummies
train_data = pd.get_dummies(train_data, columns=['International plan', 'Voice mail plan'], 
                            drop_first=True, dtype='int64')

#convert target boolean into integer encoding
train_data['Churn'] = train_data['Churn'].map({False:0, True:1})

test_data = test_data[['Account length', 'International plan', 'Voice mail plan', 'Number vmail messages', 
                         'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes',
                         'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 
                         'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 
                         'Customer service calls', 'Churn']]

test_data = pd.get_dummies(test_data, columns=['International plan', 'Voice mail plan'], 
                            drop_first=True, dtype='int64')

test_data['Churn'] = test_data['Churn'].map({False:0, True:1})


#split the independent and target variable

X_train = train_data.drop(columns=['Churn'])
y_train = train_data['Churn']

X_test = test_data.drop(columns=['Churn'])
y_test = test_data['Churn']

In [141]:
#Logistic Regression

print("Logistic Regression model")
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

logistic_predictions = logistic_model.predict(X_test)
print("Predictions have been made.")

logistic_accuracy = accuracy_score(y_test, logistic_predictions)*100
logistic_confusion_matrix = confusion_matrix(y_test, logistic_predictions)

print(f"Accuracy:{logistic_accuracy:.2f}%")
print("Confusion matrix:\n", logistic_confusion_matrix)


Logistic Regression model
Predictions have been made.
Accuracy:84.71%
Confusion matrix:
 [[558  14]
 [ 88   7]]


In [142]:
#Random Forest model

print("Random Forest model")
random_forest_model = RandomForestClassifier(n_estimators=300, random_state=32)
random_forest_model.fit(X_train, y_train)

random_forest_predictions = random_forest_model.predict(X_test)
print("Predictions have been made.")

random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)*100
random_forest_confusion_matrix = confusion_matrix(y_test, random_forest_predictions)

print(f"Accuracy:{random_forest_accuracy:.2f}%")
print("Confusion matrix:\n", random_forest_confusion_matrix)

Random Forest model
Predictions have been made.
Accuracy:95.20%
Confusion matrix:
 [[569   3]
 [ 29  66]]


In [143]:
#Gradient Boosting model

print("Gradient Boosting model")
gradient_boosting_model = GradientBoostingClassifier(n_estimators=250, random_state=32)
gradient_boosting_model.fit(X_train, y_train)

gradient_boosting_predictions = gradient_boosting_model.predict(X_test)
print("Predictions have been made.")

gradient_boosting_accuracy = accuracy_score(y_test, gradient_boosting_predictions)*100
gradient_boosting_confusion_matrix = confusion_matrix(y_test, gradient_boosting_predictions)

print(f"Accuracy:{gradient_boosting_accuracy:.2f}%")
print("Confusion matrix:\n", gradient_boosting_confusion_matrix)

Gradient Boosting model
Predictions have been made.
Accuracy:95.50%
Confusion matrix:
 [[565   7]
 [ 23  72]]


In [144]:
#Neural network model
num_classes = len(y_train.unique())
n_features = X_train.shape[1]

print("Building a sequential neural network model..")
#build a sequential model
model = Sequential()

#build the first layer 
model.add(Dense(units=60, activation='relu', input_shape = (n_features,)))

#build the final output layer
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

#fit the model
callback = EarlyStopping(monitor='val_accuracy', patience=20, mode='max')
model.fit(X_train, y_train, batch_size=30, epochs=60, callbacks=[callback])

#make predictions
neural_network_predictions = model.predict(X_test)
neural_network_predictions = (neural_network_predictions>0.5).astype(int)

print("Predictions have been made.")

#evaluate error metrics
neural_network_accuracy = accuracy_score(y_test, neural_network_predictions)*100
neural_network_confusion_matrix = confusion_matrix(y_test, neural_network_predictions)

print(f"Accuracy:{neural_network_accuracy:.2f}%")
print("Confusion matrix:\n", neural_network_confusion_matrix)

Building a sequential neural network model..
Epoch 1/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7678 - loss: 4.2144 
Epoch 2/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7873 - loss: 0.9683
Epoch 3/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8143 - loss: 0.6046
Epoch 4/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8421 - loss: 0.4789
Epoch 5/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8458 - loss: 0.4304
Epoch 6/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8391 - loss: 0.4395
Epoch 7/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8515 - loss: 0.3970
Epoch 8/60
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8436 - loss: 0.4338
Epoch 9/60

In [145]:
#MonteCarlo Simulation to assess the risk/uncertainity for the future using RandomForest model (best model by far) 

n_simulations = 100
accuracy_list = []

print("Monte Carlo Simulation")
print(f"Total number of simulations executed:{n_simulations}")
print()
for i in range(n_simulations):
    trainX, valX, trainY, valY = train_test_split(X_train, y_train, test_size=0.30)
    gradient_boosting_model = GradientBoostingClassifier(n_estimators=250)
    gradient_boosting_model.fit(trainX, trainY)

    gradient_boosting_predictions = gradient_boosting_model.predict(valX)

    accuracy = round(accuracy_score(valY, gradient_boosting_predictions)*100,2)
    print(f"Simulation:{i+1}, Accuracy:{accuracy}")
    
    accuracy_list.append(accuracy)

# print(f"Accuracy list for {n_simulations} simulations:\n{accuracy_list}")

average_accuracy = sum(accuracy_list)/len(accuracy_list)
standard_deviation_accuracy = np.std(average_accuracy)

print(f"Average accuracy of {n_simulations} simulations:{average_accuracy}")
print(f"Standard deviation of {n_simulations} simulations:{standard_deviation_accuracy}")

Monte Carlo Simulation
Total number of simulations executed:100

Simulation:1, Accuracy:94.38
Simulation:2, Accuracy:94.38
Simulation:3, Accuracy:95.12
Simulation:4, Accuracy:94.5
Simulation:5, Accuracy:95.5
Simulation:6, Accuracy:96.38
Simulation:7, Accuracy:94.25
Simulation:8, Accuracy:95.12
Simulation:9, Accuracy:95.5
Simulation:10, Accuracy:95.62
Simulation:11, Accuracy:95.0
Simulation:12, Accuracy:95.5
Simulation:13, Accuracy:93.62
Simulation:14, Accuracy:96.25
Simulation:15, Accuracy:94.0
Simulation:16, Accuracy:94.75
Simulation:17, Accuracy:93.88
Simulation:18, Accuracy:95.12
Simulation:19, Accuracy:94.62
Simulation:20, Accuracy:95.12
Simulation:21, Accuracy:93.75
Simulation:22, Accuracy:94.62
Simulation:23, Accuracy:95.5
Simulation:24, Accuracy:94.5
Simulation:25, Accuracy:93.5
Simulation:26, Accuracy:94.88
Simulation:27, Accuracy:95.5
Simulation:28, Accuracy:95.12
Simulation:29, Accuracy:95.0
Simulation:30, Accuracy:95.88
Simulation:31, Accuracy:94.62
Simulation:32, Accuracy:9