###Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ipywidgets as widgets

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from IPython.display import display
from termcolor import colored

###Reading Data


In [None]:
#load the customer churn dataset; the path is relative to the notebook's working directory
df = pd.read_csv('customer_churn.csv')

###Dealing with missing data (in this case there's no missing data)

In [None]:
#count the null entries per column so gaps can be spotted before imputing
missing_data = df.isnull().sum() #finding the sum of missing data for each column
print("Missing values in each column:\n")
#bare expression: the notebook renders the per-column counts as the cell output
missing_data

In [None]:
#columns grouped by the way their gaps are filled
numerical_features = [
    'Age', 'Tenure', 'Usage Frequency', 'Support Calls',
    'Payment Delay', 'Total Spend', 'Last Interaction',
]
categorical_features = ['Gender', 'Subscription Type', 'Contract Length']

#numerical gaps take the column mean, categorical gaps take the most frequent value
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

#fit each imputer on its own column group and write the filled values back into df
for imputer, columns in ((numerical_imputer, numerical_features),
                         (categorical_imputer, categorical_features)):
    df[columns] = imputer.fit_transform(df[columns])
df.head(5)

###Understanding the Data with tables and visuals

In [None]:
#print the column names, dtypes and non-null counts
df.info()

In [None]:
#summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

In [None]:
#count customers per (gender, churn) pair and pivot churn into columns
counts_by_gender = df.groupby(['Gender', 'Churn']).size()
gender_churn = counts_by_gender.unstack().fillna(0)
gender_churn.columns = ['Not Churned', 'Churned']

#one slice per gender/outcome pair, in the same order as the labels below
sizes = [
    gender_churn.loc[gender_name, outcome]
    for gender_name in ('Male', 'Female')
    for outcome in ('Churned', 'Not Churned')
]
labels = ["M: Churn", "M: No Churn", "F: Churn", "F: No Churn"]
colors = ['#ff6666', '#6666ff', '#ff9999', '#ffccff']

pie_data = pd.DataFrame({'Category': labels, 'Count': sizes})

#interactive donut chart (hole=0.3) of the four groups
fig = px.pie(
    pie_data,
    names='Category',
    values='Count',
    title='Churn by Gender',
    hole=0.3,
    color_discrete_sequence=colors,
)

#show percentage and label on the slices, and add the raw value on hover
fig.update_traces(textinfo='percent+label', hoverinfo='label+percent+value')

fig.show()

#helping source: https://plotly.com/python/pie-charts/ to create an interactive pie chart

In [None]:
#scatter of payment delay against total spend, coloured by churn outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Payment Delay',
    y='Total Spend',
    hue='Churn',
    palette={0: 'green', 1: 'red'},
    alpha=0.6,
    ax=scatter_ax,
)

scatter_ax.set_title('Payment Delay vs Total Spend')
scatter_ax.set_xlabel('Payment Delay')
scatter_ax.set_ylabel('Total Spend')
scatter_ax.legend(title='Churn', loc='lower left', labels=['No Churn', 'Churn'])
plt.show()

In [None]:
#list the column names as they stand before one-hot encoding
print(df.columns)

###One Hot Encoding  

In [None]:
#changing categorical columns to numerical
#note: despite its name, df_encoded holds the encoder object, not a DataFrame
df_encoded = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
#ask the encoder to return a pandas DataFrame with named columns
df_encoded.set_output(transform='pandas')
df_transformed = df_encoded.fit_transform(df[['Gender', 'Subscription Type', 'Contract Length']])
#shows the table of the one-hot encoded columns as the cell output
df_transformed
#helping source: Udemy course (Machine Learning A-Z: AI, Python & R + ChatGPT Prize [2024])


In [None]:
#remove the raw categorical columns, then append their one-hot encoded replacements
categorical_columns = ['Gender', 'Subscription Type', 'Contract Length']
df = pd.concat([df.drop(columns=categorical_columns), df_transformed], axis=1)
df.head(5)#check if encoded columns concatenated correctly

###Different Data Visualization

In [None]:
#finds and prints min and max age
min_age = df['Age'].min()
max_age = df['Age'].max()
print("Range of Ages: {} - {}".format(min_age, max_age))

sns.set(style="whitegrid")

#violin plot of the age distribution for each churn outcome
plt.figure(figsize=(10, 6))
sns.violinplot(x='Churn', y='Age', data=df)
plt.title('Distribution of Age by Churn')
plt.xlabel('Churn')
plt.ylabel('Age')
plt.show()

#bar graph: inclusive age buckets
age_ranges = [(18, 29), (30, 41), (42, 53), (54, 65)]

#sum the Churn column inside each bucket (between() includes both endpoints);
#the loop variables are named low/high so the dataset-wide min_age/max_age
#computed above are not overwritten by the last bucket
churn_counts = [
    df.loc[df['Age'].between(low, high), 'Churn'].sum()
    for low, high in age_ranges
]

#x labels are the "low-high" bounds of each bucket
x_labels = ['{}-{}'.format(low, high) for low, high in age_ranges]

#create bar plot
plt.figure(figsize=(10, 6))
bars = plt.bar(x_labels, churn_counts, color='skyblue')

#adds churn count on top of bars
for bar, count in zip(bars, churn_counts):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
             ha='center', va='bottom')

plt.title('Churn Count by Age Range')
plt.xlabel('Age Range')
plt.ylabel('Churn Count')
plt.show()

In [None]:
#features used in pair plot
features = ['Age', 'Tenure', 'Payment Delay', 'Total Spend', 'Churn']
#pairwise plots coloured by churn, with a KDE for each feature on the diagonal
sns.pairplot(df[features], hue='Churn', palette='coolwarm', diag_kind='kde')
plt.show()


In [None]:
correlation_matrix = df.corr()

#the correlation of each feature with 'Churn', strongest positive first;
#'Churn' itself is dropped (self-correlation) and so is CustomerID, which has
#nothing to do with the predictions
churn_correlation = correlation_matrix['Churn'].drop(['Churn', 'CustomerID']).sort_values(ascending=False)

#print the correlation values
print("Correlation of features with Churn:\n", churn_correlation)

#plot the correlation values
plt.figure(figsize=(10, 6))
sns.barplot(x=churn_correlation.values, y=churn_correlation.index, palette='coolwarm', hue=churn_correlation.index, dodge=False)
plt.title('Correlation of Features with Churn')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
plt.show()

In [None]:
#summary statistics again, now that the categorical columns are one-hot encoded
print(df.describe().round(2))

In [None]:
#tally how many rows carry each Churn value and turn the tally into a two-column table
churn_tally = df['Churn'].value_counts()
churn_counts = churn_tally.reset_index()
churn_counts.columns = ['Churn', 'Count']
#interactive bar chart using plotly.express, one bar per churn value
fig = px.bar(
    churn_counts,
    x='Churn',
    y='Count',
    color='Churn',
    text='Count',
    title='Distribution of Churn vs Non-Churn Customers',
    labels={'Churn': 'Churn', 'Count': 'Count'},
)
fig.show()
#helping source: https://plotly.com/python/bar-charts/

In [None]:
#Correlation Heatmap of every column against every other column
heat_fig, heat_ax = plt.subplots(figsize=(15, 12))
correlation_matrix = df.corr().round(2)
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Feature Correlation Heatmap')
plt.show()

###Splitting Data for Training and Testing

In [None]:
#keep a reference to the CustomerID column before it is dropped from the features,
#so it can be used later to print the IDs of customers predicted to churn
customer_ids = df['CustomerID']

In [None]:
#features are every column except the identifier and the target
x = df.drop(['CustomerID', 'Churn'], axis=1)
y = df['Churn']

#split features, target and customer IDs together so their rows stay aligned
(x_train, x_test,
 y_train, y_test,
 customer_ids_train, customer_ids_test) = train_test_split(
    x, y, customer_ids, random_state=1, test_size=0.25)

#prints the shape of each split
for split_name, split_data in (('x_train', x_train), ('x_test', x_test),
                               ('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name} shape:', split_data.shape)

In [None]:
#Feature Scaling
scaler = StandardScaler()
features_to_scale = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls',
                      'Payment Delay', 'Total Spend', 'Last Interaction']

#x_train/x_test were split off before this cell, so scaling df alone never reaches
#the data the models are trained on; scale the splits themselves.
#the scaler is fitted on the training rows only so test statistics do not leak in
scaler.fit(x_train[features_to_scale])

#work on explicit copies to avoid writing into a slice of another frame
x_train = x_train.copy()
x_test = x_test.copy()
x_train[features_to_scale] = scaler.transform(x_train[features_to_scale])
x_test[features_to_scale] = scaler.transform(x_test[features_to_scale])

#df is scaled with the same fitted scaler, so scaler.inverse_transform(df[features_to_scale])
#still recovers the original values
df[features_to_scale] = scaler.transform(df[features_to_scale])
df.head(5)
#helping source: udemy course (Machine Learning A-Z: AI, Python & R + ChatGPT Prize [2024])

###Training Different Algorithms

helping source: udemy course (Machine Learning A-Z: AI, Python & R + ChatGPT Prize [2024])


In [None]:
#Training the LinearSVC (fit returns the fitted estimator itself)
svm_model = LinearSVC(max_iter=10000, random_state=1).fit(x_train, y_train)

#Make the predictions on the held-out rows
svm_y_pred = svm_model.predict(x_test)

#print the prediction accuracy, confusion matrix and per-class report
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("LinearSVC Accuracy:", svm_accuracy)
print("\nLinearSVC Confusion Matrix:", confusion_matrix(y_test, svm_y_pred), sep="\n")
print("\nLinearSVC Classification Report:", classification_report(y_test, svm_y_pred), sep="\n")


In [None]:
#pair every test-set customer ID with the SVM prediction for that row
predicted_df = pd.DataFrame({'CustomerID': customer_ids_test, 'Predicted Churn': svm_y_pred})

#keep only the rows predicted as churn (1) and list their IDs in ascending order
svm_churn_mask = predicted_df['Predicted Churn'] == 1
predicted_churners = predicted_df.loc[svm_churn_mask, 'CustomerID'].sort_values()
print("Customer IDs of predicted churners using SVM model:")
for customer_id in predicted_churners:
    print("---> ",customer_id)

In [None]:
#ROC curve for LinearSVC
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_true, y_pred_scores, title):
    """Draw a green ROC curve for one model and show it.

    y_true: the true labels.
    y_pred_scores: the score per sample that roc_curve thresholds.
    title: model name appended to the plot title.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred_scores)
    area = auc(false_pos_rate, true_pos_rate)

    _fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='green', lw=2,
            label=f'ROC curve (area = {area:.2f})')
    #dashed diagonal from (0, 0) to (1, 1) for reference
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {title}')
    ax.legend(loc="lower right")
    plt.show()

#decision_function returns a confidence score per sample, used here as the ROC ranking score
svm_scores = svm_model.decision_function(x_test)
plot_roc_curve(y_test, svm_scores, 'LinearSVC')

In [None]:
#training the Naive Bayes algorithm
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

#making predictions on the held-out rows
nb_y_pred = nb_model.predict(x_test)

#printing prediction accuracy, then the confusion matrix and classification report
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_y_pred))
for heading, metric in (("\nNaive Bayes Confusion Matrix:", confusion_matrix),
                        ("\nNaive Bayes Classification Report:", classification_report)):
    print(heading)
    print(metric(y_test, nb_y_pred))

In [None]:
#pair every test-set customer ID with the Naive Bayes prediction for that row
predicted_df = pd.DataFrame({'CustomerID': customer_ids_test, 'Predicted Churn': nb_y_pred})

#IDs of the rows predicted as churn (1), in ascending order
nb_churn_rows = predicted_df[predicted_df['Predicted Churn'] == 1]
predicted_churners = nb_churn_rows['CustomerID'].sort_values()
print("Customer IDs of predicted churners using Naive Bayes:")
for customer_id in predicted_churners:
    print("---> ",customer_id)

In [None]:
def plot_roc_curve(y_true, y_pred_proba, title):
    """Draw a red ROC curve for one model and show it.

    y_true: the true labels.
    y_pred_proba: the score per sample that roc_curve thresholds.
    title: model name appended to the plot title.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    curve_label = 'ROC curve (area = {:.2f})'.format(auc(fpr, tpr))

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=curve_label, color='red', lw=2)
    #dashed diagonal from (0, 0) to (1, 1) for reference
    plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2)

    axes = plt.gca()
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title(f'ROC Curve - {title}')
    axes.legend(loc="lower right")
    plt.show()

#ROC curve for Naive Bayes
#column 1 of predict_proba is used as the score; presumably the probability of the churn class (1) - TODO confirm class order
plot_roc_curve(y_test, nb_model.predict_proba(x_test)[:, 1], 'Naive Bayes')

In [None]:
#train Decision Tree (fit returns the fitted estimator itself)
dt_model = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)

#make predictions on the held-out rows
dt_y_pred = dt_model.predict(x_test)

#printing prediction accuracy, confusion matrix and classification report
dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_confusion = confusion_matrix(y_test, dt_y_pred)
dt_report = classification_report(y_test, dt_y_pred)
print("Decision Tree Accuracy:", dt_accuracy)
print("\nDecision Tree Confusion Matrix:")
print(dt_confusion)
print("\nDecision Tree Classification Report:")
print(dt_report)

Decision Tree customer predictions

In [None]:
#printing out the customer IDs of those predicted to churn using Decision Tree
#each test-set customer ID is paired with the Decision Tree prediction for that row
predicted_df = pd.DataFrame({'CustomerID': customer_ids_test, 'Predicted Churn': dt_y_pred})

#IDs of the rows predicted as churn (1), in ascending order
predicted_churners = predicted_df.loc[predicted_df['Predicted Churn'] == 1, 'CustomerID'].sort_values()
#the heading names the model, matching the SVM and Naive Bayes listings
print("Customer IDs of predicted churners using Decision Tree:")
for customer_id in predicted_churners:
    print("---> ",customer_id)

In [None]:
def plot_roc_curve(y_true, y_pred_proba, title):
    """Draw a purple ROC curve for one model and show it.

    y_true: the true labels.
    y_pred_proba: the score per sample that roc_curve thresholds.
    title: model name appended to the plot title.
    """
    rates = roc_curve(y_true, y_pred_proba)
    false_positive, true_positive = rates[0], rates[1]
    roc_area = auc(false_positive, true_positive)

    figure = plt.figure(figsize=(8, 6))
    axis = figure.add_subplot()
    axis.plot(false_positive, true_positive, color='purple', lw=2,
              label=f'ROC curve (area = {roc_area:.2f})')
    #dashed diagonal from (0, 0) to (1, 1) for reference
    axis.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axis.set_xlim([0.0, 1.0])
    axis.set_ylim([0.0, 1.05])
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.set_title(f'ROC Curve - {title}')
    axis.legend(loc="lower right")
    plt.show()

#ROC curve for Decision Tree
#column 1 of predict_proba is used as the score; presumably the probability of the churn class (1) - TODO confirm class order
dt_y_pred_proba = dt_model.predict_proba(x_test)[:, 1]
plot_roc_curve(y_test, dt_y_pred_proba, 'Decision Tree')

###Most accurate algorithm: Decision Tree
helping source: https://www.geeksforgeeks.org/adding-value-labels-on-a-matplotlib-bar-chart/


In [None]:
#accuracy of each model on the test set, in the same order as the names
model_names = ['SVC', 'Naive Bayes', 'Decision Tree']
accuracy_scores = [
    accuracy_score(y_test, model_predictions)
    for model_predictions in (svm_y_pred, nb_y_pred, dt_y_pred)
]
#setting up the bar graph, one bar per model
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(model_names, accuracy_scores, color=['green', 'red', 'purple'])

#write each accuracy (four decimals) centred just above its bar
for bar, acc in zip(bars, accuracy_scores):
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar.get_height(), f'{acc:.4f}',
            ha='center', va='bottom', fontsize=12)

#adding title and labels; accuracy axis is fixed to the 0-1 range
ax.set_ylim([0, 1])
ax.set_ylabel('Accuracy', fontsize=14)
ax.set_xlabel('Models', fontsize=14)
ax.set_title('Comparison of Model Accuracies', fontsize=16)
plt.show()

####Applying Decision Tree algorithm to a scatter plot

In [None]:
#reverse ALL scaled features back to their original values;
#restoring only some columns would leave df on a mixed scale for every later cell
df[features_to_scale] = scaler.inverse_transform(df[features_to_scale])

features = ['Payment Delay', 'Total Spend']
target = 'Churn'

x = df[features] #splitting the dataset
y = df[target]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

dt_model_sc_plot = DecisionTreeClassifier(random_state=1)
dt_model_sc_plot.fit(x_train, y_train)#training the Decision Tree model with only these two features

#creating a mesh grid over the plotted area (payment delay 0-35, total spend 0-1100)
X, Y = np.meshgrid(np.arange(0, 35, 0.1), np.arange(0, 1100, 0.1))

#the model was fitted on a DataFrame, so the grid points get the same column names
grid = pd.DataFrame(np.c_[X.ravel(), Y.ravel()], columns=features)
Z = dt_model_sc_plot.predict(grid)
Z = Z.reshape(X.shape)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='Payment Delay', y='Total Spend', hue='Churn', palette={0: 'red', 1: 'green'}, data=df, alpha=0.6)
#shade each grid point by the class the two-feature tree predicts there
plt.contourf(X, Y, Z, alpha=0.3, cmap=plt.cm.RdYlGn)

plt.title('Payment Delay vs Total Spend with Decision Boundary')
plt.xlabel('Payment Delay')
plt.ylabel('Total Spend')
plt.legend(title='Churn', loc='lower left', labels=['No Churn', 'Churn'])
plt.show()

#report the accuracy of the two-feature model drawn above, using its own test predictions
sc_plot_y_pred = dt_model_sc_plot.predict(x_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, sc_plot_y_pred))
#helping sources: udemy course (Machine Learning A-Z: AI, Python & R + ChatGPT Prize [2024])
#https://stackoverflow.com/questions/22294241/plotting-a-decision-boundary-separating-2-classes-using-matplotlibs-pyplot

##User Input
helping source: https://blog.neurotech.africa/interactive-results-with-jupyter-notebooks/

In [None]:
#retrain a Decision Tree on all features; it serves the user-input widgets below
#NOTE(review): the scaler is fitted on whatever scale df currently has - confirm the
#numeric columns hold raw values here, since the widgets feed raw values to this scaler
x = df.drop(['CustomerID', 'Churn'], axis=1)
y = df['Churn']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.25)

#fit the scaler on the training rows only, then apply the same scaling to the test rows
features_to_scale = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction']
scaler = StandardScaler()
x_train[features_to_scale] = scaler.fit_transform(x_train[features_to_scale])
x_test[features_to_scale] = scaler.transform(x_test[features_to_scale])

dt_model_userInput = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)

#create widgets for user inputs

#one numeric text box per numeric feature, created in feature order
(age, tenure, usage_frequency, support_calls,
 payment_delay, total_spend, last_interaction) = (
    widgets.FloatText(description=field_label)
    for field_label in ('Age', 'Tenure', 'Usage Frequency', 'Support Calls',
                        'Payment Delay', 'Total Spend', 'Last Interaction')
)

#one dropdown per categorical feature
gender = widgets.Dropdown(description='Gender', options=['Female', 'Male'])
subscription_type = widgets.Dropdown(
    description='Subscription Type',
    options=['Basic', 'Premium', 'Standard'],
)
contract_length = widgets.Dropdown(
    description='Contract Length',
    options=['Annual', 'Monthly', 'Quarterly'],
)

#button to make prediction, and the output area the result is printed into
prediction_button = widgets.Button(description='Predict Churn')
output = widgets.Output()

#encoding categorical variables
def encode_categorical(gender, subscription_type, contract_length):
    """One-hot encode the three categorical inputs as a flat list of 0/1 flags.

    The flags are emitted in sorted category order within each feature
    (Female, Male / Basic, Premium, Standard / Annual, Monthly, Quarterly),
    which is the order scikit-learn's OneHotEncoder uses for its output
    columns. Emitting Male before Female would swap the two gender columns
    relative to the training data.
    NOTE(review): confirm against x_train.columns that the encoded columns
    appear in exactly this order.

    A value that matches none of the known levels encodes as all zeros for
    that feature.

    Returns a list of eight ints: 2 gender flags, 3 subscription flags,
    3 contract flags.
    """
    levels_by_feature = (
        (gender, ('Female', 'Male')),
        (subscription_type, ('Basic', 'Premium', 'Standard')),
        (contract_length, ('Annual', 'Monthly', 'Quarterly')),
    )
    return [
        1 if value == level else 0
        for value, levels in levels_by_feature
        for level in levels
    ]

#function to make prediction
def predict_churn(b):
    """Button callback: read the widgets, build one feature row and print the prediction.

    b: the clicked button, passed in by ipywidgets; not used.

    The numeric widget values are scaled with the fitted `scaler`, joined with
    the one-hot flags from `encode_categorical`, and passed to
    `dt_model_userInput`. Both the scaler and the model receive DataFrames
    carrying the column names they were fitted with (when available), so
    scikit-learn does not warn about missing feature names.
    """
    numeric_row = [[
        age.value, tenure.value, usage_frequency.value, support_calls.value,
        payment_delay.value, total_spend.value, last_interaction.value
    ]]
    #columns=None (estimator fitted without names) falls back to positional columns
    numerical_inputs = pd.DataFrame(
        numeric_row, columns=getattr(scaler, 'feature_names_in_', None))
    scaled_numerical_inputs = scaler.transform(numerical_inputs)

    categorical_inputs = encode_categorical(gender.value, subscription_type.value, contract_length.value)

    #numeric features first, then the one-hot flags
    input_features = np.concatenate([scaled_numerical_inputs.ravel(), categorical_inputs])
    model_input = pd.DataFrame(
        [input_features], columns=getattr(dt_model_userInput, 'feature_names_in_', None))
    prediction = dt_model_userInput.predict(model_input)[0]

    with output:
        #replace the previous result instead of appending to it
        output.clear_output()
        verdict = "Churn" if prediction == 1 else "No Churn"
        print(f"\n----------->Prediction: {verdict}<-----------\n\n\n")

prediction_button.on_click(predict_churn)#adding the predict_churn function to the button

#display widgets: numeric inputs, then the dropdowns, then the button and its output area
display(age, tenure, usage_frequency, support_calls, payment_delay, total_spend, last_interaction,
        gender, subscription_type, contract_length, prediction_button, output)