<a href="https://colab.research.google.com/github/Sahanmee/Machine-Learning-Coursework/blob/main/2425450.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn import tree

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the Telco customer-churn CSV into the working DataFrame.
# NOTE(review): the absolute /content/drive path presumably needs Google Drive
# to be mounted first; no mount call is visible in this notebook — confirm.
df = pd.read_csv("/content/drive/MyDrive/Machine Learning Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv")

## Quick exploration

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Dataset dimensions (rows, columns) before checking for duplicates.
# Nothing is dropped in this cell.
df.shape

In [None]:
# Collect rows that are exact copies of an earlier row.
# A shape of (0, n_columns) means the dataset has no duplicated records.
duplicates = df.loc[df.duplicated()]
duplicates.shape

## Missing Data

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
#Fixing TotalCharges
# The column can load as text; convert it to numbers and fill unparseable
# entries with the median. The guard tests "is it numeric yet?" rather than
# "is it exactly object dtype?", so a text column stored under any other
# string dtype is still converted, while re-running the cell stays a no-op.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = total_charges.fillna(total_charges.median())
else:
    print("Column 'TotalCharges' is already numeric, skipping conversion.")

In [None]:
# Preview the data after the TotalCharges conversion.
df.head(5)

## Visualizing

In [None]:
#Distribution Diagram
# One count plot per categorical column, in display order. Every plot shares
# the same recipe, so the columns are listed once and drawn in a loop instead
# of repeating the four plotting calls for each column.
count_columns = [
    "Churn", "gender", "Partner", "Dependents", "tenure", "PhoneService",
    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
]
# Columns with many or long category labels need a wider canvas than 4x4.
figsize_overrides = {"tenure": (20, 4), "PaymentMethod": (10, 4)}
# Titles default to "<column> Class Distribution"; exceptions are listed here.
title_overrides = {"gender": "Gender Class Distribution"}

for column in count_columns:
    plt.figure(figsize=figsize_overrides.get(column, (4, 4)))
    sns.countplot(data=df, x=column)
    plt.title(title_overrides.get(column, f"{column} Class Distribution"))
    plt.show()

# Continuous charge columns are shown as histograms rather than count plots.
histogram_columns = [
    ("MonthlyCharges", "Monthly Charges Distribution"),
    ("TotalCharges", "Total Charges Distribution"),
]
for column, title in histogram_columns:
    plt.figure(figsize=(4, 4))
    sns.histplot(data=df, x=column)
    plt.title(title)
    plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns only.
numeric_corr = df.select_dtypes(include=["int64", "float64"]).corr()

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Horizontal box plots of each numeric feature, split by churn outcome.
churn_boxplots = (
    ("MonthlyCharges", "Monthly Charges vs Churn"),
    ("tenure", "tenure vs Churn"),
    ("TotalCharges", "Total Charges vs Churn"),
)
for feature, plot_title in churn_boxplots:
    plt.figure(figsize=(6, 5))
    sns.boxplot(data=df, y="Churn", x=feature)
    plt.title(plot_title)
    plt.show()


## Removing Outliers

In [None]:
#Define an IQR-based outlier removal function
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, both inclusive.
    Rows whose value is NaN fail the range test and are therefore excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fences, with the original index labels.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    # Series.between is inclusive on both ends by default, i.e. >= and <=.
    inside_fences = values.between(first_quartile - 1.5 * spread,
                                   third_quartile + 1.5 * spread)
    return df[inside_fences]

In [None]:
#Remove outliers from relevant columns
# Filter on tenure first, then on TotalCharges of the surviving rows.
df_no_outliers = (
    df
    .pipe(remove_outliers_iqr, 'tenure')
    .pipe(remove_outliers_iqr, 'TotalCharges')
)

In [None]:
#Replot the boxplots
# Same views as before, drawn from the outlier-filtered frame with fliers hidden.
filtered_boxplots = (
    ("tenure", "tenure vs Churn (Outliers Removed)"),
    ("TotalCharges", "Total Charges vs Churn (Outliers Removed)"),
)
for feature, plot_title in filtered_boxplots:
    plt.figure(figsize=(6, 5))
    sns.boxplot(data=df_no_outliers, y="Churn", x=feature, showfliers=False)
    plt.title(plot_title)
    plt.show()

# **Data Preprocessing**

In [None]:
#Removing customerID and PaymentMethod Columns
# errors='ignore' keeps the cell safe to re-run once the columns are already
# gone, in line with the re-run guards used by the other preprocessing cells.
df = df.drop(columns=['customerID', 'PaymentMethod'], errors='ignore')

In [None]:
# Preview the data after dropping the two columns.
df.head(5)

In [None]:
from sklearn import preprocessing


def _encode_labels(frame, column, classes):
    """Label-encode ``frame[column]`` in place and return the encoder.

    The encoder is fitted on the explicit ``classes`` list rather than on the
    data. If the column is already numeric (for example because this cell was
    run before), it is left untouched, a notice is printed, and the returned
    encoder is unfitted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column is replaced by integer codes.
    column : str
        Name of the column to encode.
    classes : list of str
        Every label the column may contain.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The encoder created for this column.
    """
    encoder = preprocessing.LabelEncoder()
    # Test for "already numeric" rather than "exactly object dtype", so text
    # stored under any string dtype is still encoded.
    if pd.api.types.is_numeric_dtype(frame[column]):
        print(f"Column '{column}' is already numeric, skipping encoding.")
    else:
        encoder.fit(classes)
        frame[column] = encoder.transform(frame[column])
    return encoder


# LabelEncoder numbers classes in sorted order, so the resulting codes are:
yes_no = ['Yes', 'No']                                  # No = 0, Yes = 1
yes_no_internet = ['Yes', 'No', 'No internet service']  # No = 0, No internet service = 1, Yes = 2

le_gender = _encode_labels(df, 'gender', ['Female', 'Male'])  # Female = 0, Male = 1
le_partner = _encode_labels(df, 'Partner', yes_no)
le_dependents = _encode_labels(df, 'Dependents', yes_no)
le_phoneservice = _encode_labels(df, 'PhoneService', yes_no)
# No = 0, No phone service = 1, Yes = 2
le_multiplelines = _encode_labels(df, 'MultipleLines', ['Yes', 'No', 'No phone service'])
# DSL = 0, Fiber optic = 1, No = 2
le_internetservice = _encode_labels(df, 'InternetService', ['DSL', 'Fiber optic', 'No'])
le_onlinesecurity = _encode_labels(df, 'OnlineSecurity', yes_no_internet)
le_onlinebackup = _encode_labels(df, 'OnlineBackup', yes_no_internet)
le_deviceprotection = _encode_labels(df, 'DeviceProtection', yes_no_internet)
le_techsupport = _encode_labels(df, 'TechSupport', yes_no_internet)
le_streamingtv = _encode_labels(df, 'StreamingTV', yes_no_internet)
le_streamingmovies = _encode_labels(df, 'StreamingMovies', yes_no_internet)
# Month-to-month = 0, One year = 1, Two year = 2
le_contract = _encode_labels(df, 'Contract', ['Month-to-month', 'One year', 'Two year'])
le_paperlessbilling = _encode_labels(df, 'PaperlessBilling', yes_no)
le_churn = _encode_labels(df, 'Churn', yes_no)


In [None]:
# Preview the data after label encoding.
df.head(5)

## Feature Engineering

In [None]:
#TotalServices
# Count how many services each customer subscribes to.
# The flags must not be combined with `+`: adding two boolean Series yields
# another boolean Series (a logical OR), which would reduce the count to a
# yes/no "has any service" flag. Summing the flags row-wise gives an integer.
# Codes: PhoneService 'Yes' is 1; the add-on services encode 'Yes' as 2.
service_flags = pd.concat(
    [
        df['PhoneService'] == 1,
        df['OnlineSecurity'] == 2,
        df['OnlineBackup'] == 2,
        df['DeviceProtection'] == 2,
        df['TechSupport'] == 2,
        df['StreamingTV'] == 2,
        df['StreamingMovies'] == 2,
    ],
    axis=1,
)
df['TotalServices'] = service_flags.sum(axis=1)

In [None]:
#TenureGroup
# Bucket tenure into four bands. include_lowest=True keeps tenure == 0 in the
# first band; with the default half-open (0, 12] bin it would become NaN and
# then be label-encoded below as a spurious extra 'nan' group.
tenure_bands = pd.cut(df['tenure'],
                      bins=[0, 12, 24, 36, np.inf],
                      labels=['0-12', '12-24', '24-36', '36+'],
                      include_lowest=True)

# Replace the band labels with integer codes.
le_TenureGroup = preprocessing.LabelEncoder()
df['TenureGroup'] = le_TenureGroup.fit_transform(tenure_bands.astype(str))

df.head(5)

In [None]:
#ChargerCategory
# Bucket MonthlyCharges into Low (0, 50], Medium (50, 100] and High (100, inf).
charge_bands = pd.cut(
    df['MonthlyCharges'],
    bins=[0, 50, 100, np.inf],
    labels=['Low', 'Medium', 'High'],
)

# The encoder is fitted on the band names as plain strings.
le_ChargerCategory = preprocessing.LabelEncoder()
df['ChargerCategory'] = le_ChargerCategory.fit_transform(charge_bands.astype(str))

df.head(5)

In [None]:
#Splitting into Training and Testing sets
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the Churn column.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Decision Tree

In [None]:
##Training the model
# Decision tree using the Gini criterion with a fixed random_state.
# No depth or leaf-size limits are passed to the constructor.
clf = DecisionTreeClassifier(random_state=42, criterion="gini")
clf.fit(X_train, y_train)

In [None]:
##Accuracy of Test
# Predict on the held-out rows and report the fraction classified correctly.
pred_test = clf.predict(X_test)
dt_test_accuracy = accuracy_score(y_test, pred_test)
print(dt_test_accuracy)

In [None]:
##Accuracy of Train
# Same measurement on the rows the tree was fitted on.
pred_train = clf.predict(X_train)
dt_train_accuracy = accuracy_score(y_train, pred_train)
print(dt_train_accuracy)

## Classification Report

In [None]:
# Per-class precision/recall/F1 for the decision tree: test split first, then train.
dt_report_inputs = (
    ("Test", y_test, pred_test),
    ("Train", y_train, pred_train),
)
for split_name, y_true, y_pred in dt_report_inputs:
    print(f"Classification Report for Decision Tree {split_name}:")
    print(classification_report(y_true, y_pred))

## Confusion Matrix

In [None]:
# Confusion matrices for the decision tree: test split first, then train.
dt_matrix_inputs = (
    ("Test", y_test, pred_test),
    ("Train", y_train, pred_train),
)
for split_name, y_true, y_pred in dt_matrix_inputs:
    print(f"Confusion Matrix for Decision Tree {split_name}:")
    print(confusion_matrix(y_true, y_pred))

## ROC Curve + AUC

In [None]:
# Probability assigned to class 1 (second column of predict_proba).
dt_probs = clf.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, dt_probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2,
            label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Decision Tree ROC Curve')
roc_ax.legend()
roc_ax.grid()
plt.show()

## Final Tree Visualization

In [None]:
# Draw the fitted tree on a large canvas with colour-filled nodes.
tree_fig, tree_ax = plt.subplots(figsize=(20, 12))
tree.plot_tree(clf, filled=True, ax=tree_ax)
plt.show()

# **Neural Network**

In [None]:
#Build Neural Network
# Seed Python, NumPy and TensorFlow so weight initialisation and dropout are
# repeatable, matching the fixed random_state=42 used for the train/test split
# and the decision tree.
tf.keras.utils.set_random_seed(42)

model = Sequential()

# Declare the input width (one unit per feature column) with an explicit Input
# layer rather than the `input_dim` argument on the first Dense layer.
model.add(tf.keras.Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# Single sigmoid unit: the output is a churn probability in [0, 1].
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Training the model

In [None]:
# Train for 10 epochs in batches of 32, with validation_split=0.2 reserving a
# fifth of the training data for validation; per-epoch progress is printed.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

In [None]:
#Accuracy of Test
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels.
# Despite its name, nn_probs_test holds those labels, not probabilities.
nn_probs_test = (model.predict(X_test) > 0.5).astype(int)
nn_test_accuracy = accuracy_score(y_test, nn_probs_test)
print(nn_test_accuracy)

In [None]:
#Accuracy of Train
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels.
# Despite its name, nn_probs_train holds those labels, not probabilities.
nn_probs_train = (model.predict(X_train) > 0.5).astype(int)
nn_train_accuracy = accuracy_score(y_train, nn_probs_train)
print(nn_train_accuracy)

## Classification Report

In [None]:
# Per-class precision/recall/F1 for the neural network: test split first, then train.
nn_report_inputs = (
    ("Test", y_test, nn_probs_test),
    ("Train", y_train, nn_probs_train),
)
for split_name, y_true, y_pred in nn_report_inputs:
    print(f"Classification Report for Neural Network {split_name}:")
    print(classification_report(y_true, y_pred))

## Confusion Matrix

In [None]:
# Confusion matrices for the neural network: test split first, then train.
nn_matrix_inputs = (
    ("Test", y_test, nn_probs_test),
    ("Train", y_train, nn_probs_train),
)
for split_name, y_true, y_pred in nn_matrix_inputs:
    print(f"Confusion Matrix for Neural Network {split_name}:")
    print(confusion_matrix(y_true, y_pred))

## ROC Curve + AUC

In [None]:
# A ROC curve needs continuous scores. `nn_probs_test` has already been
# thresholded to 0/1 labels, which would give a single operating point and a
# misleading AUC, so fetch the raw sigmoid outputs from the model here.
nn_scores_test = model.predict(X_test).ravel()

fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, nn_scores_test)
roc_auc_nn = auc(fpr_nn, tpr_nn)

plt.figure(figsize=(8, 6))
plt.plot(fpr_nn, tpr_nn, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_nn)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Neural Network ROC Curve')
plt.legend()
plt.grid()
plt.show()

# **Comparison of Decision Tree Model and Neural Network Model**

## Store results from both models

In [None]:
# Side-by-side accuracy and AUC for both models; in each list the first entry
# is the decision tree and the second is the neural network.
results = {
    # Label spelling corrected so it matches the other comparison tables.
    "Model": ["Decision Tree", "Neural Network"],

    "Train Accuracy": [
        accuracy_score(y_train, pred_train),
        accuracy_score(y_train, nn_probs_train)
    ],

    "Test Accuracy": [
        accuracy_score(y_test, pred_test),
        accuracy_score(y_test, nn_probs_test)
    ],

    "AUC": [
        roc_auc,
        roc_auc_nn
    ]
}

results_df = pd.DataFrame(results)
print("Comparison of Decision Tree Model and Neural Network Model:")
print(results_df)

## Precision, Recall, F1-Score Comparison

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Each metric is evaluated on the test split for the decision tree, then the
# neural network, in the same order as the 'Model' column.
test_scorers = {
    'Precision': precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}
metric_columns = {'Model': ['Decision Tree', 'Neural Network']}
for metric_name, scorer in test_scorers.items():
    metric_columns[metric_name] = [
        scorer(y_test, pred_test),
        scorer(y_test, nn_probs_test),
    ]
metrics_df = pd.DataFrame(metric_columns)

print("Precision, Recall, F1-Score Comparison:")
print(metrics_df)

## Confusion Matrix

In [None]:
# Test-set confusion matrices side by side: decision tree left, neural network right.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

heatmap_panels = (
    ("Decision Tree Confusion Matrix", pred_test),
    ("Neural Network Confusion Matrix", nn_probs_test),
)
for panel_axis, (panel_title, predictions) in zip(ax, heatmap_panels):
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True, fmt="d", cmap="Blues", ax=panel_axis
    )
    panel_axis.set_title(panel_title)
    panel_axis.set_xlabel("Predicted")
    panel_axis.set_ylabel("Actual")

plt.tight_layout()
plt.show()

## ROC Curve Comparison

In [None]:
# Overlay both models' ROC curves on one figure, each labelled with its AUC.
plt.figure(figsize=(10, 6))
roc_curves = (
    (fpr, tpr, f"Decision Tree (AUC = {roc_auc:.2f})"),
    (fpr_nn, tpr_nn, f"Neural Network (AUC = {roc_auc_nn:.2f})"),
)
for false_positive_rate, true_positive_rate, curve_label in roc_curves:
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid()
plt.show()