<a href="https://colab.research.google.com/github/Sahanmee/Machine-Learning-Coursework/blob/main/2425450.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn import tree

import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Location of the Telco customer-churn CSV.
# NOTE(review): the path points under /content/drive — presumably Google Drive must be mounted before this cell runs; confirm.
DATA_PATH = "/content/drive/MyDrive/Machine Learning Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

## Quick exploration

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
#Dataset dimensions as (rows, columns); nothing is dropped in this cell
df.shape

In [None]:
#Rows that are exact copies of an earlier row (the first occurrence is not flagged)
is_repeated_row = df.duplicated()
duplicates = df[is_repeated_row]
duplicates.shape
#Original author's note: there are no duplicates (0 rows expected above)

## Missing Data

In [None]:
df.isnull().sum()

In [None]:
#Fixing TotalCharges
#Guard on "not numeric" rather than "dtype == 'object'": text columns are not
#always reported as 'object' (e.g. pandas' dedicated string dtype), and in that
#case the old check would silently skip the conversion.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    #Convert to numeric, turning non-numeric entries into NaN
    total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
    #Replace the NaN values with the median of the successfully parsed values
    df['TotalCharges'] = total_charges_numeric.fillna(total_charges_numeric.median())
else:
    #Already numeric (e.g. the cell was re-run), so there is nothing to convert
    print("Column 'TotalCharges' is already numeric, skipping conversion.")

In [None]:
df.head(5)

## Visualizing

In [None]:
#Distribution Diagram
#One count plot per categorical column: (column, figure size, title), in display order.
count_plot_specs = [
    ("Churn", (4,4), "Churn Class Distribution"),
    ("gender", (4,4), "Gender Class Distribution"),
    ("Partner", (4,4), "Partner Class Distribution"),
    ("Dependents", (4,4), "Dependents Class Distribution"),
    ("tenure", (20,4), "tenure Class Distribution"), #wide figure: tenure has many distinct values
    ("PhoneService", (4,4), "PhoneService Class Distribution"),
    ("MultipleLines", (4,4), "MultipleLines Class Distribution"),
    ("InternetService", (4,4), "InternetService Class Distribution"),
    ("OnlineSecurity", (4,4), "OnlineSecurity Class Distribution"),
    ("OnlineBackup", (4,4), "OnlineBackup Class Distribution"),
    ("DeviceProtection", (4,4), "DeviceProtection Class Distribution"),
    ("TechSupport", (4,4), "TechSupport Class Distribution"),
    ("StreamingTV", (4,4), "StreamingTV Class Distribution"),
    ("StreamingMovies", (4,4), "StreamingMovies Class Distribution"),
    ("Contract", (4,4), "Contract Class Distribution"),
    ("PaperlessBilling", (4,4), "PaperlessBilling Class Distribution"),
    ("PaymentMethod", (10,4), "PaymentMethod Class Distribution"),
]

for plot_column, plot_size, plot_title in count_plot_specs:
    plt.figure(figsize=plot_size) #Figure size of the plot
    sns.countplot(data=df, x=plot_column) #Count plot for the column
    plt.title(plot_title) #Title of the plot
    plt.show() #Display the plot

#Histograms for the two continuous charge columns: (column, title)
histogram_specs = [
    ("MonthlyCharges", "Monthly Charges Distribution"),
    ("TotalCharges", "Total Charges Distribution"),
]

for plot_column, plot_title in histogram_specs:
    plt.figure(figsize=(4,4))
    sns.histplot(data=df, x=plot_column)
    plt.title(plot_title)
    plt.show()

In [None]:
#Pairwise correlations between the int64/float64 columns only
numeric_correlations = df.select_dtypes(include=["int64", "float64"]).corr()

plt.figure(figsize=(12,8)) #Figure size of the heatmap
#annot=True writes each correlation value inside its cell
#cmap="coolwarm" runs from blue (negative) to red (positive)
sns.heatmap(numeric_correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap") #Title of the heatmap
plt.show() #Displays the heatmap

In [None]:
#Boxplots comparing the distribution of each numeric column between the Churn categories (Yes/No).
#They help to see whether churners tend to have higher or lower values.
#Each entry is (column on the x axis, plot title).
churn_boxplot_specs = (
    ("MonthlyCharges", "Monthly Charges vs Churn"),
    ("tenure", "tenure vs Churn"),
    ("TotalCharges", "Total Charges vs Churn"),
)

for box_column, box_title in churn_boxplot_specs:
    plt.figure(figsize=(6,5)) #Figure size of the boxplot
    sns.boxplot(data=df, y="Churn", x=box_column)
    plt.title(box_title) #Title of the boxplot
    plt.show() #Display the boxplot

# Removing Outliers

In [None]:
#This function calculates the lower and upper bounds using the IQR, checks for outliers outside these bounds, and removes them from the dataset.
#In the boxplot, showfliers=False is used to hide points near the range line (whiskers) that may visually appear as outliers, making the plot cleaner.

In [None]:
#Define an IQR-based outlier removal function
#Q1 and Q3 are calculated for the column
#IQR = Q3 - Q1 is used to define the "normal" data range
#Lower bound = Q1 - 1.5*IQR, Upper bound = Q3 + 1.5*IQR
#Any values outside this range are considered outliers and removed

def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``IQR = Q3 - Q1``. The fences and the column's min/max are printed so they
    can be compared by eye.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the original hard-coded 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows with ``lower_bound <= value <= upper_bound``.
        Rows whose value is NaN fail both comparisons and are therefore
        dropped as well.
    """
    Q1 = df[column].quantile(0.25) #25th percentile of the selected column
    Q3 = df[column].quantile(0.75) #75th percentile of the selected column
    IQR = Q3 - Q1 #Interquartile range
    lower_bound = Q1 - factor * IQR #Lower fence
    upper_bound = Q3 + factor * IQR #Upper fence
    print("Lower bound:", lower_bound) #Prints the calculated lower bound
    print("Upper bound:", upper_bound) #Prints the calculated upper bound
    print("Min:", df[column].min(), "Max:", df[column].max()) #Column extremes, to compare with the fences

    within_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df[within_fences] #Only the rows inside the fences (inclusive)


#Apply the function to tenure and then TotalCharges to filter the dataset.
#This ensures that extreme values do not distort the analysis or plots.
#Original author's note: for this dataset the IQR bounds are wide enough that no rows are
#actually removed, i.e. there are no extreme outliers by this standard.
#The second pass runs on the output of the first, so the filters are cumulative.
df_no_outliers = df
for outlier_column in ('tenure', 'TotalCharges'):
    df_no_outliers = remove_outliers_iqr(df_no_outliers, outlier_column)

In [None]:
#Replot the tenure and TotalCharges boxplots on the IQR-filtered data.
#showfliers=False tells seaborn not to draw the individual points beyond the whiskers,
#which keeps the plot focused on the main distribution.
#Each entry is (column on the x axis, plot title).
filtered_boxplot_specs = (
    ("tenure", "tenure vs Churn (Outliers Removed)"),
    ("TotalCharges", "Total Charges vs Churn (Outliers Removed)"),
)

for box_column, box_title in filtered_boxplot_specs:
    plt.figure(figsize=(6,5)) #Figure size of the boxplot
    sns.boxplot(data=df_no_outliers, y="Churn", x=box_column, showfliers=False)
    plt.title(box_title) #Title of the boxplot
    plt.show() #Displays the boxplot

# **Data Preprocessing**

In [None]:
#Removing customerID and PaymentMethod Columns
#errors='ignore' makes the cell safe to re-run: once the columns are gone a second
#run is a no-op instead of a KeyError (the other preprocessing cells are guarded the same way).
df = df.drop(['customerID', 'PaymentMethod'], axis=1, errors='ignore')

In [None]:
df.head(5)

In [None]:
from sklearn import preprocessing

# Known category levels for every text column that is label-encoded.
# LabelEncoder sorts the classes it is fitted on, so the integer codes follow
# the sorted order of the levels, not the order they are listed in:
#   gender                 Female = 0, Male = 1
#   Yes/No columns         No = 0, Yes = 1
#   MultipleLines          No = 0, No phone service = 1, Yes = 2
#   InternetService        DSL = 0, Fiber optic = 1, No = 2
#   internet add-ons       No = 0, No internet service = 1, Yes = 2
#   Contract               Month-to-month = 0, One year = 1, Two year = 2
YES_NO_LEVELS = ['Yes', 'No']
INTERNET_ADDON_LEVELS = ['Yes', 'No', 'No internet service']

CATEGORY_LEVELS = {
    'gender': ['Female', 'Male'],
    'Partner': YES_NO_LEVELS,
    'Dependents': YES_NO_LEVELS,
    'PhoneService': YES_NO_LEVELS,
    'MultipleLines': ['Yes', 'No', 'No phone service'],
    'InternetService': ['DSL', 'Fiber optic', 'No'],
    'OnlineSecurity': INTERNET_ADDON_LEVELS,
    'OnlineBackup': INTERNET_ADDON_LEVELS,
    'DeviceProtection': INTERNET_ADDON_LEVELS,
    'TechSupport': INTERNET_ADDON_LEVELS,
    'StreamingTV': INTERNET_ADDON_LEVELS,
    'StreamingMovies': INTERNET_ADDON_LEVELS,
    'Contract': ['Month-to-month', 'One year', 'Two year'],
    'PaperlessBilling': YES_NO_LEVELS,
    'Churn': YES_NO_LEVELS,
}

# One encoder per column, keyed by column name.
label_encoders = {}
for column_name, levels in CATEGORY_LEVELS.items():
    encoder = preprocessing.LabelEncoder() #Create a LabelEncoder object for encoding categorical values
    # Encode only while the column still holds text. Testing for "not numeric"
    # (rather than dtype == 'object') also covers pandas' dedicated string dtype.
    if not pd.api.types.is_numeric_dtype(df[column_name]):
        encoder.fit(levels) #Fit the encoder with the known categories
        df[column_name] = encoder.transform(df[column_name]) #Transform the column into numeric labels
    else:
        #Already numeric (e.g. the cell was re-run): skip; the encoder stays unfitted
        print(f"Column '{column_name}' is already numeric, skipping encoding.")
    label_encoders[column_name] = encoder

# Keep the original per-column encoder names available.
le_gender = label_encoders['gender']
le_partner = label_encoders['Partner']
le_dependents = label_encoders['Dependents']
le_phoneservice = label_encoders['PhoneService']
le_multiplelines = label_encoders['MultipleLines']
le_internetservice = label_encoders['InternetService']
le_onlinesecurity = label_encoders['OnlineSecurity']
le_onlinebackup = label_encoders['OnlineBackup']
le_deviceprotection = label_encoders['DeviceProtection']
le_techsupport = label_encoders['TechSupport']
le_streamingtv = label_encoders['StreamingTV']
le_streamingmovies = label_encoders['StreamingMovies']
le_contract = label_encoders['Contract']
le_paperlessbilling = label_encoders['PaperlessBilling']
le_churn = label_encoders['Churn']


In [None]:
df.head(5)

## **Feature Engineering**

In [None]:
#TenureGroup
#Bucket tenure into four bands. include_lowest=True closes the first bin on the
#left so that tenure == 0 falls in '0-12'. Without it a zero tenure lies outside
#every bin, becomes NaN, and astype(str) then turns it into an extra 'nan' class.
tenure_bands = pd.cut(df['tenure'],
                      bins=[0, 12, 24, 36, np.inf],
                      labels=['0-12', '12-24', '24-36', '36+'],
                      include_lowest=True)

#Encode the band labels as integers (classes are sorted: '0-12', '12-24', '24-36', '36+')
le_TenureGroup = preprocessing.LabelEncoder()
df['TenureGroup'] = le_TenureGroup.fit_transform(tenure_bands.astype(str))

df.head(5)

In [None]:
#ChargerCategory
#Bucket MonthlyCharges into (0, 50], (50, 100] and (100, inf) bands
monthly_charge_bands = pd.cut(df['MonthlyCharges'],
                              bins=[0, 50, 100, np.inf],
                              labels=['Low', 'Medium', 'High'])

#Encode the band labels as integers.
#NOTE(review): LabelEncoder sorts its classes, so the codes should come out as
#High = 0, Low = 1, Medium = 2 rather than in Low < Medium < High order — confirm this is intended.
le_ChargerCategory = preprocessing.LabelEncoder()
df['ChargerCategory'] = le_ChargerCategory.fit_transform(monthly_charge_bands.astype(str))

df.head(5)

In [None]:
#Splitting into Training and Testing sets
X = df.drop('Churn', axis=1) #Features: every column except the target
y = df['Churn'] #Target

from sklearn.model_selection import train_test_split
#80/20 split with a fixed seed. stratify=y keeps the churn / non-churn ratio the
#same in the train and test subsets, so metrics on the test set are not skewed by
#an unlucky class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
#Rechecking Missing Values
df.isnull().sum()

In [None]:
#Rechecking Duplicate Values
#Rows that repeat an earlier row exactly, now that customerID has been dropped
is_repeated_row = df.duplicated()
duplicates = df[is_repeated_row]
duplicates.shape

In [None]:
#Removing Duplicate Values
#NOTE: X, y and the train/test split were built from df in an earlier cell, so this
#cleanup only changes df itself, not the data the models are trained on.
df = df.drop_duplicates() #Remove exact duplicates first
df = df.drop_duplicates(subset=df.columns.difference(['TotalCharges'])) #Remove near-duplicates ignoring 'TotalCharges'
#The former `df = df.iloc[0:0]` step was removed: it discarded every remaining row
#and left df empty.
print(df.shape)

## Decision Tree

In [None]:
#Hyper-parameter combinations to try for the decision tree
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
)

#Seeded tree so repeated runs give the same result
dt_clf = DecisionTreeClassifier(random_state=42)

#5-fold cross-validated search scored by accuracy, run in a single process
grid_search = GridSearchCV(
    dt_clf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)

grid_search.fit(X_train, y_train)
print("Complete fiiting of Grid Search")

In [None]:
#Winning hyper-parameter combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
#Keep the estimator the search exposes as best_estimator_; all later tree predictions use it
tuned_dt = grid_search.best_estimator_
print(tuned_dt)

In [None]:
#Predicted class labels for the training set (used to compare train vs test performance)
tuned_dt_pred_train = tuned_dt.predict(X_train)

In [None]:
#Accuracy of the tuned tree on the training set
print(accuracy_score(y_train, tuned_dt_pred_train))

In [None]:
#Predicted class labels for the held-out test set
tuned_dt_pred_test = tuned_dt.predict(X_test)

In [None]:
#Accuracy of the tuned tree on the test set
print(accuracy_score(y_test, tuned_dt_pred_test))

## Classification Report

In [None]:
#Per-class precision / recall / F1 for the tuned tree: test set first, then training set
dt_report_inputs = (
    ("Classification Report for Decision Tree Test:", y_test, tuned_dt_pred_test),
    ("Classification Report for Decision Tree Train:", y_train, tuned_dt_pred_train),
)
for report_heading, actual_labels, predicted_labels in dt_report_inputs:
    print(report_heading)
    print(classification_report(actual_labels, predicted_labels))

## Confusion Matrix

In [None]:
#Confusion matrices for the tuned tree: test set first, then training set
dt_matrix_inputs = (
    ("Confusion Matrix for Decision Tree Test:", y_test, tuned_dt_pred_test),
    ("Confusion Matrix for Decision Tree Train:", y_train, tuned_dt_pred_train),
)
for matrix_heading, actual_labels, predicted_labels in dt_matrix_inputs:
    print(matrix_heading)
    print(confusion_matrix(actual_labels, predicted_labels))

## ROC Curve + AUC

In [None]:
#Probability of the positive class (column 1 of predict_proba) for every test row
dt_probs = tuned_dt.predict_proba(X_test)[:, 1]

#False Positive Rate, True Positive Rate and thresholds, then the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, dt_probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6)) #Figure size of the ROC curve
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0,1], [0,1], linestyle="--") #Diagonal reference line for a random classifier
roc_ax.set_xlim([0.0, 1.0]) #Axis limits
roc_ax.set_ylim([0.0, 1.05]) #Axis limits
roc_ax.set_xlabel('False Positive Rate') #X axis label
roc_ax.set_ylabel('True Positive Rate') #Y axis label
roc_ax.set_title('Decision Tree ROC Curve') #Title of the ROC curve
#Legend and grid for better readability
roc_ax.legend()
roc_ax.grid()
plt.show() #Displays the ROC curve

## Final Tree Visualization

In [None]:
tree_fig, tree_ax = plt.subplots(figsize=(20,12)) #Figure size of the Tree
#tuned_dt: the tuned decision tree classifier
#filled=True: colour the nodes
tree.plot_tree(tuned_dt, filled=True, ax=tree_ax)
plt.show() #Displays the Tree

# **Neural Network**

In [None]:
# Build Neural Network - Model definition function (IMPORTANT CHANGE)
def create_model(optimizer='adam'):
    """Build and compile a fresh feed-forward binary classifier.

    A new, untrained model is returned on every call. The input width is read
    from the notebook-level ``X_train`` (its number of columns).

    Parameters
    ----------
    optimizer : str or Keras optimizer, default 'adam'
        Passed straight to ``model.compile``.

    Returns
    -------
    A compiled ``tf.keras`` Sequential model: Dense(128, relu) -> Dropout(0.2)
    -> Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid), trained with
    binary cross-entropy and reporting accuracy.
    """
    n_features = X_train.shape[1]
    network = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, input_dim=n_features, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
import numpy as np

class KerasBinaryClassifier(ClassifierMixin, BaseEstimator):
    """Custom scikit-learn estimator for Keras binary classification models.

    The mixin is listed before ``BaseEstimator`` as scikit-learn's developer
    guide asks, so the estimator is recognised as a classifier.

    Parameters
    ----------
    build_fn : callable, default create_model
        Called as ``build_fn(optimizer=...)``; must return a compiled Keras model.
    epochs : int, default 10
        Number of training epochs passed to ``model.fit``.
    batch_size : int, default 32
        Batch size passed to ``model.fit``.
    optimizer : str, default 'adam'
        Forwarded to ``build_fn``.
    verbose : int, default 0
        Verbosity passed to ``model.fit``.

    Attributes (set by ``fit`` only)
    --------------------------------
    model_ : the trained Keras model
    history_ : the object returned by ``model.fit``
    classes_ : unique labels seen in ``y``
    n_classes_ : number of unique labels
    """

    def __init__(self, build_fn=create_model, epochs=10, batch_size=32, optimizer='adam', verbose=0):
        # Only hyper-parameters are stored here. Fitted attributes (trailing
        # underscore) are created in fit(), so an unfitted instance has none of them.
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.verbose = verbose

    def fit(self, X, y, validation_data=None, callbacks=None):
        """Build a fresh model and train it on ``X``, ``y``.

        ``validation_data`` and ``callbacks`` are forwarded to Keras ``model.fit``
        (``callbacks=None`` becomes an empty list). Returns ``self``.
        """
        # Input validation
        X, y = check_X_y(X, y)

        # Create a fresh model instance
        self.model_ = self.build_fn(optimizer=self.optimizer)

        # Store class information
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        # Train the model
        self.history_ = self.model_.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=validation_data,
            callbacks=callbacks if callbacks else [],
            verbose=self.verbose,
            shuffle=True
        )

        return self

    def predict(self, X):
        """Return hard 0/1 predictions: 1 where the model output exceeds 0.5."""
        check_is_fitted(self, ['model_', 'classes_'])
        X = check_array(X)

        predictions = self.model_.predict(X, verbose=0)
        return (predictions > 0.5).astype(int).flatten()

    def predict_proba(self, X):
        """Return an (n_samples, 2) array: column 0 is ``1 - p``, column 1 is ``p``."""
        check_is_fitted(self, ['model_', 'classes_'])
        X = check_array(X)

        proba = self.model_.predict(X, verbose=0)
        # For binary classification: return probabilities for both classes
        return np.hstack([1 - proba, proba])

    def score(self, X, y):
        """Return the mean accuracy on the given test data and labels."""
        y_pred = self.predict(X)
        # Flatten y so a column vector cannot broadcast against the 1-D predictions
        y_true = np.asarray(y).ravel()
        return np.mean(y_pred == y_true)

# Create the custom estimator (REPLACES KerasClassifier)
nn_classifier = KerasBinaryClassifier(
    build_fn=create_model,
    epochs=10,
    batch_size=32,
    optimizer='adam',
    verbose=1
)

# Hyper-parameter combinations to try
param_grid = {'batch_size': [32, 64],
              'epochs': [10, 20],
              'optimizer': ['adam', 'rmsprop']
              }

# Stop when the validation loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Hold out part of the TRAINING data for early stopping. Using (X_test, y_test)
# here would let the test set choose the stopping epoch and the restored weights,
# so the test metrics reported afterwards would be optimistic.
X_nn_fit, X_nn_val, y_nn_fit, y_nn_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

grid_search = GridSearchCV(
    estimator=nn_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=1
)

# validation_data and callbacks are forwarded to KerasBinaryClassifier.fit
grid_result = grid_search.fit(
    X_nn_fit, y_nn_fit,
    validation_data=(X_nn_val, y_nn_val),
    callbacks=[early_stopping]
)

print("Best Parameters:", grid_result.best_params_)
print("Best Score:", grid_result.best_score_)

# The best estimator from GridSearchCV
best_nn_model = grid_result.best_estimator_

# Hard 0/1 predictions from the best estimator (predict() thresholds at 0.5)
nn_probs_test = best_nn_model.predict(X_test)
nn_probs_train = best_nn_model.predict(X_train)

# Now continue with your accuracy calculations, classification reports, etc.
# Use nn_probs_test and nn_probs_train as before

# Now continue with your accuracy calculations, classification reports, etc.
# Use nn_probs_test and nn_probs_train as before

# Training the model

In [None]:
#Underlying Keras model held by the best estimator (its model_ attribute is assigned in KerasBinaryClassifier.fit)
best_model = grid_search.best_estimator_.model_

In [None]:
#Accuracy of Test
#Threshold the model output at 0.5; despite its name, nn_probs_test holds 0/1 labels from here on
nn_probs_test = (best_model.predict(X_test) > 0.5).astype(int)
print(accuracy_score(y_test, nn_probs_test))

In [None]:
#Accuracy of Train
#Threshold the model output at 0.5; despite its name, nn_probs_train holds 0/1 labels from here on
nn_probs_train = (best_model.predict(X_train) > 0.5).astype(int)
print(accuracy_score(y_train, nn_probs_train))

# Classification

In [None]:
#Per-class precision / recall / F1 for the neural network: test set first, then training set
nn_report_inputs = (
    ("Classification Report for Neural Network Test:", y_test, nn_probs_test),
    ("Classification Report for Neural Network Train:", y_train, nn_probs_train),
)
for report_heading, actual_labels, predicted_labels in nn_report_inputs:
    print(report_heading)
    print(classification_report(actual_labels, predicted_labels))

# Confusion Matrix

In [None]:
#Confusion matrices for the neural network: test set first, then training set
nn_matrix_inputs = (
    ("Confusion Matrix for Neural Network Test:", y_test, nn_probs_test),
    ("Confusion Matrix for Neural Network Train:", y_train, nn_probs_train),
)
for matrix_heading, actual_labels, predicted_labels in nn_matrix_inputs:
    print(matrix_heading)
    print(confusion_matrix(actual_labels, predicted_labels))

# ROC Curve + AUC

In [None]:
#A ROC curve needs continuous scores. nn_probs_test was thresholded to 0/1 in the
#accuracy cell, which would collapse the curve to a single operating point, so the
#sigmoid outputs are taken from the model again here.
nn_scores_test = best_model.predict(X_test).ravel() #probability of class 1 for every test row

fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, nn_scores_test) #FPR, TPR and thresholds
roc_auc_nn = auc(fpr_nn, tpr_nn) #Area under the ROC curve

plt.figure(figsize=(8, 6)) #Figure size of the ROC curve
plt.plot(fpr_nn, tpr_nn, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_nn)
plt.plot([0,1], [0,1], linestyle="--") #Diagonal reference line for a random classifier
plt.xlim([0.0, 1.0]) #Axis limits
plt.ylim([0.0, 1.05]) #Axis limits
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Neural Network ROC Curve')
plt.legend()
plt.grid()
plt.show()

# **Comparison of Decision Tree Model and Neural Network Model**

# Store results from both models

In [None]:
#Headline numbers for both models: train/test accuracy plus the test-set AUC
#computed in the two ROC cells. The first label previously read "Descision";
#it now matches the label used in the precision/recall table.
results = {
    "Model": ["Tuned Decision Tree","Tuned Neural Network"],

    "Train Accuracy": [
        accuracy_score(y_train, tuned_dt_pred_train),
        accuracy_score(y_train, nn_probs_train)
    ],

    "Test Accuracy": [
        accuracy_score(y_test, tuned_dt_pred_test),
        accuracy_score(y_test, nn_probs_test)
    ],

    "AUC": [
        roc_auc,
        roc_auc_nn
    ]
}

results_df = pd.DataFrame(results)
print("Comparison of Decision Tree Model and Neural Network Model:")
print(results_df)

# Precision, Recall, F1-Score Comparison

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

#Test-set predictions in the same order as the 'Model' column below
test_set_predictions = [tuned_dt_pred_test, nn_probs_test]

#One row per model; each metric is computed on the test set
metrics_df = pd.DataFrame({
    'Model': ['Tuned Decision Tree', 'Tuned Neural Network'],
    'Precision': [precision_score(y_test, predicted) for predicted in test_set_predictions],
    "Recall": [recall_score(y_test, predicted) for predicted in test_set_predictions],
    "F1-Score": [f1_score(y_test, predicted) for predicted in test_set_predictions],
})

print("Precision, Recall, F1-Score Comparison:")
print(metrics_df)

# Confusion Matrix

In [None]:
#Create a figure with 1 row and 2 columns for side-by-side plots
fig, ax = plt.subplots(1, 2, figsize=(12,5))

#Left panel: Decision Tree, right panel: Neural Network (both on the test set)
confusion_panels = (
    ("Decision Tree Confusion Matrix", tuned_dt_pred_test),
    ("Neural Network Confusion Matrix", nn_probs_test),
)

for panel_ax, (panel_title, predicted_labels) in zip(ax, confusion_panels):
    #annot=True - Show the numbers in each cell
    #fmt="d" - Format as integers
    #cmap="Blues" - Color map
    #ax=panel_ax - Draw into this panel's subplot
    sns.heatmap(
        confusion_matrix(y_test, predicted_labels), #Compute confusion matrix
        annot=True, fmt="d", cmap="Blues", ax=panel_ax
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Predicted")
    panel_ax.set_ylabel("Actual")

plt.tight_layout() #Adjust layout to prevent overlapping title/labels
plt.show()

# ROC Curve Comparison

In [None]:
#Overlay both models' test-set ROC curves, using the values computed in the two ROC cells
comparison_fig, comparison_ax = plt.subplots(figsize=(10, 6))
comparison_ax.plot(fpr, tpr, label=f"Decision Tree (AUC = {roc_auc:.2f})")
comparison_ax.plot(fpr_nn, tpr_nn, label=f"Neural Network (AUC = {roc_auc_nn:.2f})")
comparison_ax.plot([0, 1], [0, 1], linestyle='--', color='gray') #Diagonal reference line for a random classifier
comparison_ax.set_xlabel('False Positive Rate')
comparison_ax.set_ylabel('True Positive Rate')
comparison_ax.set_title('ROC Curve Comparison')
comparison_ax.legend()
comparison_ax.grid()
plt.show()