# Spam Classification

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from pygam import LogisticGAM, s, f
from ast import literal_eval

In [None]:
import random

In [None]:
# Load the Spambase data from the UCI repository.
# Column labels are passed explicitly through `names`: 48 word-frequency
# columns, 6 character-frequency columns, 3 capital-run-length columns and,
# last, the 'Spam' class label (58 columns in total).
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
column_names = [
    # Word frequencies
    "word_freq_make", "word_freq_address", "word_freq_all",
    "word_freq_3d", "word_freq_our", "word_freq_over",
    "word_freq_remove", "word_freq_internet", "word_freq_order",
    "word_freq_mail", "word_freq_receive", "word_freq_will",
    "word_freq_people", "word_freq_report", "word_freq_addresses",
    "word_freq_free", "word_freq_business", "word_freq_email",
    "word_freq_you", "word_freq_credit", "word_freq_your",
    "word_freq_font", "word_freq_000", "word_freq_money",
    "word_freq_hp", "word_freq_hpl", "word_freq_george",
    "word_freq_650", "word_freq_lab", "word_freq_labs",
    "word_freq_telnet", "word_freq_857", "word_freq_data",
    "word_freq_415", "word_freq_85", "word_freq_technology",
    "word_freq_1999", "word_freq_parts", "word_freq_pm",
    "word_freq_direct", "word_freq_cs", "word_freq_meeting",
    "word_freq_original", "word_freq_project", "word_freq_re",
    "word_freq_edu", "word_freq_table", "word_freq_conference",
    # Character frequencies
    "char_freq_;", "char_freq_(", "char_freq_[",
    "char_freq_!", "char_freq_$", "char_freq_#",
    # Capital-letter run lengths
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
    # Class label
    "Spam",
]

data = pd.read_csv(
    url,
    names=column_names,
    sep=',',
    skipinitialspace=True,
)

In [None]:
# Count the missing (NaN) entries in every column; displayed as the cell output.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split repeatable between runs.
training_data, testing_data = train_test_split(
    data,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Separate the predictor columns from the 'Spam' label in each split.
# The original split frames are left untouched.
training_features = training_data.drop(columns='Spam')
training_classes = training_data['Spam'].copy()
testing_features = testing_data.drop(columns='Spam')
testing_classes = testing_data['Spam'].copy()

In [None]:
# Summary statistics for every column, transposed so each column of `data`
# becomes one row of the displayed table.
data.describe().T

# Deep neural network

In [None]:
# Feed-forward binary classifier for the spam data.
# Input: the 57 predictor columns (the dataset's 58th column, 'Spam', is the
#   label, not an input).
# Hidden: two ReLU layers of 36 and 32 units (sizes not tuned yet).
# Output: a single neuron, since there is one output per set of inputs.
# TODO: consider specifying an optimiser such as Nesterov momentum.

dnn_spam_model = tf.keras.Sequential([
    # `shape` must be a tuple: `(57)` is just the int 57, which newer Keras
    # versions refuse to convert to a shape. The trailing comma makes it a
    # one-element tuple.
    tf.keras.Input(shape=(57,)),
    layers.Dense(36, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
# sigmoid outputs between 0 and 1 which is perfect for probabilities

dnn_spam_model.summary()


In [None]:
# Adam optimiser for advanced gradient descent; the loss is binary
# cross-entropy because this is a binary classification problem.
#
# Seed every random number generator involved. Python's `random` module on
# its own does not seed NumPy or TensorFlow, so the NumPy and TensorFlow
# seeds are set as well.
# NOTE(review): the model is constructed in an earlier cell, so seeding here
# presumably does not affect the initial layer weights. Consider moving the
# seeding above the model construction if fully repeatable runs are needed.
random.seed(5)
np.random.seed(5)
tf.random.set_seed(5)
dnn_spam_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  loss = tf.keras.losses.BinaryCrossentropy(),
                  metrics = [tf.keras.metrics.BinaryAccuracy(name = 'acc')])

In [None]:
# Train the network for 100 epochs, with validation_split=0.2 so a fifth of
# the training rows are held back for validation.
# Class 0 carries ten times the loss weight of class 1.
# NOTE(review): presumably so that misclassifying non-spam costs more - confirm.
fit_options = dict(
    epochs=100,
    validation_split=0.2,
    class_weight={0: 10, 1: 1},
)
dnn_fit = dnn_spam_model.fit(
    x=training_features,
    y=training_classes,
    **fit_options,
)
# TODO: maybe write code to find the best number of epochs and learning rate?

In [None]:
# Collect the per-epoch metrics recorded during training into a DataFrame,
# with the epoch index appended as the last column, and display it.
history = pd.DataFrame(dnn_fit.history).assign(epoch=dnn_fit.epoch)
history

In [None]:
# Plot training accuracy against the (1-based) epoch number.
# The x-axis length is taken from the recorded history instead of being
# hard-coded, so it always matches the number of epochs that actually ran.
# A fixed range of 90 values against 100 recorded epochs makes plt.plot
# fail with a shape mismatch.
train_acc = dnn_fit.history['acc']
plt.plot(
    np.arange(1, len(train_acc) + 1),
    train_acc,
    label='Accuracy'
)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Score the held-out test set and convert the sigmoid outputs to hard 0/1
# labels: a message is labelled spam (1) only when its predicted
# probability exceeds the threshold.
# NOTE(review): 0.9993 is a very strict cut-off - presumably chosen to keep
# false positives low; confirm it was not tuned on the test set.
spam_threshold = 0.9993
dnn_test_pred = dnn_spam_model.predict(testing_features)
dnn_pred_classes = [int(p > spam_threshold) for p in np.ravel(dnn_test_pred)]

dnn_confusion = confusion_matrix(testing_classes, dnn_pred_classes)
dnn_accuracy = accuracy_score(testing_classes, dnn_pred_classes)
print(dnn_confusion)

print(f'Accuracy: {dnn_accuracy}')


# Classification GAM

In [None]:
# Fit a logistic GAM, with its default terms, on the training split.
classifier = LogisticGAM()
classifier.fit(X=training_features, y=training_classes)

In [None]:
# Predict class labels for the test set with the fitted GAM and convert them
# to integers. The conversion has to be applied to the predictions just
# computed; the name `pred_classes` is not defined anywhere and raises
# NameError.
gam_pred_classes = classifier.predict(testing_features)
gam_pred_classes = gam_pred_classes.astype(int)

In [None]:
# Show pyGAM's summary report for the fitted GAM classifier.
classifier.summary()

In [None]:
# Evaluate the GAM on the test set. Accuracy is reported next to the
# confusion matrix so the output is directly comparable with the neural
# network's evaluation.
print(confusion_matrix(testing_classes, gam_pred_classes))

print(f'Accuracy: {accuracy_score(testing_classes, gam_pred_classes)}')
