Email Spam Classification
Rylan Harris

Details about the dataset: https://www.kaggle.com/datasets/colormap/spambase/data

## Importing Libararies and Data

In [1]:
import pandas as pd
import numpy as np
import math

# This was imported via a connected Google Drive. Importing the data otherwise may be required
source_dataset = pd.read_csv("/content/drive/MyDrive/CS470/spambase.csv")
source_dataset.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Splitting Test Data

splitting into randomly shuffled, evenly split sets for 5-cross validation

In [2]:
SPLIT = 5

split_data = np.array_split(source_dataset.sample(frac = 1), SPLIT)

##Naive Bayes

My first implementation was based of lectures that I found online and wasn't in line with the type of Naive Bayes system that we covered in class. I believe I implemented everything correctly with the Naive Bayes system though it isn't very well performing so I don't doubt I may have done something wrong.

### Naive Bayes ver. 1 (deprecated)

In [3]:
# class Naive_Bayes():

#     def fit(self, features: np.ndarray, target: np.ndarray) -> None:
#         n_samples, n_features = features.shape

#         self.classes = np.unique(target)
#         n_classes = len(self.classes)

#         self.means = np.zeros((n_classes, n_features))
#         self.variances = np.zeros((n_classes, n_features))
#         self.priors = np.zeros(n_classes)

#         for index, group in enumerate(self.classes):
#             class_features = features[target == group]

#             self.means[index, :] = class_features.mean( axis= 0 )
#             self.variances[index, :] = class_features.var( axis= 0 )
#             self.priors[index] = class_features.size / float(n_samples)

#     def predict(self, input_set: np.ndarray) -> np.array:
#         predictions = []

#         for input in input_set:

#             posteriors = []

#             for index, feature in enumerate(self.classes):
#                 prior = np.log(self.priors[index])
#                 posterior = np.sum(np.log(self.prob_function(index, input)))

#                 posteriors.append(posterior + prior)

#             predictions.append( self.classes[np.argmax(posteriors)] )

#         return np.array(predictions)

#     def prob_function(self, class_index, value):
#         mean = self.means[class_index]
#         variance = self.variances[class_index]

#         numerator = np.exp(-((value - mean) ** 2) / (2 * variance))
#         denominator = np.sqrt(2 * np.pi * variance)

#         return numerator/denominator


#     def __str__(self):
#         return "Naive Bayes Model"

###Naive bayes ver. 2


   

In [4]:
def data_correction(input_dataset):
    return input_dataset[:,:-3].astype(bool)

class Naive_Bayes():
    def __init__(self, threshold = .5):
        self.threshold = threshold

    def fit(self, features: np.ndarray, target: np.ndarray) -> None:
        n_samples, n_features = features.shape

        # adjust features to exclude non-applicable features and get occurence bool instead of frequency
        features_adj = data_correction(features)

        # calculate overall probability
        self.p_c = sum(target) / n_samples
        # calculate probability for each feature with Laplace smoothing
        self.probabilities = np.apply_along_axis(lambda x: ((sum(x) + 1) / (n_samples + 1)) , 0, features_adj)

    def predict_prob(self, input_set: np.ndarray) -> np.array:
        input_adj = data_correction(input_set)

        predictions = np.apply_along_axis(lambda x: np.prod( self.probabilities[x] / self.p_c ),
                                          1, input_adj)

        return np.array(predictions)

    def predict(self, input_set: np.ndarray) -> np.array:
        y_prob = self.predict_prob(input_set)
        return np.array([1 if prob > self.threshold else 0 for prob in y_prob])

    def __str__(self):
        return "Naive Bayes Model"

## Logistic Regression

I never quite got this to function properly with performance feedback but it seems to function well despite that. There is some issue with warnings from the sigmoid function getting values that are too large and I'm not sure how to address that. Overall, I think this implementation is satisfactory but not quick how you had wanted us to make it.

In [5]:
# I think there's some problem with this because it regularly throws warnings
def sigmoid(x: np.ndarray) -> np.ndarray:
    return 1/(1 + np.exp(-x))

class LogReg:
    def __init__(self, learning_rate = 0.001, n_iters=10000, threshold = .5):
        self.learning_rate: float = learning_rate
        self.n_iters: int = n_iters
        self.weights: np.ndarray  = None
        self.pred_prob: np.ndarray  = None
        self.threshold = threshold

    def fit(self, features: np.ndarray, target: np.ndarray) -> None:
        n_samples, n_features = features.shape
        X: np.ndarray = np.c_[np.ones_like(target), features]

        self.weights = np.random.randn(n_features + 1)
        best_performance = 0

        for _ in range(self.n_iters):
            pred_y = self.predict_prob(features)

            gradient_M = (2 / n_samples) * np.dot((pred_y - target), X)

            # not currently performance dependent
            # performance = calculate_f1_score(confusion_matrix(pred_y, target))

            # if performance > best_performance:
            #     best_performance = performance
            #     self.best_weights = self.weights

            self.weights = self.weights - (self.learning_rate * gradient_M)

    def predict_prob(self, input: np.ndarray) -> np.ndarray:
        X: np.ndarray = np.c_[np.ones(input.shape[0]), input]
        return sigmoid(np.dot(X, self.weights))

    def predict(self, input: np.ndarray) -> np.ndarray:
        X: np.ndarray = np.c_[np.ones(input.shape[0]), input]
        y_prob = self.predict_prob(input)
        return np.array([1 if prob > self.threshold else 0 for prob in y_prob])

    def __str__(self):
        return "Logistic Regression Model"

## KNN

I likely made a mistake in how I'm using cosine similarity so it doesn't work and I just fell back on using euclidean distance instead. I could definitely refine doing the majority rule counting of the nearest neighbors but left this as is because it works. I'm sure there is something I could improve here to make it perform a bit more quickly as it is clearly the slowest classifier here but I imagine some amount of that is unavoidable.

In [6]:
def calculate_distance(input1: float, input2: float) -> float:
    return np.sqrt( np.sum((input1 - input2) ** 2) )

def calc_distance_cos(input1, input2):
    return np.dot(input1, input2)/(np.linalg.norm(input1)*np.linalg.norm(input2))

class KNN:
    def __init__(self, k=5, threshold = .5):
        self.k = k
        self.threshold = threshold

    def fit(self, features: np.ndarray, target: np.ndarray) -> None:
        self.X = features
        self.Y = target

    def predict_prob(self, input_set: np.ndarray) -> np.array:
        predictions = []

        for input in input_set:

            # calculate distances, I am using euclidean because I either made
            # some mistake in cosine similarity (likely) or it just works worse
            distances = np.array([calculate_distance(input, x) for x in self.X])

            # get the closests k
            k_indices = np.argsort(distances)[:self.k]
            k_labels = [self.Y[k] for k in k_indices]

            # get label with majority vote
            unique_labels, counts = np.unique(k_labels, return_counts = True)

            counts_dict: dict = dict()
            for index, label in enumerate(unique_labels):
                counts_dict.update({label: counts[index]})

            if 1 in counts_dict.keys():
                predictions.append( counts_dict[1] / self.k )
            else:
                predictions.append( 0 )

        return np.array(predictions)

    def predict(self, input_set: np.ndarray) -> np.array:
        y_prob = self. predict_prob(input_set)
        return np.array([1 if prob > self.threshold else 0 for prob in y_prob])

    def __str__(self):
        return "K Nearest Neighbors Model"

## Performance Assessment

I had to do some janky patchwork to get the f1 score to work when the classifier predicted all to be one of the classes. That could definitely be improved. I also didn't find a great resource for how to implement ROC_AUC so I didn't implement that.

In [7]:
def accuracy(y_pred, y_actual):
    return np.sum(y_pred==y_actual)/len(y_actual)

def confusion_matrix(y_pred: np.ndarray, y_actual: np.ndarray):
    y_actu = pd.Series(y_actual, name='Actual')
    y_pred = pd.Series(y_pred, name='Predicted')

    return pd.crosstab(y_actu, y_pred)

def target_split(input_dataset, target_col = "spam"):
    target = input_dataset[target_col].to_numpy()
    features = input_dataset.loc[:, input_dataset.columns != target_col].to_numpy()

    return features, target

def calculate_f1_score(conf_matrix):
    tn = conf_matrix[0][0]
    fn = conf_matrix[0][1]

    # this isn't guaranteed to work
    if (conf_matrix.shape[1] > 1):
        fp = conf_matrix[1][0]
        tp = conf_matrix[1][1]
    else:
        fp = 0
        tp = 0

    return (2* tp) / (2 * tp + fp + fn)

# couldn't figure out the methodology for this
def calculate_roc_auc(y_pred: np.ndarray, y_actual: np.ndarray):
    pass

def calculate_metrics(y_pred: np.ndarray, y_actual: np.ndarray):

    conf_matrix = confusion_matrix(y_pred, y_actual)
    print(conf_matrix)

    f1_score = calculate_f1_score(conf_matrix)

    print(f"\nf1 score: {f1_score}")

    print(f"Accuracy: {accuracy(y_pred, y_actual)}\n")

methods  = [Naive_Bayes, LogReg, KNN]

## Testing

I iterate through each testing/training split and the through each method and display the results of those models here.

In [8]:
for index, testing_set in enumerate(split_data):
    print(f"Test Set {index + 1}\n----------------\n")

    training_set = source_dataset.drop(testing_set.index)

    training_features, training_target = target_split(training_set)
    testing_features, testing_target = target_split(testing_set)

    for method in methods:
        classifier = method()
        print(classifier, "\n")

        classifier.fit( training_features, training_target )
        prediction = classifier.predict( testing_features )

        calculate_metrics(prediction, testing_target)

Test Set 1
----------------

Naive Bayes Model 

Predicted    0    1
Actual             
0          420  129
1          303   69

f1 score: 0.24210526315789474
Accuracy: 0.5309446254071661

Logistic Regression Model 



  return 1/(1 + np.exp(-x))


Predicted    0    1
Actual             
0          170  379
1            6  366

f1 score: 0.6553267681289168
Accuracy: 0.5819761129207384

K Nearest Neighbors Model 

Predicted    0    1
Actual             
0          469   80
1           90  282

f1 score: 0.7683923705722071
Accuracy: 0.8154180238870793

Test Set 2
----------------

Naive Bayes Model 

Predicted    0    1
Actual             
0          440  109
1          296   75

f1 score: 0.2702702702702703
Accuracy: 0.5597826086956522

Logistic Regression Model 



  return 1/(1 + np.exp(-x))


Predicted    0    1
Actual             
0          421  128
1           34  337

f1 score: 0.80622009569378
Accuracy: 0.8239130434782609

K Nearest Neighbors Model 

Predicted    0    1
Actual             
0          459   90
1           96  275

f1 score: 0.7472826086956522
Accuracy: 0.7978260869565217

Test Set 3
----------------

Naive Bayes Model 

Predicted    0    1
Actual             
0          402  144
1          299   75

f1 score: 0.25295109612141653
Accuracy: 0.5184782608695652

Logistic Regression Model 



  return 1/(1 + np.exp(-x))


Predicted    0    1
Actual             
0          242  304
1            7  367

f1 score: 0.7023923444976077
Accuracy: 0.6619565217391304

K Nearest Neighbors Model 

Predicted    0    1
Actual             
0          452   94
1           94  280

f1 score: 0.7486631016042781
Accuracy: 0.7956521739130434

Test Set 4
----------------

Naive Bayes Model 

Predicted    0    1
Actual             
0          424  133
1          289   74

f1 score: 0.2596491228070175
Accuracy: 0.5413043478260869

Logistic Regression Model 



  return 1/(1 + np.exp(-x))


Predicted    0   1
Actual            
0          539  18
1          306  57

f1 score: 0.2602739726027397
Accuracy: 0.6478260869565218

K Nearest Neighbors Model 

Predicted    0    1
Actual             
0          473   84
1          105  258

f1 score: 0.7319148936170212
Accuracy: 0.7945652173913044

Test Set 5
----------------

Naive Bayes Model 

Predicted    0    1
Actual             
0          466  121
1          276   57

f1 score: 0.22309197651663404
Accuracy: 0.5684782608695652

Logistic Regression Model 



  return 1/(1 + np.exp(-x))


Predicted    0   1
Actual            
0          570  17
1          289  44

f1 score: 0.2233502538071066
Accuracy: 0.6673913043478261

K Nearest Neighbors Model 

Predicted    0    1
Actual             
0          495   92
1           87  246

f1 score: 0.7332339791356185
Accuracy: 0.8054347826086956

