In [1]:
import pandas as pd
import altair as alt
import numpy as np
from collections import ChainMap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
import random
# Lift Altair's default row limit so the full data set can be passed to charts.
alt.data_transformers.disable_max_rows()

# Seed NumPy's global random state so the random steps in this notebook are reproducible.
SEED = 3127
np.random.seed(SEED)

In [2]:
# Column-name lists shared by the rest of the notebook, so each name is typed only once.
predictor_variables = [
    "word_freq_credit",
    "word_freq_000",
    "word_freq_free",
    "capital_run_length_longest",
]
# names used for the predictors after standardisation
predictor_variables_scaled = [f"{name}_scaled" for name in predictor_variables]
# predictors plus the true class label
predictor_variables_spam = [*predictor_variables, "is_spam"]
# predictors plus the label predicted by the model
predictor_variables_predicted = [*predictor_variables, "predicted"]

In [3]:
# Read the raw observations; the data file has no header row, so columns are numbered 0..57.
spam_raw = pd.read_csv("spambase.data", header = None)

# Wrangling

# The column names come from a separate names file. Each entry is split on ":" and the
# part before the colon is kept as the column name.
spam_titles = pd.read_csv("spambase.names", skiprows = 31)
spam_titles_split = spam_titles["1"].str.split(":", expand = True)
# Map positional column number -> column name.
spam_headers = spam_titles_split[0].to_dict()
# Column 57 is the class label.
spam_tidy = spam_raw.rename(columns = spam_headers).rename(columns = {57: "is_spam"})

# Recode the 0/1 labels of is_spam as "Non_Spam" / "Spam".
spam_tidy["is_spam"] = spam_tidy["is_spam"].replace({0: "Non_Spam", 1: "Spam"}).astype("category")

# Keep only the chosen predictor variables and the label.
spam = spam_tidy[predictor_variables_spam]

# 75/25 train/test split, stratified so both parts keep the same class proportions.
spam_train, spam_test = train_test_split(spam, train_size = 0.75, stratify = spam["is_spam"])

# Check the class balance. Only the last expression of a cell is displayed automatically,
# so the proportions are printed explicitly.
print(spam["is_spam"].value_counts(normalize = True))
spam_tidy.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,Spam
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,Spam
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,Spam
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,Spam
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,Spam


In [4]:
# Preprocessing, classifier and pipeline set-up. Nothing is fitted in this cell.

# Features and labels for the training and the test portion.
X_train, y_train = spam_train.loc[:, predictor_variables], spam_train["is_spam"]
X_test, y_test = spam_test.loc[:, predictor_variables], spam_test["is_spam"]

# Standardise the predictor columns; any other column is passed through unchanged.
preprocessor = make_column_transformer(
    (StandardScaler(), predictor_variables),
    remainder = "passthrough",
)

# n_neighbors is left at its default here so that K can be tuned afterwards.
knn = KNeighborsClassifier()

# Unfitted pipeline: scaling followed by the K-nearest-neighbours classifier.
pipeline = make_pipeline(preprocessor, knn)


In [5]:
# Scale the training data for exploratory data analysis.
#
# fit_transform returns a plain array: the scaled predictors come first (in the order of
# predictor_variables), followed by the passed-through is_spam column. Because the array
# mixes floats with string labels, every column would be object-typed, so the predictor
# columns are cast back to float.
spam_train_scaled = (
    pd.DataFrame(
        preprocessor.fit_transform(spam_train),
        columns = predictor_variables_scaled + ["is_spam"],
    )
    .astype(dict.fromkeys(predictor_variables_scaled, float))
)

spam_train_scaled.head()

Unnamed: 0,word_freq_credit_scaled,word_freq_000_scaled,word_freq_free_scaled,capital_run_length_longest_scaled,is_spam
0,-0.165311,-0.28698,-0.334652,0.258344,Non_Spam
1,2.555242,-0.28698,-0.334652,-0.128774,Spam
2,-0.165311,-0.28698,-0.334652,-0.199588,Non_Spam
3,-0.165311,-0.28698,3.07175,0.820136,Spam
4,-0.165311,-0.28698,-0.334652,-0.16182,Non_Spam


In [6]:
# Exploratory scatter-plot matrix of the scaled training data.
# Even after scaling, the points are crammed together because of the axis ranges; most of
# them lie below 2 on both axes, so a "zoomed-in" version of that region is worth drawing too.
#
# There are several predictor variables, so instead of a multidimensional plot every pair
# of predictors gets its own panel through a repeat chart.
exploratory_panel = (
    alt.Chart(spam_train_scaled, title = "Spam data")
    .mark_circle()
    .encode(
        alt.X(alt.repeat("column"), type = "quantitative"),
        alt.Y(alt.repeat("row"), type = "quantitative"),
        alt.Color("is_spam:N", title = "Is spam?"),
    )
    .properties(width = 200, height = 200)
)

exploratory_matrix = (
    exploratory_panel
    .repeat(row = predictor_variables_scaled, column = predictor_variables_scaled)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

exploratory_matrix

  for col_name, dtype in df.dtypes.iteritems():


In [7]:
# Keep only the observations whose scaled value is below 2 for every predictor,
# then renumber the rows from 0.
all_below_two = (spam_train_scaled[predictor_variables_scaled] < 2).all(axis = 1)
spam_train_zoom = spam_train_scaled[all_below_two].reset_index(drop = True)

spam_train_zoom.head(5)

Unnamed: 0,word_freq_credit_scaled,word_freq_000_scaled,word_freq_free_scaled,capital_run_length_longest_scaled,is_spam
0,-0.165311,-0.28698,-0.334652,0.258344,Non_Spam
1,-0.165311,-0.28698,-0.334652,-0.199588,Non_Spam
2,-0.165311,-0.28698,-0.334652,-0.16182,Non_Spam
3,-0.165311,-0.28698,-0.334652,-0.242076,Non_Spam
4,-0.165311,-0.28698,-0.334652,-0.175983,Non_Spam


In [8]:
# Scatter-plot matrix of the "zoomed-in" training data (all scaled values below 2).
zoom_panel = (
    alt.Chart(spam_train_zoom, title = "Zoomed-in spam data")
    .mark_circle()
    .encode(
        alt.X(alt.repeat("column"), type = "quantitative"),
        alt.Y(alt.repeat("row"), type = "quantitative"),
        alt.Color("is_spam:N", title = "Is spam?"),
    )
    .properties(width = 200, height = 200)
)

zoom_matrix = (
    zoom_panel
    .repeat(row = predictor_variables_scaled, column = predictor_variables_scaled)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

zoom_matrix

  for col_name, dtype in df.dtypes.iteritems():


In [9]:
# Begin data analysis

# Candidate values of K: 1 to 100. This covers a good range, and much larger values
# of K would make the search really slow.
k_candidates = range(1, 101)
param_grid = {"kneighborsclassifier__n_neighbors": k_candidates}

In [10]:
# Grid search over K with standard 5-fold cross-validation.
spam_grid = GridSearchCV(pipeline, param_grid, cv = 5)

# Fit the search on the training data.
grid_fit = spam_grid.fit(X_train, y_train)

# Pipeline refitted with the best-scoring value of K.
best_model = spam_grid.best_estimator_

In [11]:
# Line plot of the mean cross-validation score for every K that was tried.
accuracies_grid = pd.DataFrame(grid_fit.cv_results_)

# Neither axis is forced to start at zero, so the differences between K values stay visible.
k_axis = alt.X(
    "param_kneighborsclassifier__n_neighbors",
    title = "K values",
    scale = alt.Scale(zero = False),
)
accuracy_axis = alt.Y(
    "mean_test_score",
    title = "Estimated accuracy percentage",
    scale = alt.Scale(zero = False),
)

k_vals = (
    alt.Chart(accuracies_grid, title = "Accuracy of different K values")
    .mark_line(point = True)
    .encode(x = k_axis, y = accuracy_axis)
    .properties(width = 1000)
    .configure_axis(titleFontSize = 20, labelFontSize = 20)
    .configure_title(fontSize = 20)
)

k_vals

  for col_name, dtype in df.dtypes.iteritems():


In [12]:
# Best cross-validation score of the grid search and the parameters (K) that achieved it.
# NOTE: despite its name, train_error holds an accuracy score, not an error rate.
train_error = grid_fit.best_score_
best_estimator = grid_fit.best_params_

print(train_error, best_estimator, sep = "\n\n")

0.8385507246376811

{'kneighborsclassifier__n_neighbors': 6}


In [13]:
# Predict the class of every test observation with the tuned model.
predictions = best_model.predict(X_test)

# Copy of the test data with the predicted labels attached as a new column.
test_predictions = spam_test.copy()
test_predictions["predicted"] = predictions

test_predictions.head(10)

Unnamed: 0,word_freq_credit,word_freq_000,word_freq_free,capital_run_length_longest,is_spam,predicted
1642,1.07,0.0,0.85,36,Spam,Spam
2317,0.0,0.0,0.0,4,Non_Spam,Non_Spam
3620,0.0,0.0,0.0,1,Non_Spam,Non_Spam
3527,0.0,0.09,0.0,18,Non_Spam,Non_Spam
3347,0.0,0.0,0.0,7,Non_Spam,Non_Spam
1831,0.0,0.0,0.0,5,Non_Spam,Non_Spam
4441,0.0,0.0,0.49,55,Non_Spam,Spam
3381,0.0,0.0,0.0,85,Non_Spam,Non_Spam
4334,0.0,0.0,0.0,34,Non_Spam,Non_Spam
314,0.0,0.11,0.46,48,Spam,Spam


In [14]:
# Score of the tuned model on the held-out test data.
# NOTE: despite its name, test_error holds the accuracy returned by score(), not an error rate.
test_error = best_model.score(X = X_test, y = y_test)
test_error

0.8253692441355344

In [15]:
# Confusion matrix: true labels in the rows, predicted labels in the columns,
# to see in which cases the model predicted wrong.
confusion_matrix = pd.crosstab(
    index = test_predictions["is_spam"],
    columns = test_predictions["predicted"],
)
confusion_matrix

predicted,Non_Spam,Spam
is_spam,Unnamed: 1_level_1,Unnamed: 2_level_1
Non_Spam,633,64
Spam,137,317


In [16]:
# Scatter-plot matrix of the test data, coloured by the label the model predicted.
test_panel = (
    alt.Chart(test_predictions, title = "Testing Data with Predicted Labels")
    .mark_circle()
    .encode(
        alt.X(alt.repeat("column"), type = "quantitative"),
        alt.Y(alt.repeat("row"), type = "quantitative"),
        alt.Color("predicted:N", title = "Is spam?"),
    )
    .properties(width = 200, height = 200)
)

test_plot = (
    test_panel
    .repeat(row = predictor_variables, column = predictor_variables)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

test_plot

# The test data is only 25% of the original data, so filtering it to values below 2
# unfortunately left too few data points for a meaningful graph.

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
# Visualising the "barrier" between the regions the classifier labels spam and non-spam.
#
# This needs an artificial data frame whose values cover the range from the smallest to
# the largest observed value of every predictor.

# 25 evenly spaced values between each variable's minimum and maximum
# (same order as predictor_variables: credit, 000, free, capital run length).
credit_grid, zeros_grid, free_grid, capital_grid = (
    np.linspace(spam[name].min(), spam[name].max(), 25) for name in predictor_variables
)

# "Mesh" the axis values into coordinates. With 4 variables a 4-dimensional grid is not
# practical, so the variables are meshed in two pairs and the two results are placed side
# by side in a single data frame.
# NOTE(review): the two meshes are joined row by row, so word_freq_credit moves together
# with word_freq_free and word_freq_000 together with capital_run_length_longest; only the
# credit/000 and free/capital pairs form a full 2-D grid.
credit_mesh, zeros_mesh = np.meshgrid(credit_grid, zeros_grid)
free_mesh, capital_mesh = np.meshgrid(free_grid, capital_grid)

grid_pairs1 = pd.DataFrame({
    "word_freq_credit": credit_mesh.ravel(),
    "word_freq_000": zeros_mesh.ravel(),
})
grid_pairs2 = pd.DataFrame({
    "word_freq_free": free_mesh.ravel(),
    "capital_run_length_longest": capital_mesh.ravel(),
})
bkg_features = pd.concat([grid_pairs1, grid_pairs2], axis = 1)

# Label every artificial point with the tuned model.
bkg_grid = bkg_features.assign(predicted = best_model.predict(bkg_features))

bkg_grid.head(10)

Unnamed: 0,word_freq_credit,word_freq_000,word_freq_free,capital_run_length_longest,predicted
0,0.0,0.0,0.0,1.0,Non_Spam
1,0.7575,0.0,0.833333,1.0,Spam
2,1.515,0.0,1.666667,1.0,Spam
3,2.2725,0.0,2.5,1.0,Spam
4,3.03,0.0,3.333333,1.0,Spam
5,3.7875,0.0,4.166667,1.0,Spam
6,4.545,0.0,5.0,1.0,Spam
7,5.3025,0.0,5.833333,1.0,Spam
8,6.06,0.0,6.666667,1.0,Spam
9,6.8175,0.0,7.5,1.0,Spam


In [18]:
# Predictions on the artificial grid. Drawing every grid point as a fully opaque circle
# (size 50) roughly "shades the background" where the model predicts spam or non-spam.
background_panel = (
    alt.Chart(bkg_grid, title = "Areas of different classification")
    .mark_circle(size = 50, opacity = 1.0)
    .encode(
        alt.X(alt.repeat("column"), type = "quantitative"),
        alt.Y(alt.repeat("row"), type = "quantitative"),
        alt.Color("predicted", title = "Is spam?"),
    )
    .properties(width = 200, height = 200)
)

knn_predictions = (
    background_panel
    .repeat(row = predictor_variables, column = predictor_variables)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

knn_predictions

# Again the data is so clustered in the corner that little can be seen. Since the data for
# the "barrier" plot is generated, it can be generated for a zoomed-in range of values
# (again less than 2) instead.

  for col_name, dtype in df.dtypes.iteritems():


In [19]:
# Axis values for the zoomed-in "barrier" plot.
#
# The zoom limit of 2 used for the exploratory plots is expressed in *scaled* units, but
# these grids are fed to best_model, whose pipeline standardises its input itself, so they
# must be in raw units. Using 2 directly would, for example, restrict
# capital_run_length_longest to a sliver just above its minimum. The limit is therefore
# converted back to raw units: raw = mean + 2 * std of the training data.
ZOOM_LIMIT_SCALED = 2
train_predictors = spam_train[predictor_variables]
# StandardScaler divides by the population standard deviation, hence ddof = 0.
zoom_upper = train_predictors.mean() + ZOOM_LIMIT_SCALED * train_predictors.std(ddof = 0)

credit_grid_zoom = np.linspace(spam["word_freq_credit"].min(), zoom_upper["word_freq_credit"], 25)
zeros_grid_zoom = np.linspace(spam["word_freq_000"].min(), zoom_upper["word_freq_000"], 25)
free_grid_zoom = np.linspace(spam["word_freq_free"].min(), zoom_upper["word_freq_free"], 25)
capital_grid_zoom = np.linspace(spam["capital_run_length_longest"].min(), zoom_upper["capital_run_length_longest"], 25)

In [20]:
# Mesh the zoomed axis values into coordinates, one pair of variables at a time.
credit_mesh_zoom, zeros_mesh_zoom = np.meshgrid(credit_grid_zoom, zeros_grid_zoom)
free_mesh_zoom, capital_mesh_zoom = np.meshgrid(free_grid_zoom, capital_grid_zoom)

first_pair_cols = pd.DataFrame({
    "word_freq_credit": credit_mesh_zoom.ravel(),
    "word_freq_000": zeros_mesh_zoom.ravel(),
})
second_pair_cols = pd.DataFrame({
    "word_freq_free": free_mesh_zoom.ravel(),
    "capital_run_length_longest": capital_mesh_zoom.ravel(),
})

# Place the two pairs side by side in a single data frame.
zoom_features = pd.concat([first_pair_cols, second_pair_cols], axis = 1)

# Label every point of the zoomed grid with the tuned model.
whole = zoom_features.assign(predicted = best_model.predict(zoom_features))

In [21]:
# "Barrier" plot for the zoomed-in grid.
zoomed_background_panel = (
    alt.Chart(whole, title = "Areas of different classification (zoomed)")
    .mark_circle(size = 50, opacity = 1)
    .encode(
        alt.X(alt.repeat("column"), type = "quantitative"),
        alt.Y(alt.repeat("row"), type = "quantitative"),
        alt.Color("predicted", title = "Is spam?"),
    )
    .properties(width = 200, height = 200)
)

predictions = (
    zoomed_background_panel
    .repeat(row = predictor_variables, column = predictor_variables)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

predictions

# Unfortunately, repeat plots cannot be layered, so the original data points cannot be
# shown on top of the barrier plot.

  for col_name, dtype in df.dtypes.iteritems():


# Methods and Results

We will use classification with a model that we build and test. We will then use the results to assess how well certain variables of an e-mail message can be used to correctly classify an email as spam and non-spam. We chose four predictor variables we thought were indicative of whether an email may be spam. We chose them because the mean values (mean of all the observed e-mails in the data set) of these variables showed large differences for spam and non-spam. Therefore, we believe high values for these variables are characteristic of spam mail and allow us to build an accurate prediction model using the sklearn package and the $K$-nearest neighbours algorithm. We will also verify the accuracy of our model by putting aside a testing set. To visualize our results, we will use line plots showing the accuracy of our model for varying values of $K$. We will also use multiple scatter plots showing trends between spam and non-spam mail and our predictor variables, where spam and non-spam would be identified by colour and shape. One plot will show this trend using our training set and correct labels, and the other will use our testing set and predicted labels.

Our dataset was lacking headers, so our first order of business was to read in the data, then add the headers in from a separate names file. We then replaced the 0’s and 1’s in our dataset with “Non_Spam” and “Spam”, before filtering for our predictor variables and the class label. In our case, our predictor variables were the frequencies of the strings “credit”, “000”, and “free”, as well as the longest capital run length; the class label records whether or not the email was spam.

After cleaning the data, we split it into a 75/25 train test split, stratified by whether or not the email was spam. We then verified that there was not a large class imbalance; in our case it was about 10% which was acceptable. We then use StandardScaler to standardize our predictor variables, allowing the remainder to pass through. We then create our X and y train and test variables, before then making a pipeline with knn and our preprocessor. Then, we scale our training data, and visualize the predictor variables using a scatterplot matrix. Because most of the points seemed to cluster under x and y = 2, we filtered our data to include only those with values under 2. We then visualized the predictor variables again using a scatterplot matrix, this time the data seeming more meaningful due to the increased detail we were able to observe from zooming in. The result we found from that is that normal emails tended to stay close to the bottom left corner of our graphs, meaning that with an increase of our predictor variables the frequency of spam emails increased accordingly.

Now, our next step was to figure out the best value of $K$. We decided to check the first 100 values of $K$; this number seemed appropriate because we wanted to be as thorough as possible while still having a reasonable run time. Using standard 5-fold cross-validation, it was determined that $K=6$ was optimal. The cross-validated accuracy estimated on the training set was 84% and the accuracy on the test set was 83%. Using a confusion matrix, we were able to see where the model correctly and incorrectly predicted whether or not an email was spam. After that, we plotted the test data with its predicted labels, this time without zooming in, because filtering it to <2 produced too few data points. The same result as with the training data was found, where normal emails tended to have lower values.

We decided to go beyond the scope of the project and do a deeper dive into our model, so we created a fake dataframe and predicted on the fake data using our model. The result we found aligns with both the training and testing data.


# Discussion 

## Results

After cleaning and wrangling our data, we performed various forms of analysis to determine how well our chosen predictor variables can predict spam mail. We examined relationships between the predictor variables, built and tested a classification model, and then evaluated its output.

### Relationships

Using scatterplot matrices, we visualized the relationships between each of our predictor variables. From these plots, we could easily see a common trend: the data points consistently formed an L-shape, where the data points would be numerous around the origin and gradually decreased along each axis, with very few points in the areas further away from the axes. This showed us that the characteristics tend to be mutually exclusive in any given e-mail. For example, one e-mail could have a high frequency of the word “credit” and another e-mail could have a high frequency of “000”, but an e-mail would rarely possess both of these attributes. The only relationship with a relatively higher amount of spread away from the axes is the frequency of “free” and “000”, which makes sense since this could be characteristic of spam e-mails advertising free products or savings of high monetary value. 

Additionally, by colouring the data points by their class, we could see that the points classified as non-spam are consistently located near the origin. This confirms our expectation that non-spam e-mails rarely possess high frequencies of suspicious words or capital letters, which are the characteristics of interest. This phenomenon forms the basis of our investigation, as we can apply it to build a predictive model.

### Model

Using our data, we proceeded to build a classification model that utilizes the $K$-nearest neighbours algorithm and the `scikit-learn` package. We started by scaling our data with a preprocessor, creating a pipeline with the preprocessor and a KNN classifier. We did not specify the number of $K$-neighbours, as we want to examine the model for varying $K\in \{1,\ldots,100\}$. We carried out this step using a Grid Search with cross-validation to estimate the accuracy of the model for each $K$.

By varying $K$, we expect the model’s performance to fluctuate. From the lecture readings, we saw many line plots of $K$ vs. Accuracy. In these plots, the common trend was that the accuracy was significantly lower for low values of $K$, rose sharply for increasing $K$, then decreased for very large $K$. The rate of decrease depended on the size of the data set. 

In our case, the line plot showed a similar trend, however there was only a small decrease in accuracy for larger $K$ since the data set is quite large (~3,500 observations in the training set), so the models with higher $K$ were not severely underfit. From the plot, we can see the accuracy was high for values between 4 and 10, with the best estimator for $K=6$ and an estimated accuracy of $83.85\%$.

This result fit our expectations, as the model would be more overfit for very small $K$ values, and would decrease in accuracy, although only slightly, for larger $K$, as it would include data points “far” away from the data point to predict. 

### Evaluation of Results

Using our best model, we predicted the class of the data in our test set and assigned the predicted class to a new column in the test set dataframe. We obtained the accuracy of our model with the built-in score method, which produced a score of $82.53\%$. The model’s accuracy is very close to its cross-validated accuracy on the training set. However, it is slightly lower, which is expected since the model was built only with the training set, and has not seen the test set before. 

We believe that this score is satisfactory but could be improved, as it would only correctly classify roughly 4 out of 5 e-mails. An error for every 1 out of 5 predictions would not be optimal, since the severity of an incorrect classification is not low. By that we mean, if for example our model served as a filter for someone’s inbox and incorrectly classified an e-mail, the consequences may be significant. A spam e-mail that lands in someone’s inbox may lead to that person falling for a scam and having their data compromised. Or, a crucial e-mail may be lost in the spam folder. Personally, one of our group members has had an interview invitation sit in their spam folder for a while before finding it. 

Moreover, using a confusion matrix, we are able to see the number of false negatives and false positives (positive being a classification of "spam"). The matrix shows us that the model predicted a small number of false positives relative to true negatives ($\frac{64}{633}\approx 10\%$), while there was a much larger number of false negatives relative to true positives ($\frac{137}{317}\approx 43\%$). This may be because it is "easier" to classify non-spam since almost all non-spam data points are located near the origin. On the other hand, many spam data points that are located just a bit further away from the origin, but still near the non-spam data points, may be incorrectly classified as non-spam.

Similar to our exploratory analysis, we again made a scatterplot matrix, this time with our test set and the predicted labels of our classes. Not surprisingly, these plots exhibited similar trends and shapes, with the non-spam e-mails concentrated near the origin. One unexpected error to note is where the model classified an e-mail with a very high frequency of the word “free” as non-spam. This could be due to the model not being trained for this test set. 

Furthermore, we visualized which values of the predictor variables lead to a “spam” or “non-spam” classification from our model. This resulted in a “barrier graph”; that is, a graph showing the barrier between the different classes. Each graph showed a similar trend - low (scaled) values of predictor variables ($<0.5$) very often resulted in a non-spam classification. This was as expected, since non-spam e-mails rarely possess these characteristics (suspicious words such as “free”, “credit”, etc. and capital letters).

### Reflection

Given these findings, we must ask ourselves the following : How could our model be improved, and what impact could our results have for real applications ? 

There are a few things we could change on a lower level that might improve our model. For instance, our choices for the training / testing split ratio may have been suboptimal. A larger training set may optimize the model, but a smaller test set would also make our obtained accuracy less telling and less reliable. We could also have applied a higher degree of cross validation. This would improve the model, but one must weigh this benefit against the increased computational time associated with it. 

On a grander scale, it is possible that alternative predictor variables, and a higher amount of them could alter our results. In our introductory report, we included a table of the predictor variables ordered by the relative magnitude in difference of a given predictor variable between e-mails labelled as spam and non-spam. We chose a mix of predictor variables that were related to each other - “credit”, “free”, “000” and capital letters are related to monetary transactions or offers. However, these predictor variables were not necessarily at the top in the aforementioned table. If we had simply chosen the top four predictor variables instead, it is possible that our model may have performed better. 

As discussed in Kaddoura et al. (2022) [1], there are in fact researchers who have achieved much higher rates of accuracy, for the most part due to the more complex algorithms used, such as rule-based classification with weights and machine learning. Mujtaba et al. [2] also showed techniques like content-based learning and statistical learning.

While our model is certainly not as accurate as the ones mentioned in [1], spam classification models do have many important applications, as shown in Mujtaba et al. [2]. The most prevalent one is spam / phishing, but other uses include multi-folder categorization, language classification and private / official classification. As e-mails are such a common part of our lives, it is important to have classification models in place to identify and protect users from emails with nefarious intent and keep inboxes free of the clutter caused by spam.  

Looking forward, we ask ourselves how email classifiers may generally be improved in the future. A few ways researchers aim to improve email classification are presented in [2]. One that we found interesting in particular is real-time learning. The digital world is constantly changing, and with it the characteristics of emails are evolving. Therefore, models trained with outdated datasets may not perform optimally when applied to emails of the present. Other methods for improvement include hierarchical classification, deep learning and overcoming language and dataset barriers.

## References 

[1] Kaddoura, S., Chandrasekaran, G., Elena Popescu, D., & Duraisamy, J. H. (2022). A systematic literature review on spam content detection and classification. PeerJ Computer Science, 8, e830. https://doi.org/10.7717/peerj-cs.830

[2] Mujtaba, G., Shuib, L., Raj, R. G., Majeed, N., & Al-Garadi, M. A. (2017). Email Classification Research Trends: Review and Open Issues. IEEE Access, 5, 9044–9064. https://doi.org/10.1109/access.2017.2702187