In [1]:
import pandas as pd
import altair as alt
import numpy as np
from collections import ChainMap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
import random
# let altair embed data frames of any size in a chart
alt.data_transformers.disable_max_rows()

# fix NumPy's global random state so that random steps are reproducible
SEED = 3127
np.random.seed(SEED)

In [2]:
# --- load the raw data (the .data file is read without a header row) ---
spam_raw = pd.read_csv("spambase.data", header = None)

# --- wrangling ---

# Column headers come from spambase.names: skip its first 31 lines, then keep the text
# in front of the ":" for every remaining row of the column labelled "1".
names_file = pd.read_csv("spambase.names", skiprows = 31)
name_parts = names_file["1"].str.split(":", expand = True)
# row position -> column name
spam_headers = name_parts[0].to_dict()

# Apply the headers, name column 57 "is_spam", and recode its 0/1 values as the
# categories "Normal" / "Spam".
spam_tidy = (
    spam_raw
    .rename(columns = spam_headers)
    .rename(columns = {57: "is_spam"})
    .assign(is_spam = lambda frame: frame["is_spam"]
                                    .replace({0: "Normal", 1: "Spam"})
                                    .astype("category"))
)

# keep only the four predictor variables and the label
kept_columns = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest", "is_spam"]
spam = spam_tidy[kept_columns]

# 75/25 train/test split, stratified on the label
spam_train, spam_test = train_test_split(spam, train_size = 0.75, stratify = spam["is_spam"])

# class balance check: roughly 60% Normal vs 40% Spam, which we treat as acceptable
spam["is_spam"].value_counts(normalize = True)

Normal    0.605955
Spam      0.394045
Name: is_spam, dtype: float64

In [3]:
# preprocessing and making pipeline

predictor_cols = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest"]

# n_neighbors is deliberately not set here so that k can be tuned by the grid search
knn = KNeighborsClassifier()

# standardise the four predictors; every other column is passed through unchanged
preprocessor = make_column_transformer(
    (StandardScaler(), predictor_cols),
    remainder = "passthrough",
)

# predictor variables (X) and labels (y) for each split
X_train, y_train = spam_train[predictor_cols], spam_train["is_spam"]
X_test, y_test = spam_test[predictor_cols], spam_test["is_spam"]

# pipeline (not fitted yet): scale the predictors, then classify with k-nn
pipeline = make_pipeline(preprocessor, knn)


In [4]:
#scaling training data for exploratory data analysis
# NOTE: fit_transform fits `preprocessor` in place on the whole training frame.

# The transformer output has no column names: positions 0-3 are the scaled predictors,
# position 4 is the label column that was passed through.
scaled_column_names = {0: "word_freq_credit_scaled",
                       1: "word_freq_000_scaled",
                       2: "word_freq_free_scaled",
                       3: "capital_run_length_longest_scaled",
                       4: "is_spam"}
scaled_numeric_columns = [name for name in scaled_column_names.values() if name != "is_spam"]

# Because the string label is passed through together with the numbers, the result comes
# back as a single mixed array and every column ends up as generic `object` dtype.
# Cast the four scaled predictors back to float so later comparisons and plots operate
# on real numeric columns.
spam_train_scaled = (
    pd.DataFrame(preprocessor.fit_transform(spam_train))
    .rename(columns = scaled_column_names)
    .astype({name: float for name in scaled_numeric_columns})
)

spam_train_scaled

Unnamed: 0,word_freq_credit_scaled,word_freq_000_scaled,word_freq_free_scaled,capital_run_length_longest_scaled,is_spam
0,-0.165311,-0.28698,-0.334652,0.258344,Normal
1,2.555242,-0.28698,-0.334652,-0.128774,Spam
2,-0.165311,-0.28698,-0.334652,-0.199588,Normal
3,-0.165311,-0.28698,3.07175,0.820136,Spam
4,-0.165311,-0.28698,-0.334652,-0.16182,Normal
...,...,...,...,...,...
3445,-0.165311,-0.28698,0.023917,-0.16182,Normal
3446,-0.165311,-0.28698,0.216992,-0.166541,Normal
3447,-0.165311,-0.28698,-0.334652,-0.020192,Normal
3448,-0.165311,-0.28698,1.430609,-0.190146,Normal


In [23]:
# Exploratory data analysis graph of the scaled training data. Even after scaling, the points
# are crammed together because of the axis ranges. Most of the action happens below
# x or y = 2, so we "zoom in" on that region afterwards to take a closer look.

# We use several predictor variables, so instead of a multidimensional graph we draw a
# repeat chart: one scatter plot per pair of variables.
eda_columns = ["word_freq_credit_scaled", "word_freq_000_scaled", "word_freq_free_scaled", "capital_run_length_longest_scaled"]

eda_points = (
    alt.Chart(spam_train_scaled, title = "Spam data")
    .mark_circle()
    .encode(
        x = alt.X(alt.repeat("column"), type = "quantitative"),
        y = alt.Y(alt.repeat("row"), type = "quantitative"),
        color = alt.Color("is_spam:N", title = "Is spam?"),
    )
)

exploratory_matrix = (
    eda_points
    .repeat(row = eda_columns, column = eda_columns)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

exploratory_matrix

In [6]:
# keep only the rows whose scaled value is below the cutoff for every predictor
zoom_cutoff = 2
zoom_filter_columns = ["word_freq_credit_scaled",
                       "word_freq_000_scaled",
                       "word_freq_free_scaled",
                       "capital_run_length_longest_scaled"]

# True for a row only when all four scaled predictors are < zoom_cutoff
inside_zoom = spam_train_scaled[zoom_filter_columns].lt(zoom_cutoff).all(axis = 1)

# renumber the surviving rows from 0 and discard the old index
spam_train_zoom = spam_train_scaled.loc[inside_zoom].reset_index(drop = True)
spam_train_zoom


Unnamed: 0,word_freq_credit_scaled,word_freq_000_scaled,word_freq_free_scaled,capital_run_length_longest_scaled,is_spam
0,-0.165311,-0.28698,-0.334652,0.258344,Normal
1,-0.165311,-0.28698,-0.334652,-0.199588,Normal
2,-0.165311,-0.28698,-0.334652,-0.16182,Normal
3,-0.165311,-0.28698,-0.334652,-0.242076,Normal
4,-0.165311,-0.28698,-0.334652,-0.175983,Normal
...,...,...,...,...,...
3060,-0.165311,-0.28698,0.023917,-0.16182,Normal
3061,-0.165311,-0.28698,0.216992,-0.166541,Normal
3062,-0.165311,-0.28698,-0.334652,-0.020192,Normal
3063,-0.165311,-0.28698,1.430609,-0.190146,Normal


In [24]:
# the "zoomed in" scatter-plot matrix: only the training points below 2 on every scaled variable
zoom_columns = ["word_freq_credit_scaled", "word_freq_000_scaled", "word_freq_free_scaled", "capital_run_length_longest_scaled"]

# the same encodings are reused in every panel; the repeat fills in the actual column names
zoom_encodings = dict(
    x = alt.X(alt.repeat("column"), type = "quantitative"),
    y = alt.Y(alt.repeat("row"), type = "quantitative"),
    color = alt.Color("is_spam:N", title = "Is spam?"),
)

zoom_panel = alt.Chart(spam_train_zoom, title = "Zoomed-in spam data").mark_circle().encode(**zoom_encodings)
zoom_matrix = zoom_panel.repeat(row = zoom_columns, column = zoom_columns)
zoom_matrix = zoom_matrix.configure_axis(labelFontSize = 15, titleFontSize = 15).configure_title(fontSize = 15)

zoom_matrix

In [8]:
# Begin data analysis

# Candidate values of k: 1 through 100. This covers a good range of k values, while a
# really large k would be really slow.
k_candidates = range(1, 101)

# the key addresses the n_neighbors parameter of the k-nn step inside the pipeline
param_grid = {"kneighborsclassifier__n_neighbors": k_candidates}


In [9]:
# grid search over k using the standard 5-fold cross-validation
spam_grid = GridSearchCV(pipeline, param_grid, cv = 5)

# fit the grid search on the training data
grid_fit = spam_grid.fit(X_train, y_train)

# the pipeline using the best value of k
best_model = grid_fit.best_estimator_

In [26]:
# visualize the cross-validated accuracy for every value of k that was tried
accuracies_grid = pd.DataFrame(grid_fit.cv_results_)

k_vals = (
    alt.Chart(accuracies_grid, title = "Accuracy of different K values")
    .mark_line(point = True)
    .encode(
        # both encodings are declared quantitative explicitly instead of relying on
        # altair inferring the type from the data frame's column dtypes
        x = alt.X("param_kneighborsclassifier__n_neighbors:Q",
                  title = "K values",
                  scale = alt.Scale(zero = False)),
        # mean_test_score is a fraction between 0 and 1; the "%" format renders the tick
        # labels as percentages so that they agree with the axis title
        y = alt.Y("mean_test_score:Q",
                  title = "Estimated accuracy percentage",
                  axis = alt.Axis(format = "%"),
                  scale = alt.Scale(zero = False)),
    )
    .properties(width = 900)
    .configure_axis(titleFontSize = 20, labelFontSize = 20)
    .configure_title(fontSize = 20)
)
k_vals

In [11]:
# Best k found by the grid search and the score that goes with it.
# NOTE: despite its name, `train_error` holds best_score_, i.e. the mean cross-validated
# score of the best k (0.8386 in the recorded run), not an error rate. Likewise
# `best_estimator` holds the best parameter dict, not an estimator object.
train_error, best_estimator = grid_fit.best_score_, grid_fit.best_params_
train_error

0.8385507246376811

In [12]:
# show the parameter setting selected by the grid search (grid_fit.best_params_)
best_estimator

{'kneighborsclassifier__n_neighbors': 6}

In [13]:
# predicted labels for the held-out test rows
predictions = best_model.predict(X_test)

# copy of the test data with the predictions attached as a new "predicted" column
test_predictions = spam_test.copy()
test_predictions["predicted"] = predictions
test_predictions

Unnamed: 0,word_freq_credit,word_freq_000,word_freq_free,capital_run_length_longest,is_spam,predicted
1642,1.07,0.00,0.85,36,Spam,Spam
2317,0.00,0.00,0.00,4,Normal,Normal
3620,0.00,0.00,0.00,1,Normal,Normal
3527,0.00,0.09,0.00,18,Normal,Normal
3347,0.00,0.00,0.00,7,Normal,Normal
...,...,...,...,...,...,...
145,0.00,2.53,0.00,27,Spam,Spam
138,0.00,0.00,2.98,2,Spam,Spam
3753,0.00,0.00,0.00,45,Normal,Normal
352,0.00,0.00,1.28,41,Spam,Spam


In [14]:
# Score of the tuned model on the held-out test data.
# NOTE: despite its name, `test_error` holds the value returned by score() (0.825 in the
# recorded run), which is the accuracy rather than an error rate.
test_error = best_model.score(X = X_test, y = y_test)
test_error

0.8253692441355344

In [27]:
# scatter-plot matrix of the (unscaled) test data, coloured by the true label
test_columns = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest"]
test_axis_fonts = dict(labelFontSize = 15, titleFontSize = 15)

test_panel = alt.Chart(test_predictions, title = "Testing data").mark_circle()
test_panel = test_panel.encode(
    x = alt.X(alt.repeat("column"), type = "quantitative"),
    y = alt.Y(alt.repeat("row"), type = "quantitative"),
    color = alt.Color("is_spam:N", title = "Is spam?"),
)

test_plot = (
    test_panel
    .repeat(row = test_columns, column = test_columns)
    .configure_axis(**test_axis_fonts)
    .configure_title(fontSize = 15)
)
test_plot

# Unfortunately, because our test data is only 25% of the original data, filtering it to values
# less than 2 produced too few data points for a meaningful graph.

In [16]:
# Visualizing the "barrier" between the regions the classifier labels as spam and as non-spam.

# For that we build an artificial data frame whose values span each variable's range,
# from its smallest to its largest observed value.

def _full_range_steps(column_name):
    """Return 25 evenly spaced values between the column's min and max in `spam`."""
    column = spam[column_name]
    return np.linspace(column.min(), column.max(), 25)

credit_grid = _full_range_steps("word_freq_credit")
zeros_grid = _full_range_steps("word_freq_000")
free_grid = _full_range_steps("word_freq_free")
capital_grid = _full_range_steps("capital_run_length_longest")

# "Mesh" the value lists into coordinates. With 4 variables we cannot build a 4-dimensional
# grid, so the variables are meshed in two pairs (25 x 25 = 625 points each) and the two
# meshes are placed side by side. Row i therefore pairs point i of the first mesh with
# point i of the second one; the frame is not a full grid over all four variables.
credit_mesh, zeros_mesh = np.meshgrid(credit_grid, zeros_grid)
grid_pairs1 = pd.DataFrame({"word_freq_credit": credit_mesh.ravel(),
                            "word_freq_000": zeros_mesh.ravel()})

free_mesh, capital_mesh = np.meshgrid(free_grid, capital_grid)
grid_pairs2 = pd.DataFrame({"word_freq_free": free_mesh.ravel(),
                            "capital_run_length_longest": capital_mesh.ravel()})

bkg_features = pd.concat([grid_pairs1, grid_pairs2], axis = 1)

# label every artificial point with the tuned model's prediction
bkg_grid = bkg_features.assign(predicted = best_model.predict(bkg_features))

In [28]:
# Predictions on the artificial grid. Drawing very large (size 500), mostly transparent
# circles effectively "shades the background" according to where the model predicts
# spam or not spam.
barrier_columns = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest"]

barrier_marks = alt.Chart(bkg_grid, title = "Areas of different classification").mark_circle(size = 500, opacity = 0.1)

barrier_panel = barrier_marks.encode(
    x = alt.X(alt.repeat("column"), type = "quantitative"),
    y = alt.Y(alt.repeat("row"), type = "quantitative"),
    color = alt.Color("predicted", title = "Is spam?"),
)

knn_predictions = (
    barrier_panel
    .repeat(row = barrier_columns, column = barrier_columns)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)
knn_predictions

# Again, because the data is so clustered in the corner, we can't see much of what is going on.
# Since we generate the data for the "barrier" plot ourselves, we can generate it for a
# zoomed-in set of values as well (again less than 2).

In [18]:
# Value lists for the zoomed-in "barrier" plot.
#
# The zoom region was defined on *standardized* values (scaled value < 2), but `best_model`
# is a pipeline that does its own scaling, so the grid handed to it must be in raw units.
# Using the literal 2 as the raw upper bound would squeeze capital_run_length_longest
# (whose raw values are run lengths such as 4, 18 or 36) into the sliver [min, 2].
# Instead, translate the standardized cutoff back to raw units per variable:
#     raw limit = training mean + ZOOM_Z_CUTOFF * training standard deviation
# ddof = 0 gives the population standard deviation, matching what StandardScaler uses.
ZOOM_Z_CUTOFF = 2
zoom_predictors = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest"]
zoom_upper = {
    name: spam_train[name].mean() + ZOOM_Z_CUTOFF * spam_train[name].std(ddof = 0)
    for name in zoom_predictors
}

# 25 evenly spaced raw values from each variable's minimum up to its zoom limit
credit_grid_zoom = np.linspace(spam["word_freq_credit"].min(), zoom_upper["word_freq_credit"], 25)
zeros_grid_zoom = np.linspace(spam["word_freq_000"].min(), zoom_upper["word_freq_000"], 25)
free_grid_zoom = np.linspace(spam["word_freq_free"].min(), zoom_upper["word_freq_free"], 25)
capital_grid_zoom = np.linspace(spam["capital_run_length_longest"].min(), zoom_upper["capital_run_length_longest"], 25)

In [19]:
# meshing the zoomed value lists into coordinates, one pair of variables at a time
credit_zoom_mesh, zeros_zoom_mesh = np.meshgrid(credit_grid_zoom, zeros_grid_zoom)
first_pair_cols = pd.DataFrame(
    np.column_stack([credit_zoom_mesh.ravel(), zeros_zoom_mesh.ravel()]),
    columns = ["word_freq_credit", "word_freq_000"],
)

free_zoom_mesh, capital_zoom_mesh = np.meshgrid(free_grid_zoom, capital_grid_zoom)
second_pair_cols = pd.DataFrame(
    np.column_stack([free_zoom_mesh.ravel(), capital_zoom_mesh.ravel()]),
    columns = ["word_freq_free", "capital_run_length_longest"],
)

# put the two pairs side by side in a single data frame
zoom_features = pd.concat([first_pair_cols, second_pair_cols], axis = 1)

# attach the model's prediction for every generated point
whole = zoom_features.assign(predicted = best_model.predict(zoom_features))


In [29]:
# "barrier" plot for the zoomed-in grid: large, mostly transparent circles shade the
# regions in which the model predicts each class
zoomed_barrier_columns = ["word_freq_credit", "word_freq_000", "word_freq_free", "capital_run_length_longest"]

# NOTE: this rebinds `predictions`, which earlier held the array of test-set predictions.
predictions = (
    alt.Chart(whole, title = "Areas of different classification (zoomed)")
    .mark_circle(size = 500, opacity = 0.1)
    .encode(
        x = alt.X(alt.repeat("column"), type = "quantitative"),
        y = alt.Y(alt.repeat("row"), type = "quantitative"),
        # the predicted label is a category, so declare it nominal explicitly
        color = alt.Color("predicted:N", title = "Is spam?"),
    )
    .repeat(row = zoomed_barrier_columns, column = zoomed_barrier_columns)
    .configure_axis(labelFontSize = 15, titleFontSize = 15)
    .configure_title(fontSize = 15)
)

predictions

# Unfortunately, because we used repeat plots, we could not layer them and thus cannot show
# the original data points overlaid on the barrier plot.