In [309]:
import pandas as pd
import numpy  as np

import sklearn
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns

import imblearn
from imblearn.over_sampling import RandomOverSampler

In [310]:
df = pd.read_csv('../data/processed/spambase.data', sep=',', decimal = '.', header=0)
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Train, validation, test datasets

In [311]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6 * len(df)), int(0.8 * len(df))])

df_shuffeled = df.sample(frac=1).reset_index(drop=True)

train_end = int(0.6 * len(df_shuffeled))
valid_end = int(0.8 * len(df_shuffeled))

train = df_shuffeled.iloc[:train_end]
valid = df_shuffeled.iloc[train_end:valid_end]
test  = df_shuffeled.iloc[valid_end:]

  return bound(*args, **kwds)


In [312]:
print(len(df[df['spam'] == 1]))
print(len(df[df['spam'] == 0]))

1813
2788


In [313]:
from pandas import DataFrame

def scale_dataset(dataframe: DataFrame, oversample=False):
	X = dataframe[dataframe.columns[:-1]].values
	y = dataframe[dataframe.columns[-1]].values

	scaler = StandardScaler()
	X = scaler.fit_transform(X)

	if oversample:
		ros = RandomOverSampler()
		X, y = ros.fit_resample(X, y)

	data = np.hstack((X, np.reshape(y, (-1, 1))))

	return data, X, y

In [314]:
train, X_train, y_train = scale_dataset(train, True)
valid, X_valid, y_valid = scale_dataset(valid, False)
test, X_test, y_test = scale_dataset(test, False)

## k-nearest neighbours (KNN)

In [315]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [316]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model = knn_model.fit(X_train, y_train)
knn_model

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [317]:
y_pred = knn_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       576
           1       0.85      0.89      0.87       345

    accuracy                           0.90       921
   macro avg       0.89      0.90      0.90       921
weighted avg       0.90      0.90      0.90       921



## Naive Bayes

In [318]:
from sklearn.naive_bayes import GaussianNB

In [319]:
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)
nb_model

0,1,2
,priors,
,var_smoothing,1e-09


In [320]:
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.76      0.84       576
           1       0.70      0.94      0.80       345

    accuracy                           0.82       921
   macro avg       0.82      0.85      0.82       921
weighted avg       0.86      0.82      0.83       921



## Logistic regression

In [321]:
from sklearn.linear_model import LogisticRegression

In [322]:
log_model = LogisticRegression()
log_model = log_model.fit(X_train, y_train)
log_model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [323]:
y_pred = log_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93       576
           1       0.88      0.91      0.89       345

    accuracy                           0.92       921
   macro avg       0.91      0.92      0.91       921
weighted avg       0.92      0.92      0.92       921



## SVM

In [324]:
from sklearn.svm import SVC

In [325]:
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)
svm_model

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [326]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       576
           1       0.91      0.91      0.91       345

    accuracy                           0.93       921
   macro avg       0.93      0.93      0.93       921
weighted avg       0.93      0.93      0.93       921

