## Exercise 4: Spam Classifier

### 1. Load data

In [1]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle

def load_obj(name):
    """Unpickle and return the object stored in ``./<name>.pkl``.

    Args:
        name: File stem (without the ``.pkl`` extension), resolved relative
            to the current working directory.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    pickle_path = './' + name + '.pkl'
    with open(pickle_path, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

In [2]:
# Load the four pickled splits (in the order train X/y, test X/y) and wrap
# each one in a NumPy array.
X_train, y_train, X_test, y_test = [
    np.array(load_obj(split_name))
    for split_name in ("X_train", "y_train", "X_test", "y_test")
]

# Sanity check: features and labels of each split should have equal length.
print("load data")
n_train_samples, n_train_labels = len(X_train), len(y_train)
n_test_samples, n_test_labels = len(X_test), len(y_test)
print("X_train:", n_train_samples, "\ty_train:", n_train_labels)
print("X_test:", n_test_samples, "\ty_test:", n_test_labels)

load data
X_train: 5481 	y_train: 5481
X_test: 1371 	y_test: 1371


### SGDClassifier

In [3]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import SGDClassifier

# Baseline linear classifier; the fixed random_state keeps runs repeatable.
sgd_clf = SGDClassifier(random_state=42)

# Evaluate both metrics in a single 3-fold pass. cross_validate fits each fold
# once and applies every requested scorer to it, instead of repeating the
# whole cross-validation (and all the fits) once per metric.
cv_scores = cross_validate(sgd_clf, X_train, y_train, cv=3, scoring=("f1", "accuracy"))
print("F1 Score:", cv_scores["test_f1"])
print("Accuracy:", cv_scores["test_accuracy"])

F1 Score: [0.98251192 0.98584906 0.98368298]
Accuracy: [0.98796499 0.99014778 0.98849945]


### Verify on Test Set Using SGDClassifier

In [4]:
# Refit a fresh SGD model on the full training set and check it on the
# held-out test set.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)

# Count the matching predictions with a vectorised NumPy reduction; the
# builtin sum() would walk the boolean array one element at a time in Python.
correct_count = np.count_nonzero(y_pred == y_test)

# Fraction of test samples predicted correctly (i.e. accuracy).
print("Predict:", correct_count / len(y_test))

Predict: 0.9978118161925602


### Try Different Classifiers

In [139]:
# GridSearchCV: exhaustive hyper-parameter search for a random forest
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)

# A single grid: 5 forest sizes x 3 max_features values, bootstrap always on
# (15 candidates in total, each cross-validated 3 times).
param_grid = [
    dict(
        n_estimators=[100, 300, 1000, 3000, 10000],
        max_features=[8, 9, 10],
        bootstrap=[True],
    ),
]

grid_search = GridSearchCV(
    forest_clf,
    param_grid,
    cv=3,
    scoring="f1",
    return_train_score=True,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
grid_search.fit(X_train, y_train)
cvres = grid_search.cv_results_

# Print every candidate with its mean cross-validated F1, best first.
# list.sort is stable, so tied candidates keep their original grid order.
ranked = list(zip(cvres['mean_test_score'], cvres['params']))
ranked.sort(key=lambda pair: pair[0], reverse=True)
for mean_score, params in ranked:
    print(mean_score, params)

0.9804508915385339 {'bootstrap': True, 'max_features': 10, 'n_estimators': 1000}
0.9793909236092351 {'bootstrap': True, 'max_features': 9, 'n_estimators': 3000}
0.9793880715119263 {'bootstrap': True, 'max_features': 10, 'n_estimators': 3000}
0.9793831256661922 {'bootstrap': True, 'max_features': 9, 'n_estimators': 10000}
0.9793831256661922 {'bootstrap': True, 'max_features': 10, 'n_estimators': 10000}
0.9793681588207632 {'bootstrap': True, 'max_features': 10, 'n_estimators': 300}
0.9791239542067468 {'bootstrap': True, 'max_features': 9, 'n_estimators': 1000}
0.9791142241129759 {'bootstrap': True, 'max_features': 8, 'n_estimators': 1000}
0.9788429641976099 {'bootstrap': True, 'max_features': 8, 'n_estimators': 3000}
0.9788429641976099 {'bootstrap': True, 'max_features': 8, 'n_estimators': 10000}
0.9785808400236481 {'bootstrap': True, 'max_features': 8, 'n_estimators': 100}
0.9785700718478565 {'bootstrap': True, 'max_features': 9, 'n_estimators': 300}
0.9780710849908693 {'bootstrap': Tru

### Predict Custom Email

In [7]:
# Classify one custom e-mail with the SGD model fitted earlier in the notebook.
custom_mail_indices = load_obj("custom_email")

# predict() takes a batch of samples, so wrap the single e-mail in a list.
custom_batch = [custom_mail_indices]
result = sgd_clf.predict(custom_batch)
print("Predict custom e-mail:", result)

Predict custom e-mail: [0]
