In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input mount, one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook serves two purposes:

1. As the Intro notebook for my first dataset on kaggle. 🎉
2. My solution to the classification exercise I have been working through in the book <b style="color:#900C3F">Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems</b>. The exercise is about building a spam classifier using the [Spam Assassin Public Dataset](https://homl.info/spamassassin). I have already compiled a comprehensive dataset from that source, though.

**I hope you find this notebook insightful 😊**

#### Question

Build a spam classifier (a more challenging exercise):

-  Download examples of spam and ham from Apache SpamAssassin’s public datasets. [link](https://homl.info/spamassassin)
-  Unzip the datasets and familiarize yourself with the data format.
-  Split the datasets into a training set and a test set.
-  Write a data preparation pipeline to convert each email into a feature vector. Your preparation pipeline should transform an email into a (sparse) vector that indicates the presence or absence of each possible word. For example, if all emails only ever contain four words, “Hello,” “how,” “are,” “you,” then the email “Hello you Hello Hello you” would be converted into a vector [1, 0, 0, 1] (meaning [“Hello” is present, “how” is absent, “are” is absent, “you” is present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of each word. 

You may want to add hyperparameters to your preparation pipeline to control whether or not to strip off email headers, convert each email to lowercase, remove punctuation, replace all URLs with “URL,” replace all numbers with “NUMBER,” or even perform stemming (i.e., trim off word endings; there are Python libraries available to do this).

**Download examples of spam and ham from Apache SpamAssassin’s public datasets.✔️**

**Unzip the datasets and familiarize yourself with the data format.✔️**

In [None]:
# Read the SpamAssassin e-mail CSV from the attached Kaggle dataset into a DataFrame.
dataset = pd.read_csv(
    "../input/spam-assassin-email-classification-dataset/spam_assassin.csv"
)

In [None]:
# Preview the first five rows to get familiar with the data format.
dataset.head(n=5)

In [None]:
# Separate the e-mail text column from the label column.
data = dataset["text"]
target = dataset["target"]

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified 80/20 shuffle split; random_state pins the shuffle so the
# split is reproducible across runs.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# .loc would interpret the integers as index labels, which only happens to
# work when the index is the default 0..n-1 RangeIndex.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(sss.split(data, target))
train_X, test_X = data.iloc[train_index], data.iloc[test_index]
train_y, test_y = target.iloc[train_index], target.iloc[test_index]

**Split the datasets into a training set and a test set.✔️**

I already went through the process of extracting irrelevant and meaningless words from the spam dataset and serializing them with pickle. After loading them, we can combine them with `nltk.corpus.stopwords.words('english')` to create a much better set of stopwords for the `TfidfVectorizer`.

In [None]:
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (a bare open(...) passed straight to pickle.load is never closed explicitly).
with open('../input/spamassassin-stopwords/spamassassin_stopwords.p', 'rb') as stopwords_file:
    spam_assassin_stopwords = pickle.load(stopwords_file)

# NLTK's built-in English stop-word list.
english_stopwords = stopwords.words('english')

In [None]:
# Combine the dataset-specific stop words with NLTK's English list.
# NOTE(review): `+` assumes the unpickled object is a list (like the NLTK
# list) — confirm; duplicates between the two lists are not removed here.
stop_words = spam_assassin_stopwords + english_stopwords

In [None]:
# TF-IDF vectoriser: tokens are runs of 4 to 12 ASCII letters delimited by
# word boundaries, and the combined stop-word list is excluded.
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b([a-zA-Z]{4,12})\b',
    stop_words=stop_words,
)

In [None]:
# Fit the vectoriser on the training e-mails only, so the test split does not
# influence what it learns.
tfidf.fit(train_X)

**Write a data preparation pipeline to convert each email into a feature vector.✔️**

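The exercise asks for a vector of word presence/absence or word counts (which is what `CountVectorizer` produces), but I opted for the `TfidfVectorizer` because, in addition to term frequency, it down-weights words that appear in many emails (inverse document frequency).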

## Choosing a model

Now that we've accomplished that, let's proceed to testing these features with a couple of classifiers. I will be working with just two classifiers: the decision tree and the random forest.

In [None]:
# Convert the raw training e-mails into TF-IDF feature vectors.
# NOTE(review): this rebinds train_X from raw text to the transformed matrix,
# so the cell cannot be re-run without first re-running the split cell.
train_X = tfidf.transform(train_X)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
# Baseline: 5-fold cross-validation of a single decision tree (default scorer).
# NOTE(review): train_X was vectorised by a TF-IDF fitted on the whole training
# set, so the IDF statistics are shared across the folds.
dt_clf = DecisionTreeClassifier(random_state=0)
cross_val_score(estimator=dt_clf, X=train_X, y=train_y, cv=5, n_jobs=3)

In [None]:
# Same 5-fold cross-validation for a random forest with default hyperparameters.
rf_clf = RandomForestClassifier(random_state=0)
cross_val_score(estimator=rf_clf, X=train_X, y=train_y, cv=5, n_jobs=3)

Since the random forest model performs best, we'll use it. Let's now move on to fine-tuning the model.

## Fine-tuning the model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid: forest sizes 100..350 in steps of 50, with and without
# bootstrap sampling.
space = dict(
    n_estimators=range(100, 351, 50),
    bootstrap=[True, False],
)

In [None]:
# Exhaustive search over `space` with 3-fold CV, scored by accuracy, using all
# available cores.
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=space,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the TF-IDF training features.
grid_search.fit(train_X, train_y)

In [None]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Keep a handle on the estimator the grid search selected.
model = grid_search.best_estimator_

In [None]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the full training set with the winning hyperparameters. Calling
# model.fit(train_X, train_y) again only repeated that (expensive) work and
# produced the same forest because random_state is fixed. Just display the
# already-fitted model.
model

In [None]:
# Vectorise the held-out e-mails with the vectoriser fitted on the training set.
# NOTE(review): this rebinds test_X from raw text to the transformed matrix,
# so the cell cannot be re-run without first re-running the split cell.
test_X = tfidf.transform(test_X)

In [None]:
# Predict labels for the held-out test features.
predictions = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but keeping the documented order avoids silently wrong numbers
# if this is ever swapped for an asymmetric metric (precision, recall, ...).
accuracy_score(test_y, predictions)

🎉... **99% Accuracy on the test set!!!**

Well, that concludes this exercise for me! I can't wait to see what the pros can make of this 😊.