In [1]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
# Location of the pre-processed e-mail data set (JSON), relative to the working directory.
DATA_JSON_FILE = "UniversityProject_SpamFilter/01_Processing/email-text-data.json"

In [3]:
# Read the e-mail data set from the JSON file into a DataFrame.
data = pd.read_json(path_or_buf=DATA_JSON_FILE)

In [4]:
# Peek at the last rows to confirm the JSON loaded with the expected columns.
data.tail()

Unnamed: 0,Message,Category,File_Name
5791,"I'm one of the 30,000 but it's not working ver...",0,00609.dd49926ce94a1ea328cce9b62825bc97
5792,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,0,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",0,01127.841233b48eceb74a825417d8d918abf8
5794,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",0,01178.5c977dff972cd6eef64d4173b90307f0
5795,"If you run Pick, and then use the ""New FTOC"" b...",0,00747.352d424267d36975a7b40b85ffd0885e


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(5796, 3)

In [6]:
# Order the rows by index label. Rebinding the name (instead of `inplace=True`)
# keeps the step explicit and avoids mutating the frame behind earlier cells' backs;
# the resulting frame is the same.
data = data.sort_index()

In [7]:
# Bag-of-words vectorizer; scikit-learn's built-in English stop-word list is dropped
# from the vocabulary.
vectorizer = CountVectorizer(stop_words="english")

In [8]:
# Build the document-term matrix in a single scikit-learn call: learn the vocabulary
# from every message and count the term occurrences per message.
# NOTE(review): the vocabulary is fitted on all messages before the train/test split
# below, so test messages contribute to the vocabulary.
all_features = vectorizer.fit_transform(data['Message'])

In [9]:
# (documents, vocabulary terms) of the document-term matrix.
all_features.shape

(5796, 102694)

In [10]:
# vectorizer.vocabulary_  # uncomment to inspect the learned vocabulary (the tokens are not stemmed)

In [11]:
# Hold out 30% of the vectorised messages for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['Category'],
    test_size=0.3,
    random_state=88,
)

In [12]:
# Size of the training split: (documents, vocabulary terms).
X_train.shape

(4057, 102694)

In [13]:
# Size of the held-out test split: (documents, vocabulary terms).
X_test.shape

(1739, 102694)

In [14]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
classifier = MultinomialNB()

In [15]:
 classifier.fit(X_train, y_train)

MultinomialNB()

In [16]:
# Count the test documents whose predicted label equals the true label.
nr_correct = y_test.eq(classifier.predict(X_test)).sum()

In [17]:
# Inspect the predicted labels for the test split.
classifier.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [18]:
# Report how many test documents were labelled correctly.
print(f'{nr_correct} documents classified correctly')

1660 documents classified correctly


In [19]:
# Every test document that was not labelled correctly counts as a misclassification.
nr_incorrect = len(y_test) - nr_correct

In [20]:
# Report how many test documents were labelled incorrectly.
print(f'{nr_incorrect} documents classified incorrectly')

79 documents classified incorrectly


In [21]:
# Share of misclassified test documents; the accuracy is its complement.
fraction_wrong = nr_incorrect / (nr_incorrect + nr_correct)
print(
    f' The (testing) accuracy of the model is {1-fraction_wrong:.3%}'
)

 The (testing) accuracy of the model is 95.457%


In [22]:
# Built-in scoring on the test split, as a cross-check of the manually computed accuracy.
classifier.score(X_test, y_test)

0.9545715928694652

# Recall Score

In [23]:
# Recall on the test split: share of the truly positive (label 1) documents that
# the classifier also predicted as positive.
recall_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.8646209386281588

# Precision

In [24]:
# Precision on the test split: share of the documents predicted as positive (label 1)
# that really are positive.
precision_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.9917184265010351

# F1 Score

In [25]:
# F1 on the test split: harmonic mean of precision and recall.
f1_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.9238187078109932

In [26]:
# Hand-written sample messages used to try the trained classifier on unseen text.
example = [
    'get viagra for free now!',
    'need a mortgage? Reply to arrange. a call with a specialist and get a quote',
    'Could you please help me with the project for tomorrow? ',
    'Hello Jonathan, how about a game of Golf tomorrow? ',
    'Gaelic football (Irish: Peil Ghaelach; short name Peil[1] or Caid), commonly referred to as football, Gaelic or GAA,[2] is an Irish team sport. It is played between two teams of 15          players on a rectangular grass pitch. The objective of the sport is to score by kicking or punching the ball into the other team\'s goals (3 points) or between two upright posts above the goals and over a crossbar 2.5 metres (8 ft 2 in) above the ground (1 point).',
]

In [27]:
# Vectorise the sample messages with the already-fitted vectorizer (transform only,
# so the vocabulary is not refitted).
dtm = vectorizer.transform(example)

In [28]:
# Predict a label for each sample message.
classifier.predict(dtm)

array([1, 1, 0, 0, 0])