In [19]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
DATA_JSON_FILE = 'SpamData\\01_Processing\\email-text-data.json'

In [5]:
data = pd.read_json(DATA_JSON_FILE)

In [6]:
data.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
10,1,"Dear Consumers, Increase your Business Sales! ...",00159.b16f070a576c2eb1533aa9e2cf8e6b77
100,1,<HTML><TABLE WIDTH=100% BORDER=0 CELLPADDING=0...,00406.05e2214fea602970426862295f9b4a2e
1000,1,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...,00056.64a6ee24c0b7bf8bdba8340f0a3aafda


In [8]:
vectorizer = CountVectorizer(stop_words='english')

In [9]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [10]:
all_features.shape

(5796, 102694)

In [11]:
vectorizer.vocabulary_

{'dear': 32719,
 'homeowner': 48034,
 'rates': 76350,
 'lowest': 59365,
 'point': 72297,
 '40': 7824,
 'years': 98506,
 'help': 47200,
 'best': 23129,
 'rate': 76347,
 'situation': 82318,
 'matching': 60930,
 'needs': 64750,
 'hundreds': 48607,
 'lenders': 58021,
 'home': 48006,
 'improvement': 51399,
 'refinance': 77074,
 'second': 80968,
 'mortgage': 63026,
 'equity': 38990,
 'loans': 59058,
 'perfect': 70478,
 'credit': 30975,
 'service': 81359,
 '100': 1496,
 'free': 42773,
 'owners': 68715,
 'new': 64988,
 'buyers': 25617,
 'obligation': 66813,
 'just': 55049,
 'quick': 75547,
 'simple': 82172,
 'form': 42425,
 'jump': 55000,
 'start': 84135,
 'future': 43330,
 'plans': 71939,
 'today': 88039,
 'visit': 92921,
 'http': 48497,
 '61': 10092,
 '145': 2275,
 '116': 1873,
 '186': 2748,
 'user0201': 91339,
 'index': 51639,
 'asp': 20429,
 'afft': 17606,
 'qm10': 75108,
 'unsubscribe': 90955,
 'light': 58472,
 'watch': 94281,
 'attention': 20740,
 'computer': 29755,
 'users': 91367,
 'sp

In [13]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY,
                                                    test_size=0.3, random_state=88)

In [16]:
all_features.shape

(5796, 102694)

In [18]:
all_features

<5796x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 704684 stored elements in Compressed Sparse Row format>

In [20]:
classifier = MultinomialNB()

In [21]:
classifier.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [26]:
print(f'{nr_correct} documents classfied correctly')

1650 documents classfied correctly


In [28]:
nr_incorrect = y_test.size - nr_correct
nr_incorrect

89

In [31]:
fraction_worng = nr_incorrect/(nr_correct + nr_incorrect)
print(f'{1-fraction_worng:.2},accuracy of the model')

0.95,accuracy of the model


In [33]:
from sklearn.metrics import recall_score,precision_score,confusion_matrix,classification_report,f1_score

In [34]:
recall_score(y_test,classifier.predict(X_test))

0.8453038674033149

In [35]:
precision_score(y_test,classifier.predict(X_test))

0.9892241379310345

In [36]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[1191,    5],
       [  84,  459]], dtype=int64)

In [37]:
f1_score(y_test,classifier.predict(X_test))

0.9116186693147964

In [41]:
example = ['could you please help me with the project for tommoro',
          'These plots were generated with the default matplotlib parameters,plus a default colormap that was set to gray-scale and',
          'get viagra for free now!',
          'need a mortgage? Replay to arrange a call with a specilist and get a quote']

In [42]:
doc_term_matrix = vectorizer.transform(example)

In [43]:
classifier.predict(doc_term_matrix)

array([0, 0, 1, 1], dtype=int64)