In [1]:
# importing required modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the e-mail corpus (decoded as latin-1) and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='emails.csv', encoding='latin-1')
dataset.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [6]:
# Class balance: number of rows for each value of the `spam` label
dataset["spam"].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [7]:
# for splitting dataset into train set and test set
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset["text"],
    dataset["spam"],
    test_size=0.2,
    random_state=10,
)

In [10]:
# Show the size of each split, one per line: train texts, test texts,
# train labels, test labels.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(4582,)
(1146,)
(4582,)
(1146,)


In [11]:
# for vectorizing words
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words vectorizer that drops English stop words; its vocabulary is
# learned from the training texts only.
vect = CountVectorizer(stop_words='english')
vect.fit(raw_documents=X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
# Preview the first and last 20 vocabulary terms in feature (column) order.
# `vocabulary_` maps each term to its column index, so sorting the terms by
# that index yields the feature names on any scikit-learn version;
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
print(feature_names[0:20])
print(feature_names[-20:])

['00', '000', '0000', '00000000', '000000000003619', '000000000003991', '000000000003997', '000000000005168', '000000000005411', '000000000005413', '000000000005820', '000000000006238', '000000000007494', '000000000007498', '000000000007876', '000000000010552', '000000000011185', '000000000012677', '000000000012735', '000000000012736']
['zunaechst', 'zunf', 'zur', 'zurich', 'zusaetzlich', 'zuzana', 'zwabic', 'zwischen', 'zwlaszcza', 'zwrocic', 'zwwyw', 'zwzm', 'zxghlajf', 'zyban', 'zyc', 'zygoma', 'zymg', 'zzn', 'zzncacst', 'zzzz']


In [14]:
# Encode both splits (train first, then test) with the fitted vectorizer.
# Despite the `_df` suffix these are not DataFrames; the type is shown below.
X_train_df, X_test_df = map(vect.transform, (X_train, X_test))
type(X_test_df)

scipy.sparse.csr.csr_matrix

In [15]:
# Predictions are collected in this dict, keyed by model name
prediction = {}
# Naive Bayes Machine Learning Model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# Train on the vectorized training texts and their labels
model.fit(X=X_train_df, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Store the model's labels for the vectorized test set under "naive_bayes"
prediction.update(naive_bayes=model.predict(X_test_df))

In [17]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [18]:
# Fraction of test e-mails whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=prediction["naive_bayes"])

0.99127399650959858

In [19]:
# Per-class precision / recall / F1 on the test set, with the two classes
# displayed as "Ham" and "Spam".
print(
    classification_report(
        y_true=y_test,
        y_pred=prediction['naive_bayes'],
        target_names=["Ham", "Spam"],
    )
)

             precision    recall  f1-score   support

        Ham       0.99      1.00      0.99       861
       Spam       0.99      0.98      0.98       285

avg / total       0.99      0.99      0.99      1146



In [21]:
def classify(user_input):
    """Predict whether a single piece of text is spam.

    The text is vectorized with the module-level ``vect`` and scored by the
    module-level ``model``; both must already be fitted by earlier cells.

    Parameters
    ----------
    user_input : str
        Raw text of the message to classify.

    Returns
    -------
    The result of ``model.predict`` for a one-document batch. Callers in this
    notebook treat a truthy result as spam and a falsy one as ham.
    """
    # `vect.transform` expects an iterable of documents, so the single string
    # is wrapped in a list. The earlier DataFrame round trip relied on
    # `pd.DataFrame.from_items`, which was removed in pandas 1.0.
    return model.predict(vect.transform([user_input]))

# Ask for a message and print the classifier's verdict for it
text = input('Type here something and see if it belongs to spam: ')
print('spam' if classify(text) else 'ham')

Type here something and see if it belongs to spam: IMPORTANT - You could be entitled up to £3,160 in compensation from mis-sold PPI on a credit card or loan. Please reply PPI for info or STOP to opt out.
spam


In [22]:
# Ask for another message and print the classifier's verdict for it
text = input('Type here something and see if it belongs to spam: ')
print('spam' if classify(text) else 'ham')

Type here something and see if it belongs to spam: Hey there! I hope you're free tonight, let's catch up at dinner!
ham
