In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [42]:
dataset=pd.read_csv('emails.csv')

In [21]:
dataset.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [31]:
dataset.text[0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [22]:
dataset.shape

(5728, 2)

In [23]:
#dropping duplicates
dataset.drop_duplicates(inplace=True)

In [24]:
dataset.shape

(5695, 2)

In [25]:
#Checking for any null entries in the dataset
dataset.isnull().sum()

text    0
spam    0
dtype: int64

In [26]:
#Checking class distribution
dataset['spam'].value_counts()

0    4327
1    1368
Name: spam, dtype: int64

### Cleaning the texts

In [43]:
#Every mail starts with 'Subject :' lets remove this from each mail

dataset['text']=dataset['text'].map(lambda x:x[x.find(':'):])

In [50]:
dataset['text']=dataset['text'].apply(lambda x: re.sub('[^a-zA-Z0-9]+',' ',x)).apply(lambda x:x.lower().split())

In [87]:
#stemming and removing stop words
ps=PorterStemmer()
corpus=dataset['text'].apply(lambda word_list:' '.join([ps.stem(w) for w in word_list if w not in set(stopwords.words('english'))]))

In [76]:
# corpus=[]
# def fun(i):
#     #return (list(filter(lambda text:text not in set(stopwords.words('english')),i)))
#     return list(map(lambda word:ps.stem(word),(list(filter(lambda text:text not in set(stopwords.words('english')),i)))))
# corpus= dataset['text'][0:5].apply(lambda i: fun(i))

In [113]:
corpus[:5]

0    natur irresist corpor ident lt realli hard rec...
1    stock trade gunsling fanni merril muzo colza a...
2    unbeliev new home made easi im want show homeo...
3    4 color print special request addit inform cli...
4    money get softwar cd softwar compat great grow...
Name: text, dtype: object

In [88]:
corpus.shape

(5728,)

In [89]:
# Creating the Bag of Words model
cv = CountVectorizer()
cv.fit(corpus.values)
X = cv.transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values

In [92]:
pickle.dump(cv,open('models/cv.pkl','wb'))

In [90]:
X.shape

(5728, 29222)

In [93]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [94]:
# Fitting classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train , y_train)

MultinomialNB()

In [95]:
#pickle.dump(classifier,open('models/clf.pkl','wb'))

In [96]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [100]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(accuracy_score(y_test,y_pred))

0.9930191972076788


In [99]:
cm

array([[871,   7],
       [  1, 267]], dtype=int64)

In [104]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print('10Fold Accuracies: ',accuracies)
print('Mean Accuracy: ',accuracies.mean())
print('Std Accuracy: ',accuracies.std())

10Fold Accuracies:  [0.9956427  0.98474946 0.98471616 0.98908297 0.98908297 0.98689956
 0.99126638 0.98471616 0.98908297 0.98689956]
Mean Accuracy:  0.9882138881753575
Std Accuracy:  0.003266439836394249


In [114]:
#fitting the full model
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X , y)
pickle.dump(clf,open('models/final_clf.pkl','wb'))