In [1]:
# data manimulation and visualization libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#avoid unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# python modules
import re
import string
import math

In [3]:
# text cleaning libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [4]:
#model libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,fbeta_score, confusion_matrix

In [5]:
data = pd.read_csv('emails.csv')

In [6]:
#data.head()

In [7]:
data.rename(columns={'spam':'class'},inplace=True)
data['label'] = np.where(data['class']==1,'spam','ham')

In [8]:
# DROP DUPLICATE VALUES
#data.drop_duplicates().reset_index(drop=True)
data.drop_duplicates(inplace=True)

In [9]:
# process text
stop_words = set(stopwords.words('english'))

stop_words.add('subject')
stop_words.add('http')

def preprocessing_text(x):
    #lower case
    x = x.lower()
    
    #email-id
    x = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailadd',x)
    
    #url
    x = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'urladd', x)
    
    #money symbols
    x = re.sub('£|\$', 'moneysymbols', x)
    
    #remove number and words containing no.
    # x = re.sub(r'\d+','',x)
    x = re.sub('\W*\d\w*','',x)
        
    #remove punctuation
    x = re.sub('[%s]' % re.escape(string.punctuation), '' , x)
    
    #remove leading and ending space(extra white spaces)
    #x = x.strip()
    x = re.sub(' +',' ',x)
    
    #remove stopword
    x = ' '.join([word for word in word_tokenize(x) if not word in stop_words])
    return x

#apply preprocessing text on text
data['text'] = data['text'].apply(lambda x: preprocessing_text(x))

In [10]:
ps = PorterStemmer()
#lemm = WordNetLemmatizer()

data['text'] = data['text'].apply(lambda x: ' '.join([ps.stem(word) for word in word_tokenize(x) if not word in stop_words]))

In [11]:
data['text'].shape

(5702,)

In [12]:
# extract dtv
cVect = CountVectorizer()

In [13]:
X = cVect.fit_transform(data['text']).toarray()
#X

In [14]:
y = data['class']
print(y.value_counts())

0    4331
1    1371
Name: class, dtype: int64


In [15]:
# first taking out independent and dependent variable
#x = data['text'].values
#y = data['class']
#print("shape of x:",X.shape)
#print("shape of y:",y.shape)

In [16]:
#x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=0)
print("size of training data:",x_train.shape)
print("size of testing data:",x_test.shape)

size of training data: (4561, 25645)
size of testing data: (1141, 25645)


In [17]:
from sklearn.naive_bayes import GaussianNB
bayes_classifier = GaussianNB()

In [18]:
bayes_classifier.fit(x_train,y_train)

y_pred = bayes_classifier.predict(x_test)

In [19]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
print ("Accuracy : %0.5f \n\n" % accuracy_score(y_test, bayes_classifier.predict(x_test)))

[[845  30]
 [ 22 244]]
Accuracy : 0.95443 




In [20]:
# Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

In [21]:
dt_model.fit(x_train,y_train)

prediction = dt_model.predict(x_test)

In [22]:
print("accuracy_score:",accuracy_score(y_test,prediction))
print("confusion_matrix:\n",confusion_matrix(y_test,prediction))

accuracy_score: 0.9605609114811569
confusion_matrix:
 [[855  20]
 [ 25 241]]


In [28]:
from sklearn.svm import SVC
svm_clf = SVC()

svm_clf.fit(x_train, y_train)

y_pred = svm_clf.predict(x_test)

In [29]:
print("model_accuracy:",accuracy_score(y_test,y_pred))
print("fbeta score:",fbeta_score(y_test,y_pred,beta=0.5))
print("confusion_matrix:\n",confusion_matrix(y_test,y_pred))

model_accuracy: 0.9745836985100789
fbeta score: 0.9585289514866979
confusion_matrix:
 [[867   8]
 [ 21 245]]


In [23]:
#naive_bayes
spam_detect_model = MultinomialNB()
spam_detect_model.fit(x_train,y_train)

y_pred = spam_detect_model.predict(x_test)

In [24]:
print("model_accuracy:",accuracy_score(y_test,y_pred))
print("fbeta score:",fbeta_score(y_test,y_pred,beta=0.5))
print("confusion_matrix:\n",confusion_matrix(y_test,y_pred))

model_accuracy: 0.9824715162138475
fbeta score: 0.9469153515064562
confusion_matrix:
 [[857  18]
 [  2 264]]


In [25]:
import pickle

In [26]:
pickle.dump(spam_detect_model,open('spam_mail.pkl','wb'))  
 #this pickle file is our machine learning model pickle file.

In [27]:
pickle.dump(cVect,open('emvec.pkl','wb'))