In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the e-mail dataset from a CSV located in the working directory.
# Its CATEGORY and MESSAGE columns are renamed in the following cell.
df = pd.read_csv('email.csv')

In [3]:
# Give the label and body columns short working names, then preview the
# last five rows.
column_aliases = {'CATEGORY': 'target', 'MESSAGE': 'text'}
df = df.rename(columns=column_aliases)
df.tail(5)

Unnamed: 0.1,Unnamed: 0,target,text
3995,4995,Not Spam,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...
3996,4996,Not Spam,"On Thursday 25 July 2002 06:16 am, Kylus wrote..."
3997,4997,Not Spam,Update of /cvsroot/spamassassin/spamassassin/w...
3998,4998,Not Spam,"On Thu, 2002-08-15 at 10:53, Erik Williamson w..."
3999,4999,Not Spam,"On Mon, 2002-08-12 at 06:32, Matthias Saou wro..."


In [4]:
from sklearn.preprocessing import LabelEncoder
# Encoder used in the next cell to turn the string labels in `target`
# into integer class codes.
encoder = LabelEncoder()

In [5]:
# Replace the string labels with integer codes (fits the encoder and
# transforms in one step).
# NOTE(review): presumably 'Not Spam' -> 0 and 'Spam' -> 1 because classes are
# sorted — confirm with encoder.classes_.
df['target'] = encoder.fit_transform(df['target'])

In [6]:
import nltk

In [7]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: 'punkt' backs word_tokenize / sent_tokenize, and 'stopwords'
# backs the stopwords.words('english') lookup used by transform_text.
# Previously only 'punkt' was fetched, so the stop-word lookup depended on
# the corpus already being installed on the machine.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Character count of each message.
df['numChar'] = df['text'].map(len)

In [9]:
def _count_words(message):
    """Return how many tokens nltk.word_tokenize finds in `message`."""
    return len(nltk.word_tokenize(message))


# Token count of each message.
df['numWord'] = df['text'].map(_count_words)

In [10]:
def _count_sentences(message):
    """Return how many sentences nltk.sent_tokenize finds in `message`."""
    return len(nltk.sent_tokenize(message))


# Sentence count of each message.
df['numSent'] = df['text'].map(_count_sentences)

In [11]:
def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and split it with ``nltk.word_tokenize``;
      2. keep only tokens for which ``str.isalnum()`` is true;
      3. drop English stop words;
      4. stem every remaining token with the notebook-level ``ps`` stemmer.

    Parameters
    ----------
    text : str
        Raw message body.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" when nothing survives).

    Notes
    -----
    Relies on the notebook-level names ``nltk``, ``stopwords`` and ``ps``,
    which are bound in other cells and must exist before this is called.
    """
    # Build the stop-word lookup once per call. Previously
    # stopwords.words('english') was re-evaluated for every token and the
    # resulting list was searched linearly.
    stop_words = set(stopwords.words('english'))

    # No separate string.punctuation test is needed: a token that passes
    # isalnum() contains no punctuation characters, so it can never be a
    # substring of string.punctuation.
    return " ".join(
        ps.stem(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )


In [12]:
from nltk.stem.porter import PorterStemmer
# Stemmer instance that transform_text reads as the global `ps`.
ps = PorterStemmer()
# Quick sanity check of the stemmer on a sample word.
ps.stem('loving')

'love'

In [13]:
from nltk.corpus import stopwords
# Display the English stop-word list that transform_text filters against.
# The 'stopwords' corpus must be installed in nltk_data for this lookup to succeed.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
import string 
# Display the punctuation characters that transform_text checks tokens against.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Run every message through transform_text and keep the cleaned string
# alongside the original.
df['transformed_text'] = df['text'].map(transform_text)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Two candidate text vectorizers, both with default settings.
# Only `tfidf` is used in the cells shown below; `cv` is created but not referenced there.
cv = CountVectorizer()
tfidf = TfidfVectorizer()

In [17]:
# Fit the TF-IDF vectorizer on every row of df and convert the result to a
# dense array.
# NOTE(review): the vectorizer is fitted before the train/test split made in a
# later cell, so the held-out rows contribute to the fitted statistics —
# consider fitting on the training portion only.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [18]:
# Encoded class labels, taken from the same frame (and row order) as X.
y = df['target'].values

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [21]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [22]:
# One default-configured instance of each naive Bayes variant to compare.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [23]:
# Gaussian naive Bayes: train, predict on the held-out rows, then report
# accuracy, confusion matrix and precision (in that order).
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

0.94125
[[580  29]
 [ 18 173]]
0.8564356435643564


In [24]:
# Multinomial naive Bayes: train, predict on the held-out rows, then report
# accuracy, confusion matrix and precision (in that order).
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

0.8875
[[608   1]
 [ 89 102]]
0.9902912621359223


In [25]:
# Bernoulli naive Bayes: train, predict on the held-out rows, then report
# accuracy, confusion matrix and precision (in that order).
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

0.90375
[[596  13]
 [ 64 127]]
0.9071428571428571


In [26]:
import pickle

# Persist the fitted TF-IDF vectorizer and the fitted MultinomialNB model.
# The context managers make sure each file is flushed and closed; the
# previous bare open(...) calls left the handles to the garbage collector.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [27]:
# Load the unlabelled messages and give the body column the same name the
# training frame uses.
test_df = pd.read_csv('email_test.csv').rename(columns={'MESSAGE': 'text'})

In [28]:
# Clean the test messages with the same transform_text used for training,
# then show summary statistics of the frame.
test_df['transformed_text'] = test_df['text'].map(transform_text)
test_df.describe()

Unnamed: 0.1,Unnamed: 0
count,1000.0
mean,499.5
std,288.819436
min,0.0
25%,249.75
50%,499.5
75%,749.25
max,999.0


In [29]:
# Vectorize the cleaned test messages with the already-fitted TF-IDF model
# (transform only, no refit) and densify the result.
X2 = tfidf.transform(test_df['transformed_text']).toarray()

# Class codes predicted by the multinomial naive Bayes model.
y_sol = mnb.predict(X2)

# Translate the integer codes into string labels for the output column.
# NOTE(review): the training labels shown earlier read 'Not Spam'; confirm
# that 'ham'/'spam' is the vocabulary the consumer of this file expects.
mapping = {0: 'ham', 1: 'spam'}
test_df['CATEGORY'] = pd.Series(y_sol, index=test_df.index).map(mapping)


In [30]:
# Destination of the prediction file, relative to the working directory.
output_csv_path = 'outputQ72_data.csv'
# Write the whole test frame (including the new CATEGORY column) without the
# pandas row index.
test_df.to_csv(output_csv_path, index=False)