In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
columns_to_keep = ['text', 'spam']
df = df[columns_to_keep]

In [5]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [6]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [7]:
df.dropna(subset=['text'], inplace=True)
df.dropna(subset=['spam'], inplace=True)

In [8]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [9]:
df.shape

(5728, 2)

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.shape

(5695, 2)

In [12]:
df['text']

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5695, dtype: object

In [13]:
df['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [14]:
import nltk
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords, words

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Text Preprocessing

In [18]:
def clean_text(text):
    remove_punc1 = [char for char in text if char not in string.punctuation]
    return remove_punc1

In [19]:
df['text'].head(1).apply(clean_text)

0    [S, u, b, j, e, c, t,  , n, a, t, u, r, a, l, ...
Name: text, dtype: object

In [20]:
def clean_text(text):
    remove_punc1 = [char for char in text if char not in string.punctuation]
    remove_punc2 = ''.join(remove_punc1)
    return remove_punc2

In [21]:
df['text'].head(1).apply(clean_text)

0    Subject naturally irresistible your corporate ...
Name: text, dtype: object

In [22]:
def clean_text(text):
    remove_punc1 = [char for char in text if char not in string.punctuation]
    remove_punc2 = ''.join(remove_punc1)
    stop_words_remove = [word for word in remove_punc2.split() if word.lower() not in stopwords.words('english')]
    return stop_words_remove

In [23]:
df['text'].head(5).apply(clean_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

# Feature Extraction

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [25]:
tf = TfidfVectorizer(analyzer=clean_text).fit_transform(df['text'])

In [26]:
from sklearn.model_selection import train_test_split as tts

In [27]:
xtrain,xtest,ytrain,ytest = tts(tf,df['spam'],test_size=0.3,random_state=45)

In [28]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# MultinomialNB

In [29]:
ml = MultinomialNB()

In [30]:
print(ytrain.dtype)

int64


In [31]:
ml.fit(xtrain, ytrain)

In [32]:
#ml.fit(xtrain,ytrain)

In [33]:
ml.score(xtest,ytest)

0.8899941486249269

In [34]:
ml.predict(xtest)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# BernoulliNB

In [35]:
bn = BernoulliNB()

In [36]:
bn.fit(xtrain,ytrain)

In [37]:
bn.score(xtest,ytest)

0.9824458747805734

# GaussianNB

In [38]:
#gs = GaussianNB()

In [39]:
#gs.fit(xtrain,ytrain)

In [40]:
tf

<5695x37229 sparse matrix of type '<class 'numpy.float64'>'
	with 562930 stored elements in Compressed Sparse Row format>

In [41]:
tf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [42]:
xtrain,xtest,ytrain,ytest = tts(tf.toarray(),df['spam'],test_size=0.3,random_state=45)

In [43]:
gs = GaussianNB().fit(xtrain,ytrain)

In [44]:
accuracy = gs.score(xtest,ytest)
accuracy

0.9584552369806905

THE END