In [1]:
import string

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df= pd.read_csv("spam_ham_dataset.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


### Data Processing & Cleaning

In [5]:
# Replace Windows line breaks with a space so each email is one line of text.
# The vectorised .str accessor propagates missing values instead of raising
# (the previous per-row lambda would fail on a non-string cell), and
# regex=False makes the pattern a plain literal rather than a regex.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [6]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs this deal is to b...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft the transport vo...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms hpl c...,0
5168,2933,ham,Subject: calpine daily gas nomination > > juli...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
# Element-wise null mask for every cell of df. The rendered frame is truncated,
# so on its own it cannot confirm that no value is missing.
# NOTE(review): df.isnull().sum() would report per-column null counts instead.
df.isnull()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
5166,False,False,False,False
5167,False,False,False,False
5168,False,False,False,False
5169,False,False,False,False


In [9]:
stemmer= PorterStemmer()

In [10]:
# Quick check of the stemmer: an inflected word is reduced to its stem, which
# shrinks the number of distinct words the model has to deal with.
stemmer.stem('developing')

'develop'

In [11]:
# Build the cleaned corpus: one normalised string per email, in row order.
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

corpus = []
for raw_email in df['text']:
    # lower-case -> strip punctuation -> split on whitespace
    tokens = raw_email.lower().translate(punctuation_table).split()
    # drop English stop words, then stem what is left
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    corpus.append(' '.join(stems))

#### Below is the actual email vs converted email

In [12]:
# Actual Email
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [13]:
# Converted Email
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

### Vectorizing

In [14]:
vectorizer= CountVectorizer()

### Train Test Split

In [15]:
# Turn the cleaned corpus into a bag-of-words count matrix. The result is kept
# as the SciPy sparse matrix that CountVectorizer returns: train_test_split and
# RandomForestClassifier both accept sparse input, whereas a dense copy of a
# (documents x vocabulary) matrix that is almost entirely zeros wastes memory.
# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so words that only occur in test emails still shape the feature space.
x = vectorizer.fit_transform(corpus)
y = df.label_num

# A fixed random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the ham/spam ratio the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# n_jobs=-1 uses all CPU cores for faster training. random_state pins the
# forest's internal randomness (bootstrap samples and per-split feature
# subsets) so that re-running the notebook trains the same model.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

clf.fit(x_train, y_train)

RandomForestClassifier(n_jobs=-1)

In [17]:
clf.score(x_test, y_test)

0.9768115942028985

## Here our model has an accuracy of about 0.98 on the held-out test set, so we can go ahead and test it with some actual emails.

## Now let's check our model's predictions using a few emails from the dataset.

In [18]:
def vector_fun(email_to_classify):
    """Return the bag-of-words vector for a single raw email.

    The text is lower-cased, stripped of punctuation, split on whitespace,
    filtered against ``stopwords_set`` and stemmed with ``stemmer``. The
    surviving stems are joined with spaces and passed to the module-level
    ``vectorizer`` as a one-element list.

    Parameters
    ----------
    email_to_classify : str
        Raw email text.

    Returns
    -------
    The result of ``vectorizer.transform`` for that single document.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = email_to_classify.lower().translate(strip_punctuation).split()

    stems = []
    for word in words:
        if word not in stopwords_set:
            stems.append(stemmer.stem(word))

    return vectorizer.transform([' '.join(stems)])

#### Test 1

In [19]:
# Take the email at positional index 2 (the third row of df) for testing.
# NOTE(review): the train/test split is random over all rows, so this email may
# have been part of the training data; treat this as a sanity check.
x_email= vector_fun(df.text.values[2])

df.iloc[2] # It's not a spam mail (label is ham).

Unnamed: 0                                                 3624
label                                                       ham
text          Subject: neon retreat ho ho ho , we ' re aroun...
label_num                                                     0
Name: 2, dtype: object

In [20]:
# Actual
df.label_num.iloc[2]

0

In [21]:
# prediction
clf.predict(x_email)[0]

0

#### Test 2

In [22]:
# Take the email at positional index 10 (the eleventh row of df) for testing.
# NOTE(review): the train/test split is random over all rows, so this email may
# have been part of the training data; treat this as a sanity check.
x_email= vector_fun(df.text.values[10])

df.iloc[10] # It's a spam mail.

Unnamed: 0                                                 4922
label                                                      spam
text          Subject: vocable % rnd - word asceticism vcsc ...
label_num                                                     1
Name: 10, dtype: object

In [23]:
# Actual
df.label_num.iloc[10]

1

In [24]:
# prediction
clf.predict(x_email)[0]

1