In [1]:
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Make sure the NLTK stop-word corpus is available locally before it is loaded below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the labelled email dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("spam_ham_dataset.csv")

In [4]:
# Preview the first rows to see the available columns and the raw text format.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [5]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

**Replacing the `\r\n` line breaks in the `text` column with spaces**

In [7]:
# Swap every Windows line break for a single space (literal match, not a regex).
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [8]:
# Preview the rows again to check the text column after the line-break replacement.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs this deal is to b...,0


In [9]:
# Text-normalisation tools shared by the corpus build below and by later cells.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

# Build one cleaned string per email: lowercase, strip punctuation, split on
# whitespace, drop English stop words, stem what is left, and re-join with spaces.
corpus = []
for text in df['text']:
    tokens = text.lower().translate(punctuation_table).split()
    text = ' '.join(stemmer.stem(word) for word in tokens if word not in stopwords_set)
    corpus.append(text)

In [10]:
# Bag-of-words features: one row per email, one column per vocabulary term.
vectorizer = CountVectorizer()

# Keep the sparse matrix that fit_transform returns rather than calling .toarray().
# A dense document-term matrix stores every zero count explicitly and can consume a
# very large amount of memory, while train_test_split and MultinomialNB both accept
# sparse input directly (the single-email prediction further down already passes the
# sparse result of vectorizer.transform to the model).
X = vectorizer.fit_transform(corpus)
y = df.label_num

In [11]:
# Hold out 20% of the emails for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
mnb = MultinomialNB()

In [13]:
# Fit the classifier on the training split only.
mnb.fit(X_train, y_train)

In [14]:
# Mean accuracy on the held-out test split.
mnb.score(X_test, y_test)

0.9748792270531401

In [15]:
# Pick one email (row position 19) from the dataset to run through the model by hand.
email_to_classify = df['text'].iloc[19]

In [16]:
# Show the selected email's text.
email_to_classify

"Subject: additional recruiting i ' m happy to introduce molly magee as the newest addition to the eops recruiting team . toni and molly have divided their recruiting duties along separate job functions . please review the information below and direct your staffing requests to either toni or molly depending on your job needs . toni graham - accounting , risk and confirmation / settlements positions ( or openings requiring a similar skill set of this candidate pool ) molly magee - logistics , global data management , research , legal , competitive analysis , contract administration and other positions ( or openings requiring a similar skill set of this candidate pool ) thanks for your assistance , hgm"

In [17]:
# Apply the same preprocessing used for the training corpus to the selected email:
# lowercase, strip punctuation, split on whitespace, drop stop words, stem, re-join.
email_tokens = email_to_classify.lower().translate(str.maketrans('', '', string.punctuation)).split()

# Iterate over THIS email's tokens. The previous version looped over `text`, the
# leftover variable from the corpus-building cell (a string), so it stemmed single
# characters of an unrelated email and the vectorizer never saw the selected email.
email_text = ' '.join(stemmer.stem(word) for word in email_tokens if word not in stopwords_set)

email_corpus = [email_text]

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the feature
# columns line up with the ones the model was trained on.
X_email = vectorizer.transform(email_corpus)

In [18]:
# Predict the numeric label for the selected email (compare with label_num below).
mnb.predict(X_email)

array([0])

In [19]:
# Ground-truth label of the same row (position 19) for comparison with the prediction.
df.label_num.iloc[19]

0