## Description

#### This notebook trains a Multinomial Naive Bayes classifier that labels an email as spam (1) or not spam (0)

In [30]:
# importing the necessary/required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
import string
from nltk.corpus import stopwords

In [31]:
# read the spam e-mail CSV from the working directory into a pandas dataframe
df = pd.read_csv("spam-emails-dataset.csv")

# optional column selection, only relevant for CSVs that carry a text 'Category' column
# df = df[['Category', 'Message']]

# preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,Message,spam
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


In [32]:
# ML models understand only numbers, not strings, so both the label and the
# message text have to be turned into numeric form before training.
#
# This dataset already provides a numeric 'spam' column (1 = spam, 0 = not spam),
# so the label conversion below is only needed for a CSV that uses a text 'Category' column:
# df['spam'] = df['Category'].apply(lambda x : 1 if x == 'spam' or x == 1 else 0)
# df.head()

In [33]:
# data exploration (disabled: written for a dataset with a 'Category' column, which this CSV does not have)
# df.groupby('Category').describe()

In [34]:
# showing the shape of the freshly loaded dataframe as (number of rows, number of columns)
df.shape

(5726, 2)

In [35]:
# showing the column labels of the dataframe
df.columns

Index(['Message', 'spam'], dtype='object')

In [36]:
# drop fully duplicated rows, keeping the first occurrence of each;
# the de-duplicated frame is bound back to `df` rather than mutating it in place
df = df.drop_duplicates()

In [37]:
# showing the shape again, as (rows, columns), to see how many rows the duplicate removal dropped
df.shape

(5693, 2)

In [38]:
# count the missing values (NaN / None) in each column
df.isna().sum()

Message    0
spam       0
dtype: int64

In [39]:
# downloading the 'stopwords' package using nltk;
# it must be available locally before stopwords.words('english') is used in processEmail
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rehbe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# creating a custom function which will process the text
def processEmail(email, stopWords=None):
    """Strip punctuation and stopwords from an e-mail and return the remaining words.

    Parameters
    ----------
    email : str
        Raw message text.
    stopWords : iterable of str, optional
        Lower-cased words to discard. When omitted, NLTK's English stopword
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The whitespace-separated words of ``email`` with every character in
        ``string.punctuation`` removed and every stopword dropped. Words keep
        their original casing; the stopword comparison is case-insensitive.
    """
    # step-01: remove punctuation marks
    noPuncText = ''.join(char for char in email if char not in string.punctuation)

    # step-02: remove stopwords
    # The stopword list is fetched ONCE per call and stored in a set. Calling
    # stopwords.words('english') inside the comprehension condition rebuilt the
    # list and scanned it linearly for every single word of every e-mail.
    if stopWords is None:
        stopWords = stopwords.words('english')
    stopWordSet = set(stopWords)
    cleanWords = [word for word in noPuncText.split() if word.lower() not in stopWordSet]

    # step-03: return a list of clean text words
    return cleanWords

In [41]:
# show the tokenization of the first five messages - each message becomes a list of tokens
# with punctuation and stopwords removed (the words are not lemmatized)
df['Message'].head().apply(processEmail)

0    [naturally, irresistible, corporate, identity,...
1    [stock, trading, gunslinger, fanny, merrill, m...
2    [unbelievable, new, homes, made, easy, im, wan...
3    [4, color, printing, special, request, additio...
4    [money, get, software, cds, software, compatib...
Name: Message, dtype: object

In [42]:
# splitting the samples into train and test datasets respectively... setting the test size to 25%
# random_state pins the shuffle so the split (and therefore the reported accuracy) is the same on every run;
# stratify keeps the spam / not-spam proportion identical in the train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

In [43]:
# hand-written sample messages that are not part of the dataset;
# they are passed to clf.predict further down to try the trained classifier on unseen text
emails = [
    'WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
    "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
    'Hi there, there are sexy singles in your area that want to meet you. Kindly reply with your details below so that I can connect you to amazing single ladies that will show you the best time of your life uwu. I will be needing your Credit Card details, your full name, your social security number, your phone number, and finally your address. Kindly reply to me as fast as u can then I can process your information and make sure you get connected to a lovely lady.',
    'Hi, emailing you in response to the thing we discussed yesterday. You can find my CV attached'
]

In [46]:
# building the classifier as a two-step pipeline
# NOTE(review): CountVectorizer is created with its default settings, so the custom
# processEmail function is not used by this pipeline - confirm that is intended.

clf = Pipeline(steps=[
    # step-1: turn each raw message into a vector of token counts
    ('vectorizer', CountVectorizer()),
    # step-2: fit a multinomial naive Bayes model on those counts
    ('nb', MultinomialNB()),
])

In [47]:
# training the classifier

# X_train holds the raw text from the 'Message' column. The pipeline's 'vectorizer' step turns that
# text into a count matrix and the 'nb' step is then trained on that matrix, so no manual
# vectorization is needed before calling fit.
clf.fit(X_train, y_train) 

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [48]:
# accuracy of the classifier on the held-out test set, expressed as a percentage
accuracy_score(y_test, clf.predict(X_test)) * 100

98.5252808988764

In [49]:
# predict the label (1 = spam, 0 = not spam) for each of the sample messages in `emails`
clf.predict(emails)

array([1, 1, 1, 0], dtype=int64)