**Build a machine learning model that predicts whether an email is spam or ham**

In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the limit).
# Uses the public pd.set_option entry point rather than the accidental
# pd.pandas self-reference.
pd.set_option("display.max_columns", None)

In [2]:
# Read the raw dataset, decoding the file as latin1.
df = pd.read_csv("spam.csv", encoding="latin1")

# Two columns carry the data of interest:
#   v1 - the label, either "spam" or "ham"
#   v2 - the text of the message
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Discard the unnamed filler columns so that only v1 (label) and v2 (text) remain.
# errors="ignore" keeps the cell safe to re-run: on a second run the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [31]:
# Summarize the DataFrame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# Print a handful of sample messages to get a feel for the raw text.
for row_label in (0, 30, 500):
    print(df["v2"][row_label])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Wait that's still not all that clear, were you not sure about me being sarcastic or that that's why x doesn't want to live with us
Fighting with the world is easy, u either win or lose bt fightng with some1 who is close to u is dificult if u lose - u lose if u win - u still lose.


In [5]:
# Data cleaning and preprocessing

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [1]:
# Requires the NLTK "stopwords" and "wordnet" corpora to be available locally.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def clean_message(text):
    """Normalize one raw message into a space-separated string of lemmas.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, and lemmatize what is left.

    Args:
        text: Raw message text (str).

    Returns:
        The cleaned message as a single string; empty if no token survives.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Iterate over the column values directly rather than looking each row up by
# label (df["v2"][i]); this does not depend on the index being 0..n-1 and the
# helper can be reused to preprocess new, unseen messages.
corpus = [clean_message(message) for message in df["v2"]]

NameError: name 'WordNetLemmatizer' is not defined

In [39]:
# Preview the first five entries of the cleaned corpus.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [40]:
# Bag-of-words features: one row per message, one column per vocabulary term.
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at the 2500 most frequent terms in the corpus.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split, so test-set words influence which features are kept.
cv = CountVectorizer(max_features=2500)
X_sparse = cv.fit_transform(corpus)
X = X_sparse.toarray()

In [42]:
# Dimensions of the bag-of-words matrix: (number of messages, vocabulary size).
X.shape

(5572, 2500)

In [55]:
# One-hot encode the label column. dtype=int keeps the indicators as 0/1 on
# every pandas version (pandas >= 2.0 would otherwise return booleans).
label_dummies = pd.get_dummies(df["v1"], dtype=int)
print(label_dummies.head())

# Target vector: 1 = spam, 0 = ham. Selecting the column by name instead of by
# position keeps the encoding correct regardless of the dummy column order.
y = label_dummies["spam"].values

   ham  spam
0    1     0
1    1     0
2    0     1
3    1     0
4    1     0


In [57]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [58]:
# Fit a multinomial Naive Bayes classifier on the training split.

from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [59]:
# Predict labels for the held-out test split with the fitted model.
y_pred = spam_detect_model.predict(X_test)

In [62]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Compute both metrics first, then report them in the same order as before.
# In the confusion matrix, rows are true labels and columns are predictions.
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(conf_mat)

print(accuracy)


[[957   8]
 [ 10 140]]
0.9838565022421525
