In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [7]:
df = pd.read_csv("/content/spam.csv")
display(df.head())

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.shape

(5572, 2)

In [9]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [10]:
# Create a new column 'spam' that is the label encode (numerical representation) of 'Category'
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
df

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [32]:
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.Spam, test_size=0.25)

In [33]:
X_train.describe()

count                       4179
unique                      3942
top       Sorry, I'll call later
freq                          21
Name: Message, dtype: object

In [34]:
# Calculate word count and create a matrix
count_vectorizer = CountVectorizer()
X_train_cv = count_vectorizer.fit_transform(X_train.values)

In [35]:
X_train_cv

<4179x7487 sparse matrix of type '<class 'numpy.int64'>'
	with 55846 stored elements in Compressed Sparse Row format>

In [36]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
# 7451 unique words present in 4179(x_train) mails and are 
# vectorized as numerical values

# Train model
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [38]:
# Pre-test spam and ham
ham_email = ["Hey, How are you doing? Want to catch up this weeking for hiking"]
ham_email_cv = count_vectorizer.transform(ham_email)
print(model.predict(ham_email_cv))

spam_email = ["Play this game and earn 200€ per hour"]
spam_email_cv = count_vectorizer.transform(spam_email)
print(model.predict(spam_email_cv))


[0]
[1]


In [39]:
# Evaluate model with test data
X_test_cv = count_vectorizer.transform(X_test)
model.score(X_test_cv, y_test)

0.9863603732950467