In [37]:
#importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [38]:
# Load the labelled message dataset from its CSV file into a DataFrame.
raw_dataset = pd.read_csv(filepath_or_buffer="../Classification Datasets/spam.csv")
# Preview the first five rows to confirm the columns were parsed as expected.
print(raw_dataset.head(n=5))

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [39]:
# Data analysis: per-column summary (count / unique / top / freq for text columns).
dataset_summary = raw_dataset.describe()
dataset_summary

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [40]:
# Encode the categorical label as a numeric "spam" column (1 = spam, 0 = ham) and
# then drop the original "Category" column.
# The guard keeps the cell safe to re-run: once "Category" has been dropped the
# encoding step is skipped instead of raising KeyError on the missing column.
if "Category" in raw_dataset.columns:
    # Vectorised comparison instead of a per-row Python lambda; the explicit int64
    # cast keeps the label dtype the same on every platform.
    labelled_dataset = raw_dataset.assign(
        spam=raw_dataset["Category"].eq("spam").astype("int64")
    )
    print(labelled_dataset.head())
    # The label is now carried by "spam", so the text category is no longer needed.
    raw_dataset = labelled_dataset.drop(columns=["Category"])
print(raw_dataset.head())

  Category                                            Message  spam
0      ham  Go until jurong point, crazy.. Available only ...     0
1      ham                      Ok lar... Joking wif u oni...     0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3      ham  U dun say so early hor... U c already then say...     0
4      ham  Nah I don't think he goes to usf, he lives aro...     0
                                             Message  spam
0  Go until jurong point, crazy.. Available only ...     0
1                      Ok lar... Joking wif u oni...     0
2  Free entry in 2 a wkly comp to win FA Cup fina...     1
3  U dun say so early hor... U c already then say...     0
4  Nah I don't think he goes to usf, he lives aro...     0


In [41]:
# Preview only the message text column.
message_column = raw_dataset["Message"]
print(message_column.head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object


In [42]:
# Hold out part of the data for evaluation: 70% of the rows go to training and the
# remainder to testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    raw_dataset["Message"],
    raw_dataset["spam"],
    train_size=0.7,
    random_state=3,
)
# Peek at the first training messages and their matching labels.
print(X_train.head())
print(y_train.head())

1455    Can ü all decide faster cos my sis going home ...
3460    Not heard from U4 a while. Call me now am here...
2493    No drama Pls.i have had enough from you and fa...
3378    Yup. Wun believe wat? U really neva c e msg i ...
3826    Hi. I'm always online on yahoo and would like ...
Name: Message, dtype: object
1455    0
3460    1
2493    0
3378    0
3826    0
Name: spam, dtype: int64


In [43]:
# Summarise the training split: message statistics first, then label statistics.
for training_part in (X_train, y_train):
    print(training_part.describe())

count                       3900
unique                      3659
top       Sorry, I'll call later
freq                          24
Name: Message, dtype: object
count    3900.000000
mean        0.132051
std         0.338590
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: spam, dtype: float64


In [44]:
# Turn the message text into numbers: CountVectorizer learns a vocabulary from the
# training messages and encodes each message as a row of token counts.
cv = CountVectorizer()
train_messages = X_train.values
X_transformed_train_dataset = cv.fit_transform(raw_documents=train_messages)
# Show the encoded data as a dense array.
# NOTE(review): toarray() materialises the whole matrix just for display; acceptable
# at this dataset size, but slice first if the corpus grows.
X_transformed_train_dataset.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [45]:
# Shape of the encoded training matrix.
train_matrix_shape = X_transformed_train_dataset.shape
print(train_matrix_shape)

(3900, 7153)


In [48]:
# Train the model: fit a multinomial Naive Bayes classifier on the token counts
# of the training messages and their spam/ham labels.
NB_model = MultinomialNB()
NB_model.fit(X=X_transformed_train_dataset, y=y_train)

In [49]:
# Sanity check: an ordinary conversational message should be classified as ham (0).
ham_message = ["Hi...there. Let's meet today for sure."]
# Reuse the already-fitted vectorizer (transform only) so the columns line up
# with the vocabulary the model was trained on.
transformed_ham_message = cv.transform(raw_documents=ham_message)
NB_model.predict(X=transformed_ham_message)

array([0], dtype=int64)

In [50]:
# Sanity check: a promotional-style message should be classified as spam (1).
spam_message = ["Lucky Draw... click here"]
# Reuse the already-fitted vectorizer (transform only) so the columns line up
# with the vocabulary the model was trained on.
transformed_spam_message = cv.transform(raw_documents=spam_message)
NB_model.predict(X=transformed_spam_message)

array([1], dtype=int64)

In [55]:
# Encode the held-out test messages with the vectorizer fitted on the training
# data (transform only, never re-fit, so the columns match the training matrix).
# This matrix holds features (X), not labels, so it is named accordingly.
X_transformed_test_dataset = cv.transform(X_test.values)
# Backward-compatible alias: later cells still refer to this matrix by its
# original "y_"-prefixed name.
y_transformed_test_dataset = X_transformed_test_dataset
print(X_transformed_test_dataset.shape)
print(X_transformed_test_dataset.toarray())


(1672, 7153)
[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [60]:
# Predict a label for every row of the encoded test matrix.
# The variable name (sic) is kept as-is because the evaluation cell reads it.
perdiction_testdata = NB_model.predict(X=y_transformed_test_dataset)
# Display the shape of the prediction array.
perdiction_testdata.shape

(1672,)

In [65]:
# Performance metric on the test dataset.
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground-truth
# labels go first, predictions second. Accuracy itself is symmetric, so the
# printed value is unaffected, but the correct order matters as soon as an
# asymmetric metric (precision, recall, confusion matrix) is swapped in.
# accuracy_score is imported with the other imports in the first cell.
test_accuracy = accuracy_score(y_test, perdiction_testdata)
print(f"The accuracy of the model on Test Dataset is : {test_accuracy * 100}%")

The accuracy of the model on Test Dataset is : 98.80382775119617%
