## Question 4

### Naive Bayes: Perform spam email detection using a Naive Bayes classifier (on a given dataset)

## Importing libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Loading the data

In [2]:
# Read the labelled messages from a CSV file located in the working directory.
df = pd.read_csv('spam.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5572, 2)

## Transforming the text data

In [4]:
# Bag-of-words vectorizer; stop_words='english' makes CountVectorizer drop
# scikit-learn's built-in English stop-word list from the vocabulary.
vectorizer = CountVectorizer(stop_words='english')

In [5]:
# Learn the vocabulary from every message and build the document-term count matrix.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split performed later in this notebook, so the vocabulary also
# contains words that occur only in test messages -- consider fitting on the
# training messages only.
all_features = vectorizer.fit_transform(df.Message)

In [6]:
# Fitted mapping from each term to its column index in the feature matrix.
# NOTE(review): this displays the entire mapping; previewing only a few
# entries would keep the cell output short.
vectorizer.vocabulary_

{'jurong': 4244,
 'point': 5775,
 'crazy': 2278,
 'available': 1281,
 'bugis': 1715,
 'great': 3551,
 'world': 8281,
 'la': 4370,
 'buffet': 1713,
 'cine': 2003,
 'got': 3511,
 'amore': 1061,
 'wat': 8079,
 'ok': 5373,
 'lar': 4406,
 'joking': 4212,
 'wif': 8187,
 'oni': 5399,
 'free': 3276,
 'entry': 2885,
 'wkly': 8239,
 'comp': 2119,
 'win': 8199,
 'fa': 3014,
 'cup': 2337,
 'final': 3131,
 'tkts': 7565,
 '21st': 410,
 '2005': 401,
 'text': 7433,
 '87121': 791,
 'receive': 6154,
 'question': 6048,
 'std': 7073,
 'txt': 7750,
 'rate': 6100,
 'apply': 1138,
 '08452810075over18': 77,
 'dun': 2747,
 'say': 6492,
 'early': 2766,
 'hor': 3836,
 'nah': 5120,
 'don': 2659,
 'think': 7488,
 'goes': 3475,
 'usf': 7888,
 'lives': 4558,
 'freemsg': 3283,
 'hey': 3753,
 'darling': 2394,
 'week': 8125,
 'word': 8272,
 'like': 4508,
 'fun': 3338,
 'tb': 7368,
 'xxx': 8346,
 'chgs': 1957,
 'send': 6580,
 '50': 613,
 'rcv': 6112,
 'brother': 1685,
 'speak': 6955,
 'treat': 7683,
 'aids': 994,
 'pate

In [7]:
# Target column: the ham/spam label attached to each message.
Y = df['Category']
Y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

In [8]:
# Distinct label values present in the target column.
Y.unique()

array(['ham', 'spam'], dtype=object)

In [9]:
# Encode the labels numerically: ham -> 0, spam -> 1.
# Map from the original column rather than from Y itself so that re-running
# this cell yields the same result; mapping the already-encoded 0/1 values
# through this dict would turn every label into NaN.
Y = df.Category.map({'ham': 0, 'spam': 1})
Y.head(5)

0    0
1    0
2    1
3    0
4    0
Name: Category, dtype: int64

## Splitting the data

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    Y,
    test_size=0.3,
    random_state=90,
)

In [11]:
# (rows, feature columns) of the training split.
X_train.shape

(3900, 8440)

In [12]:
# (rows, feature columns) of the held-out test split.
X_test.shape

(1672, 8440)

## Training the model

In [13]:
 classifier = MultinomialNB()

In [14]:
# Fit the classifier on the training split only.
classifier.fit(X_train, y_train)

MultinomialNB()

## Evaluating the model

In [15]:
# Compare the predictions with the true labels element-wise and count the matches.
test_predictions = classifier.predict(X_test)
nr_correct = (y_test == test_predictions).sum()
print(f"Number of messages classified correctly: {nr_correct}")

Number of messages classified correctly: 1630


In [16]:
# Every test message that was not classified correctly is a misclassification.
nr_incorrect = len(y_test) - nr_correct
print(f"Number of messages classified incorrectly: {nr_incorrect}")

Number of messages classified incorrectly: 42


## Accuracy, recall, precision, and F1 score of the model

In [17]:
# Fraction of test messages whose predicted label equals the true label.
test_predictions = classifier.predict(X_test)
acc = accuracy_score(y_test, test_predictions)
print(f"Accuracy of the model: {acc}")

Accuracy of the model: 0.9748803827751196


In [18]:
# Predict once and reuse the result for every metric instead of re-running
# inference on the test set for each score.
y_pred = classifier.predict(X_test)

# With scikit-learn's default pos_label=1 these scores describe the class
# encoded as 1 (spam).
re_Score = recall_score(y_test, y_pred)
pre_Score = precision_score(y_test, y_pred)
f1_Score = f1_score(y_test, y_pred)

print(f"Recall Score: {re_Score}")
print(f"Precision Score: {pre_Score}")
print(f"F1 Score: {f1_Score}")

Recall Score: 0.9357798165137615
Precision Score: 0.8793103448275862
F1 Score: 0.9066666666666666


## Example (Prediction)

In [19]:
# Hand-written messages used to spot-check the trained classifier.
example = [
    'get a viagra for free now!',
    'Hello John, How are you?',
    'The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV',
]

In [20]:
# Encode the examples with the already-fitted vectorizer (transform only, no
# re-fitting) so their columns line up with the features the model was trained on.
testMessages = vectorizer.transform(example)

In [21]:
# Predict a class label (0 = ham, 1 = spam) for each encoded example.
testResult = classifier.predict(testMessages)

In [22]:
# Human-readable verdict for each predicted label; labels outside the table
# print nothing.
verdicts = {0: "Not a Scam", 1: "Scam!"}
for label in testResult:
    if label in verdicts:
        print(verdicts[label])

Scam!
Not a Scam
Scam!
