# Classification of messages as spam or not spam using the Naive Bayes algorithm

In [78]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [79]:
# Load the tab-separated SMS corpus; the file has no header row, so the column names are supplied here
df = pd.read_table(
    'SMS.txt',
    sep='\t',
    names=['label', 'sms_message'],
    header=None,
)
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [80]:
# Encode the text labels numerically: 'ham' becomes 0 and 'spam' becomes 1
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

In [81]:
# Shuffle all rows (frac=1 keeps every row); the fixed random_state makes the order repeatable
df = df.sample(random_state=1, frac=1)
df

Unnamed: 0,label,sms_message
1078,0,"Yep, by the pretty sculpture"
4028,0,"Yes, princess. Are you going to make me moan?"
958,0,Welp apparently he retired
4642,0,Havent.
4674,0,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
905,0,"We're all getting worried over here, derek and..."
5192,0,Oh oh... Den muz change plan liao... Go back h...
3980,0,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,1,Text & meet someone sexy today. U can find a d...


In [82]:
# Hold out the tail of the shuffled frame for testing: first 80% of rows -> training, rest -> test
training_test_index = round(0.8 * len(df))
training = df.iloc[:training_test_index].reset_index(drop=True)
test = df.iloc[training_test_index:].reset_index(drop=True)
print(training.shape, test.shape, sep='\n')

(4458, 2)
(1114, 2)


In [83]:
# Data cleaning: replace every non-word character with a space, then lowercase the text.
# The pattern is written as a raw string so the backslash reaches the regex engine as-is;
# a plain '\W' literal is an invalid escape sequence and triggers a SyntaxWarning on
# recent Python versions.
training['sms_message'] = training['sms_message'].astype(str).str.replace(r'\W', ' ', regex=True).str.lower()
test['sms_message'] = test['sms_message'].astype(str).str.replace(r'\W', ' ', regex=True).str.lower()
training

Unnamed: 0,label,sms_message
0,0,yep by the pretty sculpture
1,0,yes princess are you going to make me moan
2,0,welp apparently he retired
3,0,havent
4,0,i forgot 2 ask ü all smth there s a card on ...
...,...,...
4453,0,sorry i ll call later in meeting any thing re...
4454,0,babe i fucking love you too you know fuck...
4455,1,u ve been selected to stay in 1 of 250 top bri...
4456,0,hello my boytoy geeee i miss you already a...


In [84]:
### Creating the vocabulary from the training data
# Tokenise each cleaned message on whitespace; the column holds lists of words afterwards
training['sms_message'] = training['sms_message'].str.split()
# Collect the unique words and sort them: a bare list(set(...)) has no guaranteed order for
# strings across interpreter runs, which would make the word-column order change between
# kernel restarts. Sorting keeps the vocabulary (and everything built from it) reproducible.
vocabulary = sorted({word for sms in training['sms_message'] for word in sms})
print(len(vocabulary))

7783


In [85]:
# Build, for every vocabulary word, a list holding how often it occurs in each training message
n_messages = len(training['sms_message'])
word_counts_per_sms = {token: [0] * n_messages for token in vocabulary}
for row, tokens in enumerate(training['sms_message']):
    for token in tokens:
        # One more occurrence of `token` in message number `row`
        word_counts_per_sms[token][row] += 1

In [86]:
# Turn the per-word count lists into a frame (one column per word) and append those columns
# to the right of the training set
word_counts = pd.DataFrame(data=word_counts_per_sms)
training_new = pd.concat([training, word_counts], axis='columns')

# You will start from here.

In [87]:
# Naive Bayes calculations
alpha = 1  # Laplace smoothing
p_spam = training_new['label'].mean()  # Prior P(spam): labels are 0/1, so the mean is the spam fraction
p_ham = 1 - p_spam  # Prior P(ham)
# The word-count columns start at position 2: positions 0 and 1 hold 'label' and 'sms_message'.
# Slicing from position 3 would silently leave the first vocabulary word out of both totals.
N_Spam = training_new[training_new['label'] == 1].iloc[:, 2:].sum().sum()  # Total word count in spam messages
N_Ham = training_new[training_new['label'] == 0].iloc[:, 2:].sum().sum()  # Total word count in ham messages
N_Vocabulary = len(vocabulary)  # Number of unique words in the vocabulary

In [88]:
# Calculating the probability of each word in spam and ham messages
# Sum every word column once per class instead of filtering the frame again for each word;
# the resulting Series are indexed by word, in vocabulary order.
spam_word_totals = training_new.loc[training_new['label'] == 1, vocabulary].sum()
ham_word_totals = training_new.loc[training_new['label'] == 0, vocabulary].sum()
# Laplace-smoothed estimate: (occurrences of the word in the class + alpha)
# divided by (total words in the class + alpha * vocabulary size)
p_w_spam = {unique_word: (count + alpha) / (N_Spam + alpha * N_Vocabulary) for unique_word, count in spam_word_totals.items()}
p_w_ham = {unique_word: (count + alpha) / (N_Ham + alpha * N_Vocabulary) for unique_word, count in ham_word_totals.items()}

In [89]:
# Function to classify a message as spam or ham
def classify_message(message, p_w_spam, p_w_ham, p_spam, p_ham):
    """Label a message as spam (1) or ham (0) by comparing Naive Bayes log-scores.

    Parameters
    ----------
    message : str
        Message text; it is split on whitespace into words.
    p_w_spam, p_w_ham : mapping of str to float
        Per-word probabilities for the spam and ham classes.
    p_spam, p_ham : float
        Prior probabilities of the spam and ham classes.

    Returns
    -------
    int
        1 when the spam score is strictly greater than the ham score,
        otherwise 0 (ties are labelled ham).
    """
    # Work in log space: summing logs avoids multiplying many small probabilities together
    log_score_spam = np.log(p_spam)
    log_score_ham = np.log(p_ham)

    for word in message.split():
        # A word contributes only when both tables know it. Checking the tables
        # independently would add a (negative) log term to just one class whenever
        # the tables differ, tilting the comparison against that class.
        if word in p_w_spam and word in p_w_ham:
            log_score_spam += np.log(p_w_spam[word])
            log_score_ham += np.log(p_w_ham[word])

    return 1 if log_score_spam > log_score_ham else 0

In [90]:
# Predict a label for every test message; the probability tables and priors are forwarded
# to classify_message as extra positional arguments after the message itself
test['predicted'] = test['sms_message'].apply(
    classify_message, args=(p_w_spam, p_w_ham, p_spam, p_ham)
)

## Calculate accuracy, precision, recall, and F1 score.

In [91]:
# Score the hand-written classifier: each metric compares the true test labels with its predictions
print('Manual Naive Bayes Classifier:')
for metric_label, metric_fn in (
    ('Accuracy score:', accuracy_score),
    ('Precision score:', precision_score),
    ('Recall score:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(test['label'], test['predicted']))

Manual Naive Bayes Classifier:
Accuracy score: 0.9883303411131059
Precision score: 0.9652777777777778
Recall score: 0.9455782312925171
F1 score: 0.9553264604810997


### Now use the `MultinomialNB` class (`from sklearn.naive_bayes import MultinomialNB`) to validate your results and check the accuracy

In [92]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Rebuild the same shuffled 80/20 split straight from the file, leaving the message text uncleaned.
# Note: this reassigns df, training and test.
df = pd.read_table(
    'SMS.txt',
    sep='\t',
    names=['label', 'sms_message'],
    header=None,
)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})
df = df.sample(random_state=1, frac=1)

training_test_index = round(0.8 * len(df))
training = df.iloc[:training_test_index].reset_index(drop=True)
test = df.iloc[training_test_index:].reset_index(drop=True)

# Bag-of-words features: the token vocabulary is learned from the training messages,
# and the test messages are then encoded with that same vocabulary
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(training['sms_message'])
X_test = vectorizer.transform(test['sms_message'])

# Fit multinomial Naive Bayes on the raw-text token counts (fit returns the estimator itself)
clf = MultinomialNB().fit(X_train, training['label'])

# Store the model's predictions next to the true labels of the test set
test['predicted_mnb'] = clf.predict(X_test)

# Score the scikit-learn model with the same four metrics used for the manual classifier
print('\nMultinomialNB Classifier on Raw Data:')
y_true, y_pred = test['label'], test['predicted_mnb']
print('Accuracy score:', accuracy_score(y_true, y_pred))
# zero_division=1 sets the value reported when a metric's denominator is zero
print('Precision score:', precision_score(y_true, y_pred, zero_division=1))
print('Recall score:', recall_score(y_true, y_pred, zero_division=1))
print('F1 score:', f1_score(y_true, y_pred, zero_division=1))



MultinomialNB Classifier on Raw Data:
Accuracy score: 0.9838420107719928
Precision score: 0.9448275862068966
Recall score: 0.9319727891156463
F1 score: 0.9383561643835617
