<a href="https://colab.research.google.com/github/Parth722/spam_classifier/blob/main/spam_classifier_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Spam Filter By Naive Bayes Algorithm

Importing pandas

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

In [6]:
sms_spam = pd.read_csv('sample_data/spam.csv', encoding='latin1')

In [10]:
print(sms_spam.shape)
sms_spam = sms_spam[['v1','v2']]
sms_spam.columns = ['Label', 'SMS']
sms_spam.head()

(5572, 5)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#Randomizing the dataset
sms_spam_randomized = sms_spam.sample(frac= 1, random_state= 1)

training_set_index = round(len(sms_spam_randomized) * 0.8)

training_set = sms_spam_randomized[:training_set_index].reset_index(drop=True)
test_set =  sms_spam_randomized[training_set_index:].reset_index(drop=True)

print(training_set.shape, test_set.shape)

(4458, 2) (1114, 2)


In [12]:
#checking percentage of spams and normal messages in training and test set
training_set['Label'].value_counts(normalize=True)

test_set['Label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
ham,0.873429
spam,0.126571


Data formatting and cleaning

In [13]:
#Before cleaning and formatting
training_set.head(4)

Unnamed: 0,Label,SMS
0,ham,Convey my regards to him
1,ham,"[Û_] anyway, many good evenings to u! s"
2,ham,My sort code is and acc no is . The bank is n...
3,ham,Sorry i din lock my keypad.


In [14]:
#Removing punctuations and converting to lower case.

training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()

training_set.head(4)

Unnamed: 0,Label,SMS
0,ham,convey my regards to him
1,ham,"[û_] anyway, many good evenings to u! s"
2,ham,my sort code is and acc no is . the bank is n...
3,ham,sorry i din lock my keypad.


In [15]:
#Creating vocabulary for all unique words
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []

for sms in training_set['SMS']:
  for word in sms:
    vocabulary.append(word)

vocabulary = list(set(vocabulary))



In [16]:
len(vocabulary)

11757

In [17]:
word_count_per_sms = {unique_word: [0] * training_set.shape[0] for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
  for word in sms:
    word_count_per_sms[word][index] += 1



In [18]:
word_counts = pd.DataFrame(word_count_per_sms)
word_counts.head()

Unnamed: 0,2stop,print,50p,crave,dude!,hands!,shb,tp,"about,",8.30,...,seen.,%.,......forward,havenåõt,irritated,terminated.we,kisses*,zindgi,student,gynae
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,2stop,print,50p,crave,dude!,hands!,shb,tp,...,seen.,%.,......forward,havenåõt,irritated,terminated.we,kisses*,zindgi,student,gynae
0,ham,"[convey, my, regards, to, him]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[[û_], anyway,, many, good, evenings, to, u!, s]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[my, sort, code, is, and, acc, no, is, ., the,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[sorry, i, din, lock, my, keypad.]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,spam,"[hi, babe, its, chloe,, how, r, u?, i, was, sm...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Parameters for bayes rule like P(spam), P(ham) and so on

In [20]:
#Seperating spam and normal messages
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

#P(spam) and P(ham)
p_spam = len(spam_messages)/len(training_set_clean)
p_ham = len(ham_messages)/len(training_set_clean)

#N_spam(total no. of words in spam)
n_words_per_spam = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam.sum()

#N_ham(total no. of words in non spam)
n_words_per_ham = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham.sum()

n_vocab = len(vocabulary)

alpha = 1 #(parameter)

In [21]:
#Initiating Parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

for word in vocabulary:
  n_word_given_spam = spam_messages[word].sum()
  p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocab)
  parameters_spam[word] = p_word_given_spam

  n_word_given_ham = ham_messages[word].sum()
  p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * n_vocab)
  parameters_ham[word] = p_word_given_ham


In [22]:
import re

def classify(message):
  """
  message: string
  """

  message = re.sub('\W', ' ', message)
  message = message.lower().split()

  #inititializing probabilities
  p_spam_given_message = p_spam
  p_ham_given_message = p_ham

  for word in message:
    if word in parameters_spam:
      p_spam_given_message *= parameters_spam[word]

    if word in parameters_ham:
      p_ham_given_message *= parameters_ham[word]


  if p_ham_given_message > p_spam_given_message:
    return 'ham'
  elif p_ham_given_message < p_spam_given_message:
    return 'spam'
  else:
    return 'need human classification'

In [23]:
#Running classify on test set
test_set['Predicted'] = test_set['SMS'].apply(classify)

accuracy = len(test_set[test_set['Label'] == test_set['Predicted']]) / len(test_set)
print(accuracy)

0.9919210053859964


### The naive bayse spam classifer classifies test data with 98.74% accuracy.