<a href="https://colab.research.google.com/github/Parth722/spam_classifier/blob/main/spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Spam Filter By Naive Bayes Algorithm

Importing pandas

In [1]:
import pandas as pd

In [4]:
sms_spam = pd.read_csv('sample_data/SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

In [6]:
print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
#Randomizing the dataset
sms_spam_randomized = sms_spam.sample(frac= 1, random_state= 1)

training_set_index = round(len(sms_spam_randomized) * 0.8)

training_set = sms_spam_randomized[:training_set_index].reset_index(drop=True)
test_set =  sms_spam_randomized[training_set_index:].reset_index(drop=True)

print(training_set.shape, test_set.shape)

(4458, 2) (1114, 2)


In [11]:
#checking percentage of spams and normal messages in training and test set
training_set['Label'].value_counts(normalize=True)

test_set['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

Data formatting and cleaning

In [12]:
#Before cleaning and formatting
training_set.head(4)

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.


In [13]:
#Removing punctuations and converting to lower case.

training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()

training_set.head(4)

  after removing the cwd from sys.path.


Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent


In [14]:
#Creating vocabulary for all unique words
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []

for sms in training_set['SMS']:
  for word in sms:
    vocabulary.append(word)

vocabulary = list(set(vocabulary))



In [15]:
len(vocabulary)

7783

In [25]:
word_count_per_sms = {unique_word: [0] * training_set.shape[0] for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
  for word in sms:
    word_count_per_sms[word][index] += 1



In [26]:
word_counts = pd.DataFrame(word_count_per_sms)
word_counts.head()

Unnamed: 0,shinco,better,gimmi,noon,mom,08715203652,tiz,01223585334,good,hurt,...,rem,harri,get4an18th,embarassing,busy,best,census,sold,happy,vivekanand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,shinco,better,gimmi,noon,mom,08715203652,tiz,01223585334,...,rem,harri,get4an18th,embarassing,busy,best,census,sold,happy,vivekanand
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Parameters for bayes rule like P(spam), P(ham) and so on

In [28]:
#Seperating spam and normal messages
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

#P(spam) and P(ham)
p_spam = len(spam_messages)/len(training_set_clean)
p_ham = len(ham_messages)/len(training_set_clean)

#N_spam(total no. of words in spam)
n_words_per_spam = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam.sum()

#N_ham(total no. of words in non spam)
n_words_per_ham = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham.sum()

n_vocab = len(vocabulary)

alpha = 1 #(parameter)

In [29]:
#Initiating Parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

for word in vocabulary:
  n_word_given_spam = spam_messages[word].sum()
  p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocab)
  parameters_spam[word] = p_word_given_spam

  n_word_given_ham = ham_messages[word].sum()
  p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * n_vocab)
  parameters_ham[word] = p_word_given_ham


In [34]:
import re

def classify(message):
  """
  message: string
  """

  message = re.sub('\W', ' ', message)
  message = message.lower().split()

  #inititializing probabilities 
  p_spam_given_message = p_spam
  p_ham_given_message = p_ham

  for word in message:
    if word in parameters_spam:
      p_spam_given_message *= parameters_spam[word]
    
    if word in parameters_ham: 
      p_ham_given_message *= parameters_ham[word]
  

  if p_ham_given_message > p_spam_given_message:
    return 'ham'
  elif p_ham_given_message < p_spam_given_message:
    return 'spam'
  else:
    return 'need human classification'

In [39]:
#Running classify on test set
test_set['Predicted'] = test_set['SMS'].apply(classify)

accuracy = len(test_set[test_set['Label'] == test_set['Predicted']]) / len(test_set)
print(accuracy)

0.9874326750448833


### The naive bayse spam classifer classifies test data with 98.74% accuracy.