Let's start by opening the MailSpamCollection file with the read_csv() function from the pandas package.

In [None]:
import math

import pandas as pd

# Read the tab-separated file. It has no header row, so the two columns
# are named explicitly.
mail_spam = pd.read_csv(
    'dataset2',
    sep='\t',
    header=None,
    names=['Label', 'MAIL'],
)

# Print the (rows, columns) shape, then display the first rows.
print(mail_spam.shape)
mail_spam.head()

In [None]:
# Proportion of each label in the full dataset (normalize=True gives shares, not counts).
mail_spam['Label'].value_counts(normalize=True)

We're now going to split our dataset into a training set and a test set. We'll use 80% of the data for training and the remaining 20% for testing.
  
We'll randomize the entire dataset before splitting to ensure that spam and ham messages are spread properly throughout the dataset.

In [None]:
# Shuffle every row; the fixed random_state makes the shuffle repeatable.
data_randomized = mail_spam.sample(frac=1, random_state=1)

# Row position at which the training portion (80% of the rows) ends.
training_test_index = round(len(data_randomized) * 0.8)

# Positional split; each part is renumbered from zero.
training_set = data_randomized.iloc[:training_test_index].reset_index(drop=True)
test_set = data_randomized.iloc[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

In [None]:
# Proportion of each label in the training split.
training_set['Label'].value_counts(normalize=True)

In [None]:
# Proportion of each label in the test split.
test_set['Label'].value_counts(normalize=True)

Let's begin the data cleaning process by removing the punctuation and making all the words lowercase.

In [None]:
# Before cleaning: display the first three training rows.
training_set.head(3)

In [None]:
# After cleaning
# regex=True is required: pandas 2.x defaults to regex=False, which would
# treat the pattern as a literal backslash-W string and leave all punctuation
# in place. The raw string avoids Python's invalid-escape-sequence warning.
training_set['MAIL'] = (
    training_set['MAIL']
    .str.replace(r'\W', ' ', regex=True)  # every non-word character becomes a space
    .str.lower()
)
training_set.head(3)

Let's now create the vocabulary, which in this context means a list with all the unique words in our training set.

In [None]:
# Tokenize each message on whitespace; the column now holds lists of words.
training_set['MAIL'] = training_set['MAIL'].str.split()

# Every distinct word that appears in any training message.
vocabulary = list({word for mail in training_set['MAIL'] for word in mail})

In [None]:
# Number of unique words in the training vocabulary.
len(vocabulary)

Creating clean training data set.


In [None]:
# One zero-filled counter list per vocabulary word, with one slot per training mail.
n_training_mails = len(training_set['MAIL'])
word_counts_per_mail = {}
for unique_word in vocabulary:
    word_counts_per_mail[unique_word] = [0] * n_training_mails

# Tally each word occurrence in the slot of the mail it came from.
for mail_position, mail_words in enumerate(training_set['MAIL']):
    for mail_word in mail_words:
        word_counts_per_mail[mail_word][mail_position] += 1

In [None]:
# One column per vocabulary word, one row per training mail.
word_counts = pd.DataFrame.from_dict(word_counts_per_mail)
word_counts.head()

In [None]:
# Place the per-word count columns alongside the Label / MAIL columns.
training_set_clean = pd.concat(
    [training_set, word_counts],
    axis='columns',
)
training_set_clean.head()

Calculating Constants First

In [None]:
# Split the cleaned training rows by label.
is_spam_row = training_set_clean['Label'] == 'spam'
is_ham_row = training_set_clean['Label'] == 'ham'
spam_messages = training_set_clean[is_spam_row]
ham_messages = training_set_clean[is_ham_row]

# P(Spam) and P(Ham): share of training rows carrying each label.
n_training_rows = len(training_set_clean)
p_spam = len(spam_messages) / n_training_rows
p_ham = len(ham_messages) / n_training_rows

# N_Spam: total number of words over all spam messages.
n_words_per_spam_message = spam_messages['MAIL'].map(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham: total number of words over all ham messages.
n_words_per_ham_message = ham_messages['MAIL'].map(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary: number of distinct training words.
n_vocabulary = len(vocabulary)

# Laplace smoothing constant.
alpha = 1

Calculating Parameters

In [None]:
# The smoothed denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha*n_vocabulary
ham_denominator = n_ham + alpha*n_vocabulary

# P(word | Spam) and P(word | Ham) for every vocabulary word.
parameters_spam = {}
parameters_ham = {}

for word in vocabulary:
    # Occurrences of the word across all spam messages.
    n_word_given_spam = spam_messages[word].sum()
    parameters_spam[word] = (n_word_given_spam + alpha) / spam_denominator

    # Occurrences of the word across all ham messages.
    n_word_given_ham = ham_messages[word].sum()
    parameters_ham[word] = (n_word_given_ham + alpha) / ham_denominator

Creating a function to classify a new message

In [None]:
import re

def classify(message):
    """Print the naive Bayes scores for ``message`` and the resulting label.

    The message has every non-word character replaced by a space, is
    lower-cased and split on whitespace. Words that are missing from
    ``parameters_spam`` / ``parameters_ham`` are skipped.

    Scores are accumulated as sums of logarithms rather than as a product of
    probabilities: multiplying many small probabilities underflows to 0.0 for
    long messages, which made both scores tie and hid the real label.

    Reads the module-level ``p_spam``, ``p_ham``, ``parameters_spam`` and
    ``parameters_ham``.

    Parameters
    ----------
    message : str
        Raw text to classify.

    Returns
    -------
    None
        The two scores and the label are printed. The printed probabilities
        are ``exp`` of the log scores and can still display as 0.0 for very
        long messages; the label is decided from the log scores.
    """
    def _log(probability):
        # math.log rejects 0, so map a zero probability to -inf explicitly.
        return math.log(probability) if probability > 0 else -math.inf

    message = re.sub(r'\W', ' ', message)
    message = message.lower().split()

    log_spam_given_message = _log(p_spam)
    log_ham_given_message = _log(p_ham)

    for word in message:
        if word in parameters_spam:
            log_spam_given_message += _log(parameters_spam[word])

        if word in parameters_ham:
            log_ham_given_message += _log(parameters_ham[word])

    print('P(Spam|message):', math.exp(log_spam_given_message))
    print('P(Ham|message):', math.exp(log_ham_given_message))

    if log_ham_given_message > log_spam_given_message:
        print('Label: Ham')
    elif log_ham_given_message < log_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [None]:
# Sample message: classify() prints both scores and the chosen label.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

In [None]:
# Second sample message: classify() prints both scores and the chosen label.
classify("Sounds good, Tom, then see u there")

Measuring the Spam Filter's Accuracy

In [None]:
def classify_test_set(message):
    """Return the naive Bayes label for ``message``.

    The message has every non-word character replaced by a space, is
    lower-cased and split on whitespace. Words that are missing from
    ``parameters_spam`` / ``parameters_ham`` are skipped.

    Scores are accumulated as sums of logarithms rather than as a product of
    probabilities: multiplying many small probabilities underflows to 0.0 for
    long messages, which turned them into spurious ties.

    Reads the module-level ``p_spam``, ``p_ham``, ``parameters_spam`` and
    ``parameters_ham``.

    Parameters
    ----------
    message : str
        Raw text to classify.

    Returns
    -------
    str
        ``'ham'``, ``'spam'``, or ``'needs human classification'`` when the
        two scores are equal.
    """
    def _log(probability):
        # math.log rejects 0, so map a zero probability to -inf explicitly.
        return math.log(probability) if probability > 0 else -math.inf

    message = re.sub(r'\W', ' ', message)
    message = message.lower().split()

    log_spam_given_message = _log(p_spam)
    log_ham_given_message = _log(p_ham)

    for word in message:
        if word in parameters_spam:
            log_spam_given_message += _log(parameters_spam[word])

        if word in parameters_ham:
            log_ham_given_message += _log(parameters_ham[word])

    if log_ham_given_message > log_spam_given_message:
        return 'ham'
    elif log_spam_given_message > log_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [None]:
# Label every test message with the classifier and store the result in a new column.
test_set['predicted'] = test_set['MAIL'].map(classify_test_set)
test_set.head()

In [None]:
# Count the rows whose predicted label equals the true label.
total = len(test_set)
label_matches = test_set['Label'] == test_set['predicted']
correct = int(label_matches.sum())  # plain int, as a hand-rolled counter would be

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Fetching mails from the user's inbox.

In [None]:
import easyimap as e
import getpass
from bs4 import BeautifulSoup

# Plain-text body of every fetched mail, in the order the server lists them.
mail_bodies = []
password = getpass.getpass("Enter the password for your email: ")
server = e.connect("imap.gmail.com","your email",password)
try:
    # Ask for the id list once instead of calling listids() twice per iteration.
    for mail_id in server.listids():
        fetched_mail = server.mail(mail_id)
        # Strip HTML markup so that only the text reaches the classifier.
        cleantext = BeautifulSoup(fetched_mail.body, "lxml").text
        mail_bodies.append(cleantext)
finally:
    # Close the connection even if fetching or parsing a mail fails.
    server.quit()

Classifying each mail as Spam/Ham

In [None]:
# Print each fetched mail between START/END banners, followed by the classifier output.
start_banner = "<-----------------------------------------------START----------------------------------------------->\n"
end_banner = "<------------------------------------------------END------------------------------------------------>\n"
result_heading = "\n\t\t\t\t\t::::Result::::"

for mail_body in mail_bodies:
    print(start_banner)
    print(mail_body + result_heading)
    classify(mail_body)
    print(end_banner)