In [None]:
from google.colab import drive
# Mount Google Drive so the dataset CSVs under /content/drive are readable.
# force_remount is a boolean flag: pass a real bool rather than the string
# 'True', which only worked because any non-empty string is truthy.
drive.mount('/content/drive', force_remount=True)
import pandas as pd
import math
import re

def vocab(sms_spam, sms_spam_test):
  """Clean the training and test messages and build the training vocabulary.

  In both frames every non-word character of the 'SMS' column is replaced by
  a space and the text is lower-cased. Training messages are additionally
  split into lists of words. Missing messages (NaN/None) are treated as empty
  text. The caller's frames are left untouched; cleaned copies are returned,
  so the function can safely be called again on the same inputs.

  Args:
    sms_spam: training DataFrame with an 'SMS' column of raw message text.
    sms_spam_test: test DataFrame with an 'SMS' column of raw message text.

  Returns:
    A tuple (train, test, vocabulary) where train['SMS'] holds word lists,
    test['SMS'] holds cleaned (unsplit) strings, and vocabulary is the sorted
    list of unique words found in the training messages.
  """
  def _normalise(messages):
    # Cast to object first so an all-NaN (float) column can hold the '' filler;
    # the raw-string pattern avoids the invalid '\W' escape in a plain literal.
    text = messages.astype(object).fillna('').astype(str)
    return text.str.replace(r'\W', ' ', regex=True).str.lower()

  # Work on copies so the frames passed in by the caller are not mutated.
  sms_spam = sms_spam.copy()
  sms_spam_test = sms_spam_test.copy()

  sms_spam['SMS'] = _normalise(sms_spam['SMS']).str.split()  # word lists
  sms_spam_test['SMS'] = _normalise(sms_spam_test['SMS'])    # cleaned strings

  # Sorting makes the vocabulary (and anything derived from its order)
  # reproducible from run to run; a bare set has arbitrary order.
  vocabulary = sorted({word for sms in sms_spam['SMS'] for word in sms})
  return sms_spam, sms_spam_test, vocabulary

def bernoulli(sms_spam, vocabulary):
  """Append one word-count column per vocabulary word to the training frame.

  Despite the name, the new columns hold occurrence counts (a word appearing
  twice in a message is counted twice), not 0/1 presence flags.

  Args:
    sms_spam: DataFrame whose 'SMS' column holds a list of words per message.
    vocabulary: iterable of unique words; every word occurring in the
      messages must be present, otherwise a KeyError is raised.

  Returns:
    A new DataFrame: the original columns followed by one integer column per
    vocabulary word, with the same rows and index as ``sms_spam``.
  """
  n_messages = len(sms_spam)
  word_counts_per_sms = {unique_word: [0] * n_messages for unique_word in vocabulary}

  for position, sms in enumerate(sms_spam['SMS']):
    for word in sms:
      word_counts_per_sms[word][position] += 1

  # Build the counts on the input's own index: concat(axis=1) aligns on index
  # labels, so a default 0..n-1 index would misalign rows (and add NaN rows)
  # whenever the input frame has been filtered, shuffled or re-indexed.
  word_counts = pd.DataFrame(word_counts_per_sms, index=sms_spam.index)
  return pd.concat([sms_spam, word_counts], axis=1)

def predict(sms_spam_clean, vocabulary):
  """Fit the classifier: class priors and smoothed per-word log-likelihoods.

  Args:
    sms_spam_clean: DataFrame with a 'Label' column ('spam'/'ham'), an 'SMS'
      column of word lists, and one count column per vocabulary word.
    vocabulary: list of the words to estimate parameters for.

  Returns:
    (p_ham, p_spam, parameters_ham, parameters_spam). The two priors are plain
    probabilities (class rows / all rows); the two dicts map each vocabulary
    word to the natural log of its add-one-smoothed probability in that class.
  """
  smoothing = 1  # add-one (Laplace) smoothing
  vocabulary_size = len(vocabulary)
  n_rows = len(sms_spam_clean)

  def fit_class(label):
    # Prior and log P(word | label) for every vocabulary word of one class.
    rows = sms_spam_clean[sms_spam_clean['Label'] == label]
    prior = len(rows) / n_rows
    # Total number of words over all messages of this class.
    words_in_class = rows['SMS'].apply(len).sum()
    denominator = words_in_class + smoothing * vocabulary_size
    log_likelihood = {
        word: math.log((rows[word].sum() + smoothing) / denominator)
        for word in vocabulary
    }
    return prior, log_likelihood

  p_spam, parameters_spam = fit_class('spam')
  p_ham, parameters_ham = fit_class('ham')
  return p_ham, p_spam, parameters_ham, parameters_spam

# Classify a single raw message as 'ham' or 'spam'; model() applies this to every test-set row
def classify_test_set(message, p_ham, p_spam, parameters_ham, parameters_spam):
  """Classify one message by comparing its log-posterior under each class.

  The message is cleaned (non-word characters become spaces, lower-cased,
  split on whitespace) and each class score is computed entirely in log
  space: log(prior) plus the log-likelihood of every known word. Words absent
  from a parameter dict are ignored for that class.

  Args:
    message: raw message text; None/NaN is treated as an empty message.
    p_ham: prior probability of ham (a plain probability, not a log).
    p_spam: prior probability of spam (a plain probability, not a log).
    parameters_ham: dict mapping word -> log P(word | ham).
    parameters_spam: dict mapping word -> log P(word | spam).

  Returns:
    'ham', 'spam', or 'Equal Probability' when neither score is larger.
  """
  # A missing message (e.g. an empty CSV field read as NaN) is scored as
  # empty text instead of crashing inside re.sub.
  if message is None or (isinstance(message, float) and math.isnan(message)):
    message = ''
  words = re.sub(r'\W', ' ', str(message)).lower().split()

  def _log_prior(probability):
    # The word parameters are logs, so the prior must be a log too; a class
    # with zero prior can never win, which -inf expresses without log(0).
    return math.log(probability) if probability > 0 else float('-inf')

  log_p_spam_given_message = _log_prior(p_spam)
  log_p_ham_given_message = _log_prior(p_ham)

  for word in words:
    if word in parameters_spam:
      log_p_spam_given_message += parameters_spam[word]
    if word in parameters_ham:
      log_p_ham_given_message += parameters_ham[word]

  if log_p_ham_given_message > log_p_spam_given_message:
    return 'ham'
  if log_p_spam_given_message > log_p_ham_given_message:
    return 'spam'
  return 'Equal Probability'

def metric(sms_spam_test):
  """Print and return accuracy, precision, recall and F1 with 'spam' as positive.

  Confusion-matrix cells:
    tp: labelled spam, predicted spam
    tn: labelled ham,  predicted ham
    fp: labelled ham,  predicted spam   (ham wrongly flagged)
    fn: labelled spam, not predicted spam (spam missed)
  A ham row predicted as anything other than 'ham' or 'spam' (e.g. a tie)
  lowers accuracy but is not a false positive, since it was not flagged.

  Args:
    sms_spam_test: DataFrame with 'Label' (true class) and 'predicted' columns.

  Returns:
    dict with keys 'accuracy', 'precision', 'recall', 'f1_score'. A ratio
    whose denominator is zero is reported as 0.0 instead of raising.
  """
  actual = sms_spam_test['Label']
  predicted = sms_spam_test['predicted']
  total = sms_spam_test.shape[0]

  tp = int(((actual == 'spam') & (predicted == 'spam')).sum())
  tn = int(((actual == 'ham') & (predicted == 'ham')).sum())
  fp = int(((actual == 'ham') & (predicted == 'spam')).sum())
  fn = int(((actual == 'spam') & (predicted != 'spam')).sum())

  def _ratio(numerator, denominator):
    # Guard the empty cases (no rows, no spam predicted, no spam present).
    return numerator / denominator if denominator else 0.0

  accuracy = _ratio(tp + tn, total)
  precision = _ratio(tp, tp + fp)
  recall = _ratio(tp, tp + fn)
  f1_score = _ratio(2 * precision * recall, precision + recall)

  print('Accuracy:', accuracy)
  print('Precision:', precision)
  print('Recall:', recall)
  print('F1 Score:', f1_score)

  return {
      'accuracy': accuracy,
      'precision': precision,
      'recall': recall,
      'f1_score': f1_score,
  }

def model(sms_spam, sms_spam_test):
  """Run the full pipeline on one train/test pair and print the test metrics.

  Steps: clean both frames and build the vocabulary (vocab), add per-word
  count columns to the training frame (bernoulli), fit priors and word
  parameters (predict), store a 'predicted' column on the test frame returned
  by vocab (classify_test_set applied to each message), then report (metric).
  """
  train_frame, test_frame, vocabulary = vocab(sms_spam, sms_spam_test)
  train_with_counts = bernoulli(train_frame, vocabulary)

  # Fitted values in the positional order classify_test_set expects after
  # the message itself: p_ham, p_spam, parameters_ham, parameters_spam.
  p_ham, p_spam, parameters_ham, parameters_spam = predict(train_with_counts, vocabulary)
  fitted = (p_ham, p_spam, parameters_ham, parameters_spam)

  test_frame['predicted'] = test_frame['SMS'].apply(classify_test_set, args=fitted)
  metric(test_frame)

Mounted at /content/drive


In [None]:
# Enron1 dataset: read the header-less train/test CSVs as (Label, SMS) frames
# and run the full pipeline on them.
sms_spam, sms_spam_test = (
    pd.read_csv(csv_path, header=None, names=['Label', 'SMS'])
    for csv_path in (
        '/content/drive/MyDrive/Datasets/ML/enron1train.csv',
        '/content/drive/MyDrive/Datasets/ML/enron1test.csv',
    )
)
model(sms_spam, sms_spam_test)

Accuracy: 0.9407894736842105
Precision: 0.8657718120805369
Recall: 0.9485294117647058
F1 Score: 0.9052631578947368


In [None]:
# Enron4 dataset: read the header-less train/test CSVs as (Label, SMS) frames
# and run the full pipeline on them.
sms_spam, sms_spam_test = (
    pd.read_csv(csv_path, header=None, names=['Label', 'SMS'])
    for csv_path in (
        '/content/drive/MyDrive/Datasets/ML/enron4train.csv',
        '/content/drive/MyDrive/Datasets/ML/enron4test.csv',
    )
)
model(sms_spam, sms_spam_test)

Accuracy: 0.9429097605893186
Precision: 0.969309462915601
Recall: 0.9522613065326633
F1 Score: 0.9607097591888467


In [None]:
# HW dataset: read the header-less train/test CSVs as (Label, SMS) frames
# and run the full pipeline on them.
sms_spam, sms_spam_test = (
    pd.read_csv(csv_path, header=None, names=['Label', 'SMS'])
    for csv_path in (
        '/content/drive/MyDrive/Datasets/ML/hwtrain.csv',
        '/content/drive/MyDrive/Datasets/ML/hwtest.csv',
    )
)
model(sms_spam, sms_spam_test)

Accuracy: 0.9414225941422594
Precision: 0.8538461538461538
Recall: 0.925
F1 Score: 0.888
