In [1]:
import pandas as pd 
from sklearn import svm 
from sklearn.metrics import classification_report
import joblib
import re 

In [2]:
df = pd.read_csv('spam_ham_dataset.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [3]:
def remove_subject_line(text):
    return '\r\n'.join(line for line in text.split('\r\n') if not line.startswith('Subject:'))

In [4]:
df = df.rename(columns={"text": "INFO", "label": "LABEL"})

In [5]:
df['INFO'] = df['INFO'].apply(remove_subject_line)

In [6]:
df['LABEL'].value_counts(normalize=True)

LABEL
ham     0.710114
spam    0.289886
Name: proportion, dtype: float64

In [7]:
# Randomize the dataset
data_randomized = df.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4137, 3)
(1034, 3)


In [8]:
training_set['LABEL'].value_counts(normalize=True)

LABEL
ham     0.707276
spam    0.292724
Name: proportion, dtype: float64

In [9]:
test_set['LABEL'].value_counts(normalize=True)

LABEL
ham     0.72147
spam    0.27853
Name: proportion, dtype: float64

In [10]:
# After cleaning
training_set['INFO'] = training_set['INFO'].str.replace(
   '\W', ' ') # Removes punctuation
training_set['INFO'] = training_set['INFO'].str.lower()

In [11]:
training_set['INFO'] = training_set['INFO'].str.split()

vocabulary = []
for sms in training_set['INFO']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [12]:
word_counts_per_text = {unique_word: [0] * len(training_set['INFO']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['INFO']):
   for word in sms:
      word_counts_per_text[word][index] += 1

In [13]:
word_counts = pd.DataFrame(word_counts_per_text)

In [14]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)

In [15]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['label_num'] == 1]
ham_messages = training_set_clean[training_set_clean['label_num'] == 0]

In [16]:
# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['INFO'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['INFO'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [17]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

In [18]:
# Calculate parameters
for word in vocabulary:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham

In [44]:
class NaiveBayesClassifier:
    def __init__(self, parameters_spam, parameters_ham, p_spam, p_ham):
        self.parameters_spam = parameters_spam
        self.parameters_ham = parameters_ham
        self.p_spam = p_spam
        self.p_ham = p_ham

    def classify(self, message):
        message = re.sub('\W', ' ', message)
        message = message.lower().split()

        p_spam_given_message = self.p_spam
        p_ham_given_message = self.p_ham

        for word in message:
            if word in self.parameters_spam:
                p_spam_given_message *= self.parameters_spam[word]

            if word in self.parameters_ham:
                p_ham_given_message *= self.parameters_ham[word]

        if p_ham_given_message > p_spam_given_message:
            return 0
        else:
            return 1

In [45]:
classifier = NaiveBayesClassifier(parameters_spam, parameters_ham, p_spam, p_ham)
classifier.classify('WINNER!! This is the secret code to unlock the money: C3421.')

1

In [19]:
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'
   elif p_spam_given_message > p_ham_given_message:
      return 'spam'
   else:
      return 'needs human classification'

In [20]:
test_set['predicted'] = test_set['INFO'].apply(classify_test_set)
test_set

Unnamed: 0,LABEL,INFO,label_num,predicted
0,ham,teco tap 115 . 000 / hpl iferc ; 10 . 000 / en...,0,ham
1,spam,having trouble reading\r\nthis e - mail ? clic...,1,needs human classification
2,ham,done .\r\ndaren j farmer @ ect\r\n06 / 29 / 20...,0,needs human classification
3,spam,this message will inform you on how to remove ...,1,needs human classification
4,spam,microsoft windows xp professional 2002 $ 50 re...,1,needs human classification
...,...,...,...,...
1029,spam,driving at ? in 1876\r\ndogs and cats that ' s...,1,spam
1030,ham,- - - - - - - - - - - - - - - - - - - - - - fo...,0,ham
1031,spam,"hi ,\r\nregalis , also known as superviagra or...",1,spam
1032,ham,please make sure this is clearly understood by...,0,needs human classification


In [21]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['LABEL'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 568
Incorrect: 466
Accuracy: 0.5493230174081238


In [23]:
joblib.dump(parameters_spam, "parameters_spam.joblib")
joblib.dump(parameters_ham, "parameters_ham.joblib")
joblib.dump(p_spam, "p_spam.joblib")
joblib.dump(p_ham, "p_ham.joblib")

['p_ham.joblib']