# Spam Filter - Alternative Algorithm Versions (II, III and IV)


**This notebook compiles the core structure of the algorithm built in Project 14.**

Algorithm's purpose(s):

- 1st, determine whether an SMS (cellphone message) is spam or not.
- 2nd, return a percentage of accuracy when applying it to a test data set comprised of SMSs (pre-evaluated as spam or not spam).

In each version/script we tweak two sections of the algorithm where the input messages (strings) are divided into a list of words: 

- Section 1 - in the training section of the algorithm.
- Section 2 - inside the `classify_test_set` function.
    
Notes: 

- the boundaries of these sections are marked with comments named 'Tweak section 1' and 'Tweak section 2'.

- the case-sensitive versions are omitted for the sake of readability.

In [1]:
import numpy as np
import pandas as pd
import re

## Version II - Recognize currency symbols: £, € and \$
---

In this version we simply make the list of words derived from the message recognize currency symbols.

Full script version II:

In [2]:
# Load the raw collection: tab-separated, no header row, two columns.
sms_spam_full = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['Label', 'SMS']
)

# Shuffle all rows reproducibly (`frac=1` keeps every row) and renumber them from 0.
random_sms_spam = sms_spam_full.sample(n=None, frac=1, random_state=1).reset_index(drop=True)

# `iloc` excludes the stop position, so the training set is rows 0-4457 and the
# testing set starts at row 4458. The former `4458+1` put row 4458 in BOTH sets,
# leaking one test message into training.
# Explicit copies make both sets independent DataFrames, so that adding columns
# to them later does not write into a slice of the shuffled frame.
training_set = random_sms_spam.iloc[:4458, :].copy()

testing_set = random_sms_spam.iloc[4458:, :].copy()

# Training set: share of each label, as a percentage.
count_label_training = training_set.Label.value_counts(normalize=True).round(3)*100

count_label_training = count_label_training.rename('ham vs spam (%)')

# Testing set: share of each label, as a percentage.
count_label_testing = testing_set.Label.value_counts(normalize=True).round(3)*100

count_label_testing = count_label_testing.rename('ham vs spam (%)')

#---------------------------------------

# Free RAM 1.
del sms_spam_full
del random_sms_spam


### Tweak section 1 (start) - how a message is split into a list of words.

# Training set `SMS` cleaned series

# This replacement turns everything that is not a letter, a digit,
# whitespace, or one of the three currency symbols (£, € and $) into a space.
# Raw strings are used for every pattern so that `\s`, `\$`, `\A` and `\Z`
# reach the regex engine untouched instead of being invalid string escapes.
cleaned = training_set.SMS.copy().str.replace(r'[^A-Za-z0-9\s£€\$]', ' ', regex=True)
# This replacement surrounds each currency symbol with whitespace, as boundaries.
cleaned = cleaned.str.replace(r'(£|€|\$)', r' \1 ', regex=True)

#-------------------------------------------------------------

# `\s+` ensures that two or more joined whitespaces are converted to just one.
cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)

# Remove the spaces left at the very start and at the very end of each message.
cleaned = cleaned.str.replace(r'(\A +| +\Z)', '', regex=True)

# Lower case for every string.
cleaned = cleaned.str.lower()

### Tweak section 1 (end).

#-------------------------------------------------------------

# One column per word position in the message.
cleaned_split = cleaned.str.split(' ', expand=True)

# Chain every row into a single long Series of words.
cleaned_split_cat = pd.concat([cleaned_split.iloc[i, :] for i in range(0, cleaned_split.shape[0])], ignore_index=True)

# Drop the missing values left in the positions that shorter messages do not fill.
cleaned_split_cat = cleaned_split_cat.dropna()

cleaned_split_cat_to_list = cleaned_split_cat.to_list()

vocabulary_set = set(cleaned_split_cat_to_list)

# Build the vocabulary without the empty string (it can appear when a cleaned
# message is empty). Filtering while building the list avoids deleting items
# from a list that is being iterated, which skips the element after each deletion.
vocabulary = [word for word in vocabulary_set if word != '']

#-------------------------------------------------------------

# Free RAM 2.
del cleaned_split_cat
del cleaned_split_cat_to_list
del vocabulary_set

#-------------------------------------------------------------

def remove_elements(list_x, list_strings):
    """Remove from `list_x`, in place, every element found in `list_strings`.

    The previous implementation deleted items while enumerating the same
    list, which skips the element that follows each deletion, so consecutive
    matches were left behind. The list is now rebuilt in a single pass and
    written back through a slice assignment, so the caller's list object is
    still the one that gets modified.

    Args:
        list_x: list to filter; it is modified in place.
        list_strings: collection of values to drop from `list_x`.

    Returns:
        The same `list_x` object, without the unwanted elements.
    """

    list_x[:] = [el for el in list_x if el not in list_strings]

    return list_x

# Each cleaned message becomes a list of words (`expand=False` is the default).
cleaned_split_listed = cleaned.str.split(' ')

strings_to_remove = ['']

# Strip the unwanted strings out of every word list.
cleaned_split_listed_1 = cleaned_split_listed.copy().apply(
    lambda words: remove_elements(words, strings_to_remove)
)

#-------------------------------------------------------------

# For every vocabulary word, a list of zeros with one slot per message.
n_messages = len(cleaned_split_listed_1)
word_counts_per_sms = {unique_word: [0] * n_messages for unique_word in vocabulary}

# Fill in how many times each word occurs in each message.
for sms_position, sms_words in enumerate(cleaned_split_listed_1):
    for sms_word in sms_words:
        word_counts_per_sms[sms_word][sms_position] += 1

# Convert dictionary into DataFrame (one column per word).
word_counts_per_sms_df = pd.DataFrame(word_counts_per_sms)

# `sort=False` is required to preserve the order of the columns in a 'first in' fashion.
training_set_2 = pd.concat([training_set, word_counts_per_sms_df], axis=1, sort=False)

#-------------------------------------------------------------

# Free RAM 3.
del cleaned_split_listed
del word_counts_per_sms

#-------------------------------------------------------------

# Prior probabilities: relative frequency of each label in the training set.
label_counts_ts2 = training_set_2['Label'].value_counts(normalize=True)

p_spam = label_counts_ts2['spam']

p_ham = label_counts_ts2['ham']

n_vocabulary = len(vocabulary)

#-------------------------------------------------------------

# Total number of counted words per message (columns 0 and 1 are `Label` and `SMS`).
training_set_2['sum_words_sms'] = training_set_2.iloc[:, 2:].copy().sum(axis=1)

is_spam = training_set_2['Label'] == 'spam'
is_ham = training_set_2['Label'] == 'ham'

# Total number of words in all spam / all ham messages.
n_spam = training_set_2.loc[is_spam, 'sum_words_sms'].sum()

n_ham = training_set_2.loc[is_ham, 'sum_words_sms'].sum()

# Smoothing term added to every word count.
alpha = 1

#-------------------------------------------------------------

# `.iloc[:, 2:-1]` keeps only the word columns: the last column,
# 'sum_words_sms', is left out of these sums.
sms_spam = training_set_2.iloc[:, 2:-1].copy()[is_spam]

sms_ham = training_set_2.iloc[:, 2:-1].copy()[is_ham]

# Occurrences of each word across all spam / all ham messages.
sms_spam_sum = sms_spam.sum().transpose()

sms_ham_sum = sms_ham.sum().transpose()

# P(w_i|Spam)
spam_divisor = n_spam + (alpha*n_vocabulary)
p_wi_given_spam_dict = {
    word: (count + alpha) / spam_divisor
    for word, count in sms_spam_sum.items()
}

# P(w_i|Ham)
ham_divisor = n_ham + (alpha*n_vocabulary)
p_wi_given_ham_dict = {
    word: (count + alpha) / ham_divisor
    for word, count in sms_ham_sum.items()
}

#-------------------------------------------------------------

# Free RAM 4.
del sms_spam
del sms_ham

#-------------------------------------------------------------

def classify_test_set(message):
    """Classify one cellphone message (SMS) as 'spam' or 'ham'.

    The message is cleaned and split into words the same way as in the
    training section, then the label priors `p_spam` / `p_ham` are multiplied
    by the per-word probabilities stored in `p_wi_given_spam_dict` and
    `p_wi_given_ham_dict`. Words missing from a dictionary are ignored.

    NOTE(review): the probabilities are multiplied directly, so a very long
    message could drive both products to 0.0 and end up in the tie branch.

    Args:
        message: the SMS text, a string.

    Returns:
        'ham', 'spam', or 'Requires human classification.' when both
        products are equal.
    """

    ### Tweak section 2 (start) - how a message is split into a list of words

    # Keep only letters, digits, whitespace and the currency symbols £, € and $.
    text = re.sub(r'[^A-Za-z0-9\s£€\$]', ' ', message)
    # Isolate each currency symbol with surrounding spaces.
    text = re.sub(r'(£|€|\$)', r' \1 ', text)
    text = re.sub(r'\s+', ' ', text)
    words = text.lower().split()  # now a list of strings

    ### Tweak section 2 (end)

    # Calculating P(Spam|w_1, w_2, ..., w_n) with P(Ham|w_1, w_2, ..., w_n).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in words:
        spam_likelihood = p_wi_given_spam_dict.get(word)
        if spam_likelihood is not None:
            p_spam_given_message *= spam_likelihood

        ham_likelihood = p_wi_given_ham_dict.get(word)
        if ham_likelihood is not None:
            p_ham_given_message *= ham_likelihood

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    if p_spam_given_message > p_ham_given_message:
        return 'spam'
    return 'Requires human classification.'
    

# Classify every message of the testing set.
testing_set['Test'] = testing_set['SMS'].apply(classify_test_set)

#-------------------------------------------------------------

# Assigns True if the predicted label equals the pre-assigned one; multiplying
# by one converts True into 1 and False into 0.
testing_set['Correct'] = (testing_set['Label'] == testing_set['Test'])*1

# Percentage of correctly classified messages.
test_accuracy = (testing_set['Correct'].sum() / testing_set.shape[0])*100

test_accuracy = test_accuracy.round(2)

# The figures reported here come from `testing_set`; the message used to name
# `training_set` by mistake.
print(f'When applied to the messages in the `testing_set` ({testing_set.shape[0]} entries) the test accuracy was aprox. {test_accuracy}%.')

When applied to the messages in the `training_set` (1114 entries) the test accuracy was aprox. 98.83%.


## Version III - Recognize currency symbols: £, € and \$, and references to GBP pences
---

We've seen in the previous version that this innovation will marginally improve the algorithm's accuracy, but some money references were still not accounted for.

We can tell that we are presumably working with a set of messages from the UK, since the money/currency references are made either in 'GBP' or in pence. When filtering the training set for 'pence' or some other proxy, we realize that there are many variations of this telltale sign of spamming. Some variations on 'pence', such as '150pm' (which stands for '150 pence per message'), can be seen below.

The regular expression `pattern_pences` looks for every word that contains a run of one to three digits (`\d{1,3}`) immediately followed by the letter `p`. The three-digit limit reflects the fact that references to pence usually range from 5 to 150 (or somewhat more), but not into the thousands. Expected matches are:

|sub-pattern  |  example                 | 
|---------|--------------------------|
|  \d+p\w+ |'**150p**pmsg'   | 
|  \d+p\w+ |'**50p**erwksub'   | 
|  \w+\d+p | '08712400602**450p**'  | 
|  \w+\d{1,}p\w+ | 'com1win**150p**pmx3age16'  | 

From the table above we can see that in some situations, such as the third example (from the top), a word may not be referencing pence at all, while other cases, such as the second example, are ambiguous.

In [3]:
# A whole word that contains one to three digits immediately followed by `p`.
pattern_pences = r'\b\w*\d{1,3}p\w*\b'

# Keep the first such word of every message that has at least one.
list_of_patterns = []

for sms_text in cleaned:
    found = re.findall(pattern_pences, sms_text)
    if found:
        list_of_patterns.append(found[0])

list_of_patterns[:10]

['08700621170150p',
 '150pm',
 '150p',
 '10p',
 '150p',
 '150p',
 '50pmmorefrommobile2bremoved',
 'mobstorequiz10ppm',
 'callcost150ppmmobilesvary',
 '450pw']

One attempt to make the algorithm recognize references to pences is to separate expressions like the ones previously mentioned, e.g. '10p', '150p', from the rest of the characters with a whitespace. The way to achieve this is to:

- first, identify a list of patterns (which is already done above).
- second, create a modified string compiled in a `list_of_replacements`.
- third and last, replace the old expression with the new expression in the main data set, inside the same message, by means of the `replace` function.


Example of a message modification: `training_set.SMS[186]`, contains this expression: 'callcost150ppmmobilesvary'.

In its former form:

- 'URGENT  This is the 2nd attempt to contact U U have WON  £ 1000CALL 09071512432 b4 300603t csBCM4235WC1N3XX callcost150ppmmobilesvary  max £ 7  50'

In its later form, after the modification:

- 'URGENT  This is the 2nd attempt to contact U U have WON  £ 1000CALL 09071512432 b4 300603t csBCM4235WC1N3XX callcost 150p pmmobilesvary  max £ 7  50'


The list of replacements is built as follows:

Note: in the first loop the condition `re.findall('([1-9]pm|1[0-2]pm)', val_1) == []` was set so that time-of-day references, namely 1pm to 12pm, are not recognized as pence. Although this clears some misinterpreted pence references, a small number of genuine ones will not be recognized, e.g. if a message contains the expression '10pm' meaning '10 pence per message' instead of '10 p.m.', it will not be considered a reference to pence.

In [4]:
# `list_of_replacements[i]` is the text that replaces `list_of_patterns[i]`,
# so both lists must stay aligned entry by entry.
list_of_replacements = []
# The digits+`p` pieces found in each pattern that was treated as a pence reference.
pences_references = []

for pence_ref in list_of_patterns:

    # Time-of-day references (1pm to 12pm) are not pence: store the pattern
    # itself so that replacing it leaves the message unchanged. Previously this
    # case reused `pence_ref`/`digits_p` from the preceding iteration (or failed
    # with a NameError on the first one) and stored the wrong replacement.
    if re.search('([1-9]pm|1[0-2]pm)', pence_ref):
        list_of_replacements.append(pence_ref)
        continue

    digits_p = re.findall(r'\d{1,3}p', pence_ref)
    pences_references.append(digits_p)

    # Surround every digits+`p` piece with spaces in one pass. Substituting the
    # pieces one at a time with `count=1` could hit an earlier, different
    # occurrence of the same text (e.g. the '50p' inside '150p').
    list_of_replacements.append(re.sub(r'(\d{1,3}p)', r' \1 ', pence_ref))

list_of_replacements[:10]

['08700621170 150p ',
 ' 150p m',
 ' 150p ',
 ' 10p ',
 ' 150p ',
 ' 150p ',
 ' 50p mmorefrommobile2bremoved',
 'mobstorequiz 10p pm',
 'callcost 150p pmmobilesvary',
 ' 450p w']

The following function, when applied to a string, looks for expressions that are in `list_of_patterns` and, if there is a match, replaces that expression with its equivalent version (with whitespace separating the pence reference from the surrounding characters, as seen previously), taken from `list_of_replacements`.

In [5]:
def replace(string):
    """Separate the first known pence pattern found in `string`.

    Walks `list_of_patterns` in order; for the first pattern that occurs in
    `string`, every occurrence of it is substituted by the entry at the same
    position of `list_of_replacements`. Only that first matching pattern is
    handled.

    Args:
        string: the message text.

    Returns:
        The modified text, or `string` unchanged when no pattern occurs in it
        (or when the substitution yields an empty string).
    """

    position = next(
        (i for i, val in enumerate(list_of_patterns) if val in string),
        None,
    )

    if position is None:
        return string

    new_string = re.sub(list_of_patterns[position], list_of_replacements[position], string)

    return new_string if new_string else string

Verifying changes made by the function in two random spam messages:

In [6]:
# The message from the previous example, before and after `replace`.
sample_sms = cleaned[186]
print(sample_sms, replace(sample_sms), sep='\n\n')

urgent this is the 2nd attempt to contact u u have won £ 1000call 09071512432 b4 300603t csbcm4235wc1n3xx callcost150ppmmobilesvary max £ 7 50

urgent this is the 2nd attempt to contact u u have won £ 1000call 09071512432 b4 300603t csbcm4235wc1n3xx callcost 150p pmmobilesvary max £ 7 50


In [7]:
# A second message, before and after `replace`.
sample_sms = cleaned[2821]
print(sample_sms, replace(sample_sms), sep='\n\n')

someone has contacted our dating service and entered your phone because they fancy you to find out who it is call from a landline 09111032124 pobox12n146tf150p

someone has contacted our dating service and entered your phone because they fancy you to find out who it is call from a landline 09111032124 pobox12n146tf 150p 


The full version III script:

In [8]:
# Load the raw collection: tab-separated, no header row, two columns.
sms_spam_full = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['Label', 'SMS']
)

# Shuffle all rows reproducibly (`frac=1` keeps every row) and renumber them from 0.
random_sms_spam = sms_spam_full.sample(n=None, frac=1, random_state=1).reset_index(drop=True)

# `iloc` excludes the stop position, so the training set is rows 0-4457 and the
# testing set starts at row 4458. The former `4458+1` put row 4458 in BOTH sets,
# leaking one test message into training.
# Explicit copies make both sets independent DataFrames, so that adding columns
# to them later does not write into a slice of the shuffled frame.
training_set = random_sms_spam.iloc[:4458, :].copy()

testing_set = random_sms_spam.iloc[4458:, :].copy()

# Training set: share of each label, as a percentage.
count_label_training = training_set.Label.value_counts(normalize=True).round(3)*100

count_label_training = count_label_training.rename('ham vs spam (%)')

# Testing set: share of each label, as a percentage.
count_label_testing = testing_set.Label.value_counts(normalize=True).round(3)*100

count_label_testing = count_label_testing.rename('ham vs spam (%)')

#---------------------------------------

# Saving RAM 1.
del sms_spam_full
del random_sms_spam

### Tweak section 1 (start) - how a message is split into a list of words.

# Training set `SMS` cleaned series: everything that is not a letter, a digit,
# whitespace or one of the currency symbols £, € and $ becomes a space.
# Raw strings keep `\s`, `\$`, `\b`, `\w` and `\d` intact for the regex engine.
cleaned = training_set.SMS.copy().str.replace(r'[^A-Za-z0-9\s£€\$]', ' ', regex=True)
# Surround each currency symbol with whitespace, as boundaries.
cleaned = cleaned.str.replace('€', ' € ', regex=False)
cleaned = cleaned.str.replace('£', ' £ ', regex=True)
cleaned = cleaned.str.replace(r'\$', ' $ ', regex=True)

# 1st, identify a list of patterns: the first word of each message that
# contains one to three digits immediately followed by `p`.
# NOTE(review): the text is not lower-cased yet at this point, so only a
# lower-case `p` is matched - confirm this is intended.
list_of_patterns = []

pattern_pences = r'\b\w*\d{1,3}p\w*\b'

for val in cleaned:
    pences = re.findall(pattern_pences, val)
    if pences:
        list_of_patterns.append(pences[0])

# 2nd, create a modified string compiled in a `list_of_replacements`.
# `list_of_replacements[i]` replaces `list_of_patterns[i]`, so both lists must
# stay aligned entry by entry.
list_of_replacements = []
pences_references = []

for pence_ref in list_of_patterns:

    # Time-of-day references (1pm to 12pm) are not pence: store the pattern
    # itself so that replacing it leaves the message unchanged. Previously this
    # case reused `pence_ref`/`digits_p` from the preceding iteration (or failed
    # with a NameError on the first one) and stored the wrong replacement.
    if re.search('([1-9]pm|1[0-2]pm)', pence_ref):
        list_of_replacements.append(pence_ref)
        continue

    digits_p = re.findall(r'\d{1,3}p', pence_ref)
    pences_references.append(digits_p)

    # Surround every digits+`p` piece with spaces in one pass. Substituting the
    # pieces one at a time with `count=1` could hit an earlier, different
    # occurrence of the same text (e.g. the '50p' inside '150p').
    list_of_replacements.append(re.sub(r'(\d{1,3}p)', r' \1 ', pence_ref))
        
# 3rd, replacing in the main data set the old expression for the new expression inside the same message resorting to the `replace` function.

def replace(string):
    """Separate the first known pence pattern found in `string`.

    Walks `list_of_patterns` in order; for the first pattern that occurs in
    `string`, every occurrence of it is substituted by the entry at the same
    position of `list_of_replacements`. Only that first matching pattern is
    handled.

    Args:
        string: the message text.

    Returns:
        The modified text, or `string` unchanged when no pattern occurs in it
        (or when the substitution yields an empty string).
    """

    position = next(
        (i for i, val in enumerate(list_of_patterns) if val in string),
        None,
    )

    if position is None:
        return string

    new_string = re.sub(list_of_patterns[position], list_of_replacements[position], string)

    return new_string if new_string else string


# Separate the pence reference inside every message.
cleaned = cleaned.apply(replace)

# `\s+` ensures that two or more joined whitespaces are converted to just one.
# Raw strings keep `\s`, `\A` and `\Z` intact for the regex engine.
cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)

# Remove the spaces left at the very start and at the very end of each message.
cleaned = cleaned.str.replace(r'(\A +| +\Z)', '', regex=True)

# Lower case for every string.
cleaned = cleaned.str.lower()

### Tweak section 1 (end).

#-------------------------------------------------------------

# One column per word position in the message.
cleaned_split = cleaned.str.split(' ', expand=True)

# Chain every row into a single long Series of words.
cleaned_split_cat = pd.concat([cleaned_split.iloc[i, :] for i in range(0, cleaned_split.shape[0])], ignore_index=True)

# Drop the missing values left in the positions that shorter messages do not fill.
cleaned_split_cat = cleaned_split_cat.dropna()

cleaned_split_cat_to_list = cleaned_split_cat.to_list()

vocabulary_set = set(cleaned_split_cat_to_list)

# Build the vocabulary without the empty string (it can appear when a cleaned
# message is empty). Filtering while building the list avoids deleting items
# from a list that is being iterated, which skips the element after each deletion.
vocabulary = [word for word in vocabulary_set if word != '']

#-------------------------------------------------------------

# Free RAM 2.
del cleaned_split_cat
del cleaned_split_cat_to_list
del vocabulary_set

#-------------------------------------------------------------

def remove_elements(list_x, list_strings):
    """Remove from `list_x`, in place, every element found in `list_strings`.

    The previous implementation deleted items while enumerating the same
    list, which skips the element that follows each deletion, so consecutive
    matches were left behind. The list is now rebuilt in a single pass and
    written back through a slice assignment, so the caller's list object is
    still the one that gets modified.

    Args:
        list_x: list to filter; it is modified in place.
        list_strings: collection of values to drop from `list_x`.

    Returns:
        The same `list_x` object, without the unwanted elements.
    """

    list_x[:] = [el for el in list_x if el not in list_strings]

    return list_x


# Each cleaned message becomes a list of words (`expand=False` is the default).
cleaned_split_listed = cleaned.str.split(' ')

strings_to_remove = ['']

# Strip the unwanted strings out of every word list.
cleaned_split_listed_1 = cleaned_split_listed.copy().apply(
    lambda words: remove_elements(words, strings_to_remove)
)

#-------------------------------------------------------------

# For every vocabulary word, a list of zeros with one slot per message.
n_messages = len(cleaned_split_listed_1)
word_counts_per_sms = {unique_word: [0] * n_messages for unique_word in vocabulary}

# Fill in how many times each word occurs in each message.
for sms_position, sms_words in enumerate(cleaned_split_listed_1):
    for sms_word in sms_words:
        word_counts_per_sms[sms_word][sms_position] += 1

# Convert dictionary into DataFrame (one column per word).
word_counts_per_sms_df = pd.DataFrame(word_counts_per_sms)

# `sort=False` is required to preserve the order of the columns in a 'first in' fashion.
training_set_2 = pd.concat([training_set, word_counts_per_sms_df], axis=1, sort=False)

#-------------------------------------------------------------

# Free RAM 3.
del cleaned_split_listed
del word_counts_per_sms

#-------------------------------------------------------------

# Prior probabilities: relative frequency of each label in the training set.
label_counts_ts2 = training_set_2['Label'].value_counts(normalize=True)

p_spam = label_counts_ts2['spam']

p_ham = label_counts_ts2['ham']

n_vocabulary = len(vocabulary)

#-------------------------------------------------------------

# Total number of counted words per message (columns 0 and 1 are `Label` and `SMS`).
training_set_2['sum_words_sms'] = training_set_2.iloc[:, 2:].copy().sum(axis=1)

is_spam = training_set_2['Label'] == 'spam'
is_ham = training_set_2['Label'] == 'ham'

# Total number of words in all spam / all ham messages.
n_spam = training_set_2.loc[is_spam, 'sum_words_sms'].sum()

n_ham = training_set_2.loc[is_ham, 'sum_words_sms'].sum()

# Smoothing term added to every word count.
alpha = 1

#-------------------------------------------------------------

# `.iloc[:, 2:-1]` keeps only the word columns: the last column,
# 'sum_words_sms', is left out of these sums.
sms_spam = training_set_2.iloc[:, 2:-1].copy()[is_spam]

sms_ham = training_set_2.iloc[:, 2:-1].copy()[is_ham]

# Occurrences of each word across all spam / all ham messages.
sms_spam_sum = sms_spam.sum().transpose()

sms_ham_sum = sms_ham.sum().transpose()

# P(w_i|Spam)
spam_divisor = n_spam + (alpha*n_vocabulary)
p_wi_given_spam_dict = {
    word: (count + alpha) / spam_divisor
    for word, count in sms_spam_sum.items()
}

# P(w_i|Ham)
ham_divisor = n_ham + (alpha*n_vocabulary)
p_wi_given_ham_dict = {
    word: (count + alpha) / ham_divisor
    for word, count in sms_ham_sum.items()
}

#-------------------------------------------------------------

# Free RAM 4.
del sms_spam
del sms_ham

#-------------------------------------------------------------

def classify_test_set(message):
    """Classify one cellphone message (SMS) as 'spam' or 'ham'.

    The message is cleaned, passed through `replace` to separate pence
    references, and split into words the same way as in the training section.
    The label priors `p_spam` / `p_ham` are then multiplied by the per-word
    probabilities stored in `p_wi_given_spam_dict` and `p_wi_given_ham_dict`.
    Words missing from a dictionary are ignored.

    NOTE(review): the probabilities are multiplied directly, so a very long
    message could drive both products to 0.0 and end up in the tie branch.

    Args:
        message: the SMS text, a string.

    Returns:
        'ham', 'spam', or 'Requires human classification.' when both
        products are equal.
    """

    ### Tweak section 2 (start) - how a message is split into a list of words

    # Keep only letters, digits, whitespace and the currency symbols £, € and $.
    text = re.sub(r'[^A-Za-z0-9\s£€\$]', ' ', message)
    # Isolate each currency symbol with surrounding spaces.
    for symbol in ('£', '€', '$'):
        text = text.replace(symbol, ' ' + symbol + ' ')
    # Separate the pence reference, as done for the training messages.
    text = replace(text)
    text = re.sub(r'\s+', ' ', text)
    words = text.lower().split()  # now a list of strings

    ### Tweak section 2 (end).

    # Calculating P(Spam|w_1, w_2, ..., w_n) with P(Ham|w_1, w_2, ..., w_n).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in words:
        spam_likelihood = p_wi_given_spam_dict.get(word)
        if spam_likelihood is not None:
            p_spam_given_message *= spam_likelihood

        ham_likelihood = p_wi_given_ham_dict.get(word)
        if ham_likelihood is not None:
            p_ham_given_message *= ham_likelihood

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    if p_spam_given_message > p_ham_given_message:
        return 'spam'
    return 'Requires human classification.'
    

# Classify every message of the testing set.
testing_set['Test'] = testing_set['SMS'].apply(classify_test_set)

#-------------------------------------------------------------

# Assigns True if the predicted label equals the pre-assigned one; multiplying
# by one converts True into 1 and False into 0.
testing_set['Correct'] = (testing_set['Label'] == testing_set['Test'])*1

# Percentage of correctly classified messages.
test_accuracy = (testing_set['Correct'].sum() / testing_set.shape[0])*100

test_accuracy = test_accuracy.round(2)

# The figures reported here come from `testing_set`; the message used to name
# `training_set` by mistake.
print(f'When applied to the messages in the `testing_set` ({testing_set.shape[0]} entries) the test accuracy was aprox. {test_accuracy}%.')

When applied to the messages in the `training_set` (1114 entries) the test accuracy was aprox. 98.83%.


## Version IV - Separate all numbers from letters and from other characters/symbols.
---

For this version, in sections 1 and 2, instead of removing every non-word character (`\W`), we identify digits and symbols and insert whitespace around them as boundaries:

    cleaned = training_set.\
        SMS.copy().\
        str.replace('(\W|\d+)', r' \1 ', regex=True)

    message = re.sub('(\W|\d+)', r' \1 ', message)
    
The replacement argument, `r' \1 '`, takes whatever was matched by the pattern `'(\W|\d+)'` in a string and re-inserts it with a whitespace on its left and on its right.

An example below:

In [9]:
# One training message, before and after surrounding every non-word
# character and every run of digits with spaces.
sample_sms = training_set.SMS[186]
print(sample_sms, re.sub(r'(\W|\d+)', r' \1 ', sample_sms), sep='\n\n')

URGENT! This is the 2nd attempt to contact U!U have WON £1000CALL 09071512432 b4 300603t&csBCM4235WC1N3XX.callcost150ppmmobilesvary. max£7. 50

URGENT !    This   is   the    2 nd   attempt   to   contact   U ! U   have   WON    £  1000 CALL    09071512432    b 4     300603 t & csBCM 4235 WC 1 N 3 XX . callcost 150 ppmmobilesvary .    max £  7  .     50 


The full version IV script:


In [10]:
# Load the raw collection: tab-separated, no header row, two columns.
sms_spam_full = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['Label', 'SMS']
)

# Shuffle all rows reproducibly (`frac=1` keeps every row) and renumber them from 0.
random_sms_spam = sms_spam_full.sample(n=None, frac=1, random_state=1).reset_index(drop=True)

# `iloc` excludes the stop position, so the training set is rows 0-4457 and the
# testing set starts at row 4458. The former `4458+1` put row 4458 in BOTH sets,
# leaking one test message into training.
# Explicit copies make both sets independent DataFrames, so that adding columns
# to them later does not write into a slice of the shuffled frame.
training_set = random_sms_spam.iloc[:4458, :].copy()

testing_set = random_sms_spam.iloc[4458:, :].copy()

# Training set: share of each label, as a percentage.
count_label_training = training_set.Label.value_counts(normalize=True).round(3)*100

count_label_training = count_label_training.rename('ham vs spam (%)')

# Testing set: share of each label, as a percentage.
count_label_testing = testing_set.Label.value_counts(normalize=True).round(3)*100

count_label_testing = count_label_testing.rename('ham vs spam (%)')

#---------------------------------------

# Free RAM 1.
del sms_spam_full
del random_sms_spam

### Tweak section 1 (start) - how a message is split into a list of words.

# Surround every non-word character and every run of digits with spaces, so
# that each becomes a word of its own.
# Raw strings keep `\W`, `\d`, `\s`, `\A` and `\Z` intact for the regex engine.
cleaned = training_set.SMS.copy().str.replace(r'(\W|\d+)', r' \1 ', regex=True)

#-------------------------------------------------------------

# `\s+` ensures that two or more joined whitespaces are converted to just one.
cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)

# Remove the spaces left at the very start and at the very end of each message.
cleaned = cleaned.str.replace(r'(\A +| +\Z)', '', regex=True)

# Lower case for every string.
cleaned = cleaned.str.lower()

### Tweak section 1 (end).

#-------------------------------------------------------------

# One column per word position in the message.
cleaned_split = cleaned.str.split(' ', expand=True)

# Chain every row into a single long Series of words.
cleaned_split_cat = pd.concat([cleaned_split.iloc[i, :] for i in range(0, cleaned_split.shape[0])], ignore_index=True)

# Drop the missing values left in the positions that shorter messages do not fill.
cleaned_split_cat = cleaned_split_cat.dropna()

cleaned_split_cat_to_list = cleaned_split_cat.to_list()

vocabulary_set = set(cleaned_split_cat_to_list)

# Build the vocabulary without the empty string (it can appear when a cleaned
# message is empty). Filtering while building the list avoids deleting items
# from a list that is being iterated, which skips the element after each deletion.
vocabulary = [word for word in vocabulary_set if word != '']

#-------------------------------------------------------------

# Free RAM 2.
del cleaned_split_cat
del cleaned_split_cat_to_list
del vocabulary_set

#-------------------------------------------------------------

def remove_elements(list_x, list_strings):
    """Remove from `list_x`, in place, every element found in `list_strings`.

    The previous implementation deleted items while enumerating the same
    list, which skips the element that follows each deletion, so consecutive
    matches were left behind. The list is now rebuilt in a single pass and
    written back through a slice assignment, so the caller's list object is
    still the one that gets modified.

    Args:
        list_x: list to filter; it is modified in place.
        list_strings: collection of values to drop from `list_x`.

    Returns:
        The same `list_x` object, without the unwanted elements.
    """

    list_x[:] = [el for el in list_x if el not in list_strings]

    return list_x

# Each cleaned message becomes a list of words (`expand=False` is the default).
cleaned_split_listed = cleaned.str.split(' ')

strings_to_remove = ['']

# Strip the unwanted strings out of every word list.
cleaned_split_listed_1 = cleaned_split_listed.copy().apply(
    lambda words: remove_elements(words, strings_to_remove)
)

#-------------------------------------------------------------

# For every vocabulary word, a list of zeros with one slot per message.
n_messages = len(cleaned_split_listed_1)
word_counts_per_sms = {unique_word: [0] * n_messages for unique_word in vocabulary}

# Fill in how many times each word occurs in each message.
for sms_position, sms_words in enumerate(cleaned_split_listed_1):
    for sms_word in sms_words:
        word_counts_per_sms[sms_word][sms_position] += 1

# Convert dictionary into DataFrame (one column per word).
word_counts_per_sms_df = pd.DataFrame(word_counts_per_sms)

# `sort=False` is required to preserve the order of the columns in a 'first in' fashion.
training_set_2 = pd.concat([training_set, word_counts_per_sms_df], axis=1, sort=False)

#-------------------------------------------------------------

# Free RAM 3.
del cleaned_split_listed
del word_counts_per_sms

#-------------------------------------------------------------

# Prior probabilities: relative frequency of each label in the training set.
label_counts_ts2 = training_set_2['Label'].value_counts(normalize=True)

p_spam = label_counts_ts2['spam']

p_ham = label_counts_ts2['ham']

n_vocabulary = len(vocabulary)

#-------------------------------------------------------------

# Total number of counted words per message (columns 0 and 1 are `Label` and `SMS`).
training_set_2['sum_words_sms'] = training_set_2.iloc[:, 2:].copy().sum(axis=1)

is_spam = training_set_2['Label'] == 'spam'
is_ham = training_set_2['Label'] == 'ham'

# Total number of words in all spam / all ham messages.
n_spam = training_set_2.loc[is_spam, 'sum_words_sms'].sum()

n_ham = training_set_2.loc[is_ham, 'sum_words_sms'].sum()

# Smoothing term added to every word count.
alpha = 1

#-------------------------------------------------------------

# `.iloc[:, 2:-1]` keeps only the word columns: the last column,
# 'sum_words_sms', is left out of these sums.
sms_spam = training_set_2.iloc[:, 2:-1].copy()[is_spam]

sms_ham = training_set_2.iloc[:, 2:-1].copy()[is_ham]

# Occurrences of each word across all spam / all ham messages.
sms_spam_sum = sms_spam.sum().transpose()

sms_ham_sum = sms_ham.sum().transpose()

# P(w_i|Spam)
spam_divisor = n_spam + (alpha*n_vocabulary)
p_wi_given_spam_dict = {
    word: (count + alpha) / spam_divisor
    for word, count in sms_spam_sum.items()
}

# P(w_i|Ham)
ham_divisor = n_ham + (alpha*n_vocabulary)
p_wi_given_ham_dict = {
    word: (count + alpha) / ham_divisor
    for word, count in sms_ham_sum.items()
}

#-------------------------------------------------------------

# Free RAM 4.
del sms_spam
del sms_ham

#-------------------------------------------------------------


def classify_test_set(message):
    """Classify one cellphone message (SMS) as 'spam' or 'ham'.

    Every non-word character and every run of digits is separated into a word
    of its own, as in the training section. The label priors `p_spam` /
    `p_ham` are then multiplied by the per-word probabilities stored in
    `p_wi_given_spam_dict` and `p_wi_given_ham_dict`. Words missing from a
    dictionary are ignored.

    NOTE(review): the probabilities are multiplied directly, so a very long
    message could drive both products to 0.0 and end up in the tie branch.

    Args:
        message: the SMS text, a string.

    Returns:
        'ham', 'spam', or 'Requires human classification.' when both
        products are equal.
    """

    ### Tweak section 2 (start) - how a message is split into a list of words

    # Surround every non-word character and every run of digits with spaces.
    text = re.sub(r'(\W|\d+)', r' \1 ', message)
    text = re.sub(r'\s+', ' ', text)
    words = text.lower().split()  # now a list of strings

    ### Tweak section 2 (end)

    # Calculating P(Spam|w_1, w_2, ..., w_n) with P(Ham|w_1, w_2, ..., w_n).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in words:
        spam_likelihood = p_wi_given_spam_dict.get(word)
        if spam_likelihood is not None:
            p_spam_given_message *= spam_likelihood

        ham_likelihood = p_wi_given_ham_dict.get(word)
        if ham_likelihood is not None:
            p_ham_given_message *= ham_likelihood

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    if p_spam_given_message > p_ham_given_message:
        return 'spam'
    return 'Requires human classification.'
    

# Predicted label for every message of the testing set.
predicted_labels = testing_set['SMS'].apply(classify_test_set)
testing_set['Test'] = predicted_labels

#-------------------------------------------------------------

# True where the prediction equals the pre-assigned label; multiplying by one
# converts True into 1 and False into 0.
is_correct = testing_set['Label'] == testing_set['Test']
testing_set['Correct'] = is_correct*1

# Percentage of correctly classified messages, rounded to two decimals.
n_testing = testing_set.shape[0]
test_accuracy = np.round((testing_set['Correct'].sum() / n_testing)*100, 2)

print(f'When applied to the messages in the `testing_set` ({n_testing} entries) the test accuracy was aprox. {test_accuracy}%.')

When applied to the messages in the `testing_set` (1114 entries) the test accuracy was aprox. 98.65%.


\***