### This notebook compiles the core structure of the algorithm built in Project 12

Algorithm's purpose(s):

- 1st, determine if a SMS (cellphone message) is spam or not.
- 2nd, return a percentage of accuracy when applying it to a test data set comprised of SMSs (pre-evaluated as spam or not spam).

Notes:    

- In this notebook we tweak the three sections of the algorithm where the input messages (strings) are divided into a list of words (also strings): 



    - Section 1 - in the training section of the algorithm
    - Section 2 - inside the `classify` function (1st purpose mentioned above) 
    - Section 3 - inside the `classify_test_set` function (2nd purpose mentioned above)

In [3]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [20]:
sms_spam_full = pd.read_csv('SMSSpamCollection.txt',
                   sep='\t',
                   names=['Label', 'SMS'])


#-----------------------------------------

# 1.
random_sms_spam = sms_spam_full.sample(n=None, frac=1, random_state=1).reset_index(drop=True)

# 2.
training_set = random_sms_spam.copy().iloc[:4458+1, :]

testing_set = random_sms_spam.copy().iloc[4458:, :]

# 3.

# Training set.
count_label_training = training_set.Label.value_counts(normalize=True).round(3)*100

count_label_training = count_label_training.rename('ham vs spam (%)')

# Testing set.
count_label_testing = testing_set.Label.value_counts(normalize=True).round(3)*100

count_label_testing = count_label_testing.rename('ham vs spam (%)')

#---------------------------------------

# Saving RAM 1
del sms_spam_full
del random_sms_spam


### Tweak section 1 (start) - how a message is splitted into a list of words

# Training set `SMS` cleaned series
ts_cleaned_SMS = training_set.SMS.copy().str.replace('[^A-Za-z0-9\s£€\$]', ' ', regex=True)
ts_cleaned_SMS = ts_cleaned_SMS.str.replace('€', ' € ', regex=False)
ts_cleaned_SMS = ts_cleaned_SMS.str.replace('£', ' £ ', regex=True)
ts_cleaned_SMS = ts_cleaned_SMS.str.replace('\$', ' $ ', regex=True)

#-------------------------------------------------------------

list_of_patterns = []

pattern_pences = '\\b\w*\d{1,3}p\w*\\b'

for i, val in enumerate(ts_cleaned_SMS):
    pences = re.findall(pattern_pences, val)
    if len(pences):
        list_of_patterns.append(pences[0])
        
#-------------------------------------------------------------        
        
pences_references = []
list_of_replacements = []

for i, val_1 in enumerate(list_of_patterns):
    if re.findall('([1-9]pm|1[0-2]pm)', val_1) == []:
        pence_ref = val_1
        digits = re.findall('\d{1,3}p', pence_ref)
        pences_references.append(digits)
    
    final_str = ''
    
    for val_2 in digits:
        sub = ' ' + val_2 + ' '
        
        if final_str == '':
            final_str = re.sub(val_2, sub, pence_ref, count=1)
            
        else:
            final_str = re.sub(val_2, sub, final_str, count=1)
    
    list_of_replacements.append(final_str)
    
#-------------------------------------------------------------

def replace(string):
    
    new_string = ''
    
    for i, val in enumerate(list_of_patterns):
        if val in string:
            new_string += re.sub(val, list_of_replacements[i], string)
            break
            
    if len(new_string):
        return new_string
    else:
        return string
    
#-------------------------------------------------------------

ts_cleaned_SMS_2 = ts_cleaned_SMS.apply(replace)

#`\s+` ensures that if there are two or more joined whitespaces they are converted to just one.
ts_cleaned_SMS_2 = ts_cleaned_SMS_2.str.replace('\s+', ' ', regex=True) 

pat1 = '(?:\s)\Z' # whitespace end of the string

pat2 = '\A(?:\s)' # whitespace beginning of the string

ts_cleaned_SMS_2 = ts_cleaned_SMS_2.str.replace(pat1, '', regex=True) 
ts_cleaned_SMS_2 = ts_cleaned_SMS_2.str.replace(pat2, '', regex=True) 

# Lower case for every string.
# ts_cleaned_SMS_2 = ts_cleaned_SMS_2.str.lower()

#-------------------------------------------------------------

# 1.
ts_cleaned_SMS_split = ts_cleaned_SMS_2.str.split(' ', expand=True) 

# 2.
ts_cleaned_SMS_split_cat = pd.concat([ts_cleaned_SMS_split.iloc[i, :] for i in range(0, ts_cleaned_SMS_split.shape[0])], ignore_index=True)
# 3.
ts_cleaned_SMS_split_cat = ts_cleaned_SMS_split_cat.dropna()

# 4.
ts_cleaned_SMS_split_cat_to_list = ts_cleaned_SMS_split_cat.to_list()

#5.
vocabulary_set = set(ts_cleaned_SMS_split_cat_to_list)

#6.
vocabulary = list(vocabulary_set)

for index, el in enumerate(vocabulary):
    if el == '':
        del vocabulary[index]

#-------------------------------------------------------------
        
# Free RAM 2

del ts_cleaned_SMS_split_cat 
del ts_cleaned_SMS_split_cat_to_list
del vocabulary_set

#-------------------------------------------------------------

def remove_elements(list_x, list_strings):
    """Strings in list_strings are removed from list_x if this later 
    list contains any of those strings.
    """
    
    for index, el in enumerate(list_x):
        if el in list_strings:
            del list_x[index]
    
    return list_x


# `expand=False` by default.
ts_cleaned_SMS_split_listed = ts_cleaned_SMS_2.str.split(' ') 


### Tweak section 1 (end)



strings_to_remove = ['']

ts_cleaned_SMS_split_listed_1 = ts_cleaned_SMS_split_listed.copy().apply(lambda x: remove_elements(x, strings_to_remove))

#-------------------------------------------------------------

# From the tutorial.

# 1.
word_counts_per_sms = {unique_word: [0] * len(ts_cleaned_SMS_split_listed_1) for unique_word in vocabulary}

for index, sms in enumerate(ts_cleaned_SMS_split_listed_1):
    for word in sms:
        word_counts_per_sms[word][index] += 1
        
              
# 2. Convert dictionary into DataFrame
word_counts_per_sms_df = pd.DataFrame(word_counts_per_sms)


# 3. 
# `sort=False` is required to preserve the order of the columns in a 'first in' fashion.
training_set_2 = pd.concat([training_set, word_counts_per_sms_df], axis=1, sort=False)

#-------------------------------------------------------------

# Free RAM 3

del ts_cleaned_SMS_split_listed
del word_counts_per_sms

#-------------------------------------------------------------

label_counts_ts2 = training_set_2.Label.value_counts(normalize=True)

p_spam = label_counts_ts2.spam

p_ham = label_counts_ts2.ham

n_vocabulary = len(vocabulary)

#-------------------------------------------------------------

# 1.
training_set_2['sum_words_sms'] = training_set_2.iloc[:, 2:].copy().sum(axis=1)

# 2.

n_spam = training_set_2.loc[training_set_2.Label=='spam', 'sum_words_sms'].sum()

n_ham = training_set_2.loc[training_set_2.Label=='ham', 'sum_words_sms'].sum()

alpha = 1 

#-------------------------------------------------------------

# 1. the last colum 'sum_words_sms' is not included in the calculations
# of these Series, so I set `.iloc[:, 2:-1]`.
sms_spam = training_set_2.iloc[:, 2:-1].copy()[training_set_2.Label=='spam']

sms_ham = training_set_2.iloc[:, 2:-1].copy()[training_set_2.Label=='ham']


# 2.
sms_spam_sum = sms_spam.sum().transpose()

sms_ham_sum = sms_ham.sum().transpose()

# 3.

p_wi_given_spam_dict = {}

p_wi_given_ham_dict = {}

# P(w_i|Spam)
for i in range(0, sms_spam_sum.size):
    index = sms_spam_sum.index[i]
    dividend = sms_spam_sum[index] + alpha
    divisor = n_spam + (alpha*n_vocabulary)
    p_wi_given_spam_dict[index] =  dividend / divisor

    
# P(w_i|Ham)
for i in range(0, sms_ham_sum.size):
    index = sms_ham_sum.index[i]
    dividend = sms_ham_sum[index] + alpha
    divisor = n_ham + (alpha*n_vocabulary)
    p_wi_given_ham_dict[index] =  dividend / divisor
    
#-------------------------------------------------------------

# Free RAM 4

del sms_spam
del sms_ham

#-------------------------------------------------------------

def classify(message):
    """Takes in a string - a cellphone message (SMS), and returns the probability of Spam given the input message,
    the probability of non-spam (ham) given the input message and classifies whether
    the message is spam, not spam (ham), or if a human is required to classify the message.
    """

    ### Tweak section 2 (start) - how a message is splitted into a list of words
    
    message = re.sub('[^A-Za-z0-9\s£€\$]', ' ', message) # still a string
    message = re.sub('£', ' £ ', message)
    message = re.sub('€', ' € ', message)
    message = re.sub('\$', ' $ ', message)
    message = replace(message)
    message = re.sub('\s+', ' ', message)
#     message = message.lower() # still a string
    message = message.split() # now a list of strings
    
    ### Tweak section 2 (end)

    
    # Calculating P(Spam|w_1, w_2, ..., w_n) with P(Ham|w_1, w_2, ..., w_n).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    # Note: if `word` is not in the spam or in the non-spam DataFrames the loop does nothing by default.
    for word in message:
        
        if word in p_wi_given_spam_dict.keys():
            p_spam_given_message *= p_wi_given_spam_dict[word]
            
        if word in p_wi_given_ham_dict.keys():
            p_ham_given_message *= p_wi_given_ham_dict[word]
        

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
        
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
        
    else:
        print('Equal proabilities, have a human classify this!')


#-------------------------------------------------------------


def classify_test_set(message):
    """Takes in a string - a cellphone message (SMS), and returns a classification of whether 
    the message is spam, not spam (ham), or if a human is required to classify the message.
    """
    
    ### Tweak section 3 (start) - how a message is splitted into a list of words

    message = re.sub('[^A-Za-z0-9\s£€\$]', ' ', message) # still a string
    message = re.sub('£', ' £ ', message)
    message = re.sub('€', ' € ', message)
    message = re.sub('\$', ' $ ', message)
    message = replace(message)
    message = re.sub('\s+', ' ', message)
#     message = message.lower() # still a string
    message = message.split() # now a list of strings
    
    ### Tweak section 1 (end)

    
    # Calculating P(Spam|w_1, w_2, ..., w_n) with P(Ham|w_1, w_2, ..., w_n).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    # Note: if `word` is not in the spam or in the non-spam DataFrames the loop does nothing by default.
    for word in message:
        
        if word in p_wi_given_spam_dict.keys():
            p_spam_given_message *= p_wi_given_spam_dict[word]
            
        if word in p_wi_given_ham_dict.keys():
            p_ham_given_message *= p_wi_given_ham_dict[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'Requires human classification.'
    

testing_set['Test'] = testing_set['SMS'].apply(classify_test_set)

#-------------------------------------------------------------

# Assigns True if condition is met and multplying by one converts True in '1' and False in '0'.
testing_set['Correct'] = (testing_set['Label'] == testing_set['Test'])*1

test_accuracy = (testing_set['Correct'].sum() / testing_set.shape[0])*100

test_accuracy = test_accuracy.round(2)

print(f'When applied to the messages in the `training_set` ({testing_set.shape[0]} entries) the test accuracy was aprox. {test_accuracy}%.')

When applied to the messages in the `training_set` (1114 entries) the test accuracy was aprox. 98.74%.


In [5]:
# list_of_patterns_4 = []

# pattern_4 = '\\b\w*\d{1,3}p\w*\\b'

# for i, val in enumerate(ts_cleaned_SMS):
#     pences = re.findall(pattern_4, val)
#     if len(pences):
#         list_of_patterns_4.append(pences[0])
        


# list_of_patterns_4

In [6]:
def replace(string):
    
    new_string = ''
    
    for i, val in enumerate(list_of_patterns):
        if val in string:
            new_string += re.sub(val, list_of_replacements[i], string)
            break
            
    if len(new_string):
        return new_string
    else:
        return string
    

In [17]:
ts_cleaned_SMS[186]

'URGENT  This is the 2nd attempt to contact U U have WON  £ 1000CALL 09071512432 b4 300603t csBCM4235WC1N3XX callcost150ppmmobilesvary  max £ 7  50'

In [18]:
replace(ts_cleaned_SMS[186])

'URGENT  This is the 2nd attempt to contact U U have WON  £ 1000CALL 09071512432 b4 300603t csBCM4235WC1N3XX callcost 150p pmmobilesvary  max £ 7  50'

In [14]:
replace(ts_cleaned_SMS[2821])

'Someone has contacted our dating service and entered your phone because they fancy you  To find out who it is call from a landline 09111032124   PoBox12n146tf 150p '

In [16]:
ts_cleaned_SMS[ts_cleaned_SMS.str.contains('callcost150ppmmobilesvary')]

186    URGENT  This is the 2nd attempt to contact U U...
Name: SMS, dtype: object

In [10]:
list_of_patterns

['08700621170150p',
 '150pm',
 '150p',
 '10p',
 '150p',
 '150p',
 '50pmmorefrommobile2Bremoved',
 'MobStoreQuiz10ppm',
 'callcost150ppmmobilesvary',
 '450pw',
 '35p',
 'com1win150ppmx3age16',
 '1x150p',
 '10ppm',
 '000pes',
 '150ppm',
 '150p',
 '150ppm',
 '150p',
 '150p',
 '150p',
 '08712400602450p',
 '150p',
 '150p',
 '150ppm',
 '150p',
 '150ppmPOBox10183BhamB64XE',
 '8pm',
 '150p',
 '150p',
 '150ppmsg',
 '10p',
 '7pm',
 '50perWKsub',
 '50p',
 '25p',
 '150p',
 '150ppm',
 '60p',
 '20p',
 '150p',
 '150p',
 '25p',
 '150p',
 '150ppm',
 'gr8prizes',
 'com1win150ppmx3age16subscription',
 '150p',
 '150p',
 '10p',
 '2price',
 '50p',
 '2p',
 '18p',
 '150p',
 'norm150p',
 '150p',
 '150ppm',
 '100percent',
 '150p',
 '11pm',
 '150p',
 '150p',
 '60p',
 '08712400602450p',
 '150p',
 '50perWKsub',
 '150p',
 '5p',
 '150pm',
 'box245c2150pm',
 '100percent',
 '45pm',
 '50p',
 '25p',
 '150ppermessSubscription',
 '08701417012150p',
 '150p',
 '20p',
 '150p',
 '60p',
 '50p',
 '9pm',
 '7pm',
 '30pm',
 '150p'

In [11]:
# list_of_replacements = []
# pences_references = []

# for i, val_1 in enumerate(list_of_patterns_4):
#     digits = re.findall('\d{1,3}p', val_1)
#     pences_references.append(digits)
    
#     final_str = ''
    
#     for val_2 in digits:
#         sub = ' ' + val_2 + ' '
        
#         if final_str == '':
#             final_str = re.sub(val_2, sub, val_1, count=1)
            
#         else:
#             final_str = re.sub(val_2, sub, final_str, count=1)
    
#     list_of_replacements.append(final_str)

# list_of_replacements

In [12]:
# list_of_replacements = []
# pences_references = []

# for i, val_1 in enumerate(list_of_patterns_3):
#     if re.findall('([1-9]pm|1[0-2]pm)', val_1) == []:
#         pence_ref = val_1
#         digits = re.findall('\d{1,3}p', pence_ref)
#         pences_references.append(digits)

#     final_str = ''
    
#     for val_2 in digits:
        
#         sub = ' ' + val_2 + ' '
        
#         if final_str == '':
#             final_str = re.sub(val_2, sub, pence_ref, count=1)
            
#         else:
#             final_str = re.sub(val_2, sub, final_str, count=1)
    
#     list_of_replacements.append(final_str)

# list_of_replacements
# # pences_references

In [19]:
cond_incorrect = testing_set.Correct == 0

incorrect = testing_set[cond_incorrect].copy().reset_index(drop=True)

pd.options.display.max_colwidth = 500

incorrect[['Label', 'SMS']]

Unnamed: 0,Label,SMS
0,spam,Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like U did last time 01223585236 XX Luv Nikiyu4.net
1,ham,Unlimited texts. Limited minutes.
2,ham,26th OF JULY
3,ham,Nokia phone is lovly..
4,ham,"A Boy loved a gal. He propsd bt she didnt mind. He gv lv lttrs, Bt her frnds threw thm. Again d boy decided 2 aproach d gal , dt time a truck was speeding towards d gal. Wn it was about 2 hit d girl,d boy ran like hell n saved her. She asked 'hw cn u run so fast?' D boy replied ""Boost is d secret of my energy"" n instantly d girl shouted ""our energy"" n Thy lived happily 2gthr drinking boost evrydy Moral of d story:- I hv free msgs:D;): gud ni8"
5,ham,No calls..messages..missed calls
6,ham,"We have sent JD for Customer Service cum Accounts Executive to ur mail id, For details contact us"
7,ham,Just taste fish curry :-P
8,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50"
9,spam,"Hi babe its Chloe, how r u? I was smashed on saturday night, it was great! How was your weekend? U been missing me? SP visionsms.com Text stop to stop 150p/text"
