## Building a Spam/Ham SMS/Email detector from scratch using Naive Bayes Classifier
### Author : Qasim Wani

In [107]:
import pandas as pd
import numpy as np
import sklearn.model_selection as skl
from collections import Counter

In [108]:
df = pd.read_csv('spam-ham', sep="\t")
X = np.array(df.iloc[:,1])
y = np.array(df.iloc[:,0])
X_train, X_test, y_train, y_test = skl.train_test_split(X, y, test_size=0.33, random_state=42)

In [110]:
df.head(6)

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [3]:
spam_np = []
ham_np = []

In [4]:
for i in range(len(X_train)):
    X_train[i] = X_train[i].replace(".", " ").replace("!"," ").replace(","," ").replace("?"," ")
    one_mail = X_train[i].split(" ")[:]
    if(y_train[i] == 'ham'):
        ham_np.append(one_mail)
    else:
        spam_np.append(one_mail)


In [5]:
def classfiy_words_counter(data):
    word_count = {}
    fanboys = "for|an|and|nor|but|or|yet|so|is|"
    for word in data:
        for cc in word:
            cc = cc.lower()
            if fanboys.find(cc) == -1:
                if(len(cc) > 1):
                    if cc not in word_count:
                        word_count[cc] = 1
                    else:
                        word_count[cc] += 1
                
    spam_most_common = Counter.most_common(word_count)[:2500]
    return spam_most_common

In [6]:
spam_classifier_word_counter = classfiy_words_counter(spam_np)

ham_classifier_word_counter = classfiy_words_counter(ham_np)

In [7]:
def occurrence_calc(words, size):
    """Calculates the occurance of top 20 most common words in all emails for it's category i.e. spam/ham. 
        Returns a new list with the word, frequency, and occurance as a fraction
        
        Takes in two parameters: 
        1. words : a list of tuples consisting of most frequent words and their respective frequencies
        2. size  : number of emails in given class (spam/ham for example)
    """
    i = 0
    new_list = []
    for i in range(len(words)):
        num = float(words[i][-1])
        x = float(num/size)
        a = list(words[i])
        y = float(x)
        a.append(y)
        new_list.append(a)
    return new_list

In [8]:
spam_TDM = np.array(occurrence_calc(words=spam_classifier_word_counter, size=len(spam_np)))
ham_TDM  = np.array(occurrence_calc(words=ham_classifier_word_counter, size=len(ham_np)))

In [9]:
spam_dataframe_TDM = pd.DataFrame(data=spam_TDM, columns=['Spam Email Terms','Frequency','Occurrence'])
ham_dataframe_TDM = pd.DataFrame(data=ham_TDM, columns=['Ham Email Terms','Frequency','Occurrence'])

In [10]:
ham_dataframe_TDM.head(6)

Unnamed: 0,Ham Email Terms,Frequency,Occurrence
0,you,1241,0.3839727722772277
1,to,1071,0.3313737623762376
2,the,779,0.2410272277227722
3,in,551,0.1704826732673267
4,me,531,0.1642945544554455
5,my,505,0.15625


In [11]:
spam_dataframe_TDM.head(6)

Unnamed: 0,Spam Email Terms,Frequency,Occurrence
0,to,459,0.9161676646706588
1,call,236,0.4710578842315369
2,you,190,0.3792415169660678
3,your,180,0.3592814371257485
4,free,161,0.3213572854291417
5,the,135,0.2694610778443114



# Naive Bayes Classifier Formula

![alt text](https://blog.easysol.net/wp-content/uploads/2017/12/Image-1-1-600x169.png)

### Let's understand what the above formula means in detail
> Here, P(A|B) is the posterior probability, i.e. the objective. 
In our case, P(A|B) is P(spam|email)
P(B|A) is the likelihood, i.e. P(email | spam)
P(A) referes to the prior probability, i.e. P(spam)
P(B) referes to the marginal probability, i.e. P(email)

## Note about calculating likelihood probability:
> In order to calculate P(B | A), we need to use the product operator, Π
> Here's an example of how it works
![alt text](https://math.illinoisstate.edu/day/courses/old/305/contentsum07.gif)

## Sample Naive Bayes Classifier
![alt text](https://alexn.org/assets/img/spam-multiple-bayes-naive.png)

In [12]:
def calculate_individual_likelihood(word_check, text_document_matrix):
    """
    Calculates the likelihood of a word in a dataset of spam/ham emails
    
    Parameters:
    1. word_check : The word to calculate the probability for. Data type = any
    2. text_document_matrix : Dataset to calculate the proability against. Data type = nx3 np.array() object
    
    Returns the likelihood as a probability (0 - 1)
    """
    rows = text_document_matrix.shape[0]
    
    occurrence = 0
    
    for i in range(rows):
        word = text_document_matrix[i, 0]
        if(word == word_check):
            occurrence = float(text_document_matrix[i, -1])
    return occurrence

In [13]:
def calculate_marginal(word_check, spam_set, ham_set, data_size):
    """
    Calculates the marginal probability of a word in a dataset
    Four (4) Required Parameters.
    Parameters:
    1. word_check : The word to calculate marginal probability for. Data type = ANY
    2. spam_set   : Spam dataset to calculate the probability against. Data Type = nx3 np.array() object
    3. ham_set    : Ham dataset to calculate the probability against. Data Type = nx3 np.array() object
    4. data_size  : The size of the entire dataset (spam and ham included). Data Type = INT
    """
    rows = spam_set.shape[0]
    
    frequency = 0
    
    for (spam,ham) in zip(spam_set, ham_set):
        if(spam[0] == word_check):
            frequency += int(spam[1])
        if(ham[0] == word_check):
            frequency += int(ham[1])
    marginal = frequency/float(data_size)
    return float(marginal)

In [14]:
def calculate_posterior(likelihood, prior, margin):
    """
    Calculates the Posterior probability of an email being spam or ham.
    Returns the posterior probability (0 - 1)
    Accepts Three (3) required parameters.
    Parameters:
    1. likelihood : Probability of a list of words given it's spam/ham. Data Type = float
    2. prior      : Probability of spam/ham. Data Type = float
    3. margin     : Probability of list of words. Data Type = float
    """
    numerator = likelihood*prior
    posterior = float(numerator/margin)
    return posterior

In [15]:
def one_point_naive_bayes(email, TDM_spam, TDM_ham, spam_size, ham_size):
    """
    Calculates whether an email is spam or ham.
    Takes in five (5) required parameters : 
    1. email      : email to classify as spam/ham. Takes in the email as a list of words
    2. TDM_spam   : Term Document Matrix for spam emails. Takes in an n x 3 list.
    3. TDM_ham    : Term Document Matrix for ham emails. Takes in an n x 3 list.
    4. spam_size  : The number of all spam emails in the dataset.
    5. ham_size   : The number of all ham emails in the dataset.
    
    returns 1 if spam, 0 if ham
    """
    total_set = int(ham_size) + int(spam_size)
    
#------Calculating prior spam and prior ham-----
    prior_spam = float(spam_size/total_set)
    prior_ham = float(ham_size/total_set)
# ----------------------------------------------
    
    
    likelihood_spam = 1
    likelihood_ham  = 1
    
    margin_ham = 1
    margin_spam = 1
    
    frequency_spam = 1
    frequency_ham  = 1
    
    for word in email:
        if(word in TDM_ham[:,0]):
            likelihood_ham *= calculate_individual_likelihood(word, TDM_ham)
            margin_ham *= calculate_marginal(word, TDM_spam, TDM_ham, total_set)
            
        if(word in TDM_spam[:,0]):
            likelihood_spam *= calculate_individual_likelihood(word, TDM_spam)
            margin_spam *= calculate_marginal(word, TDM_spam, TDM_ham, total_set)
        
#     print(margin_ham, margin_spam)
#     print("Prior")
#     print(prior_ham, prior_spam)
    posterior_spam = calculate_posterior(likelihood=likelihood_spam, prior=prior_spam, margin=margin_spam)
    posterior_ham  = calculate_posterior(likelihood=likelihood_ham, prior=prior_ham, margin=margin_ham)
#     print(posterior_ham, posterior_spam)
    if(posterior_ham >= posterior_spam):
        return posterior_ham, posterior_spam, 0
    else:
        return posterior_ham, posterior_spam, 1

# Validiating our Model

### Checking the performance of our Training Set

In [22]:
def model_validation(spam_size, ham_size, data, __type="Training"):
    """
    Validates the Naive Bayes model.
    Doesn't return anything. Prints some statistics.
    
    Parameters :
    1. spam_np : Number of spam SMS
    2. ham_np  : Number of ham SMS
    3. data    : The dataset to validate
    4. __type  : 'Training' -or- 'Testing'. By default, __type = "Training"
    """
    result = []
    for em_one in data:
        check_one_email = em_one.split(" ")
        _,_,classification = one_point_naive_bayes(check_one_email, spam_TDM, ham_TDM, spam_size, ham_size)
        result.append(classification)
    sp = 0
    ha = 0
    for cls in result:
        if(cls == 0):
            ha += 1
        else:
            sp += 1
    print("Number of Model Detected Ham SMS : {0}\nNumber of Model Detected Spam SMS : {1}\n".format(ha, sp))
    print("Number of Actual Ham SMS : {0}\nNumber of Actual Spam SMS : {1}\n".format(ham_size, spam_size))
    training_percent = float(sp/spam_size)*100
    if(__type == "Training"):
        training_percent = float(ha/ham_size)*100
        
    print("{0} Set Classification :\n{1}% of all SMS were classified correctly.\n".format(__type, training_percent))

In [17]:
## Training Set Performance
model_validation(len(spam_np), len(ham_np), X_train)

Number of Model Detected Ham SMS : 2883
Number of Model Detected Spam SMS : 850

Number of Actual Ham SMS : 3232
Number of Actual Spam SMS : 501

Training Set Classification :
89.20173267326733 of all SMS were classified correctly.



### Checking the performance of our Testing Set

In [28]:
def testing_validation_hyper_params():
    """
    This function is used to calculate recall score and precision used in validating the testing model only.
    
    Returns the size of the Ham and Spam SMS testing set.
    """
    test_spam = []
    test_ham = []
    for i in range(len(X_test)):
        one_mail = X_test[i].split(" ")[:]
        if(y_test[i] == 'ham'):
            test_ham.append(one_mail)
        else:
            test_spam.append(one_mail)
    return len(test_ham), len(test_spam)

In [31]:
# Testing Set Performance
test_spam, test_ham = testing_validation_hyper_params()
model_validation(test_spam, test_ham, X_test, "Testing")

Number of Model Detected Ham SMS : 354
Number of Model Detected Spam SMS : 1485

Number of Actual Ham SMS : 246
Number of Actual Spam SMS : 1593

Testing Set Classification :
93.22033898305084% of all SMS were classified correctly.



# Reflection:
#####  As you can see, my training accuracy was nearly 89.2%
#####  On the other hand, my testing accuracy was a whopping 93.2%

> ####  Note : SciKit Learn's NaiveBayesClassifier GausianNB() had a recall score of 98%. Pretty close to my model.

## Validating an actual email from my gmail spam folder!

![alt text](https://pbs.twimg.com/media/EAzk1DfVAAA2vk0?format=jpg&name=4096x4096)

In [54]:
def outside_source_checker(data):
    '''
    Classifies an individual email/SMS as spam (1) or ham (0).
    Returns void.
    
    Required Parameters:
    1. data : email / SMS to classify
    '''
    # Classifying the email as Spam (1) or Ham (0)
    posterior_ham, posterior_spam, res = one_point_naive_bayes(data, spam_TDM, ham_TDM, len(spam_np), len(ham_np))
    if(res == 0):
        print("""Ham Email/SMS\nResult Value : {2}\nConfidence Level of Ham : {0}\nConfidence Level of Spam : {1}\n
        Note : These confidence scores are not percentages since they haven't been normalized"""
              .format(posterior_ham, posterior_spam, res))
    else:
        print("""Spam Email/SMS detected\nResult Value: {2}\nConfidence Level of Ham : {0}\nConfidence Level of Spam : {1}\n
        Note : These confidence scores are not percentages since they haven't been normalized"""
              .format(posterior_ham, posterior_spam, res))

In [55]:
# Let's convert the email into a list of words

email ="""
Good day

We are writing you in the matter of your application on a Careers jobboard for vacancy of Supply Chain Agent

Job region: US, all states
Job Type: Full-time, Permanent
Pay range: $96,200.00 - $118,500.00 / Per Year

Major Responsibilities:
- Implement and prepare the typical project planning and monitoring process for all projects
- Coordinate ongoing relationships with all suppliers in collaboration with the marketing department
- Direct team work to meet the customer and project scope requirements
- Develop comprehensive project schedule with regional team members
- Produce regular reports and statistics on a weekly basis
- Participate in various projects and make recommendations to meet market demands

Key skills and qualifications:
- Demonstrated staff role and management skills
- Have an ability to do customer service
- Be detail-oriented, experienced and resourceful
- Working knowledge of Microsoft Office
- Legal driver's license and Driver experience
""".replace("."," ").replace("!"," ").replace("/"," ").split(" ")

In [56]:
outside_source_checker(email)

Spam Email/SMS detected
Result Value: 1
Confidence Level of Ham : 0.013080060322194078
Confidence Level of Spam : 21390633.41930753

        Note : These confidence scores are not percentages since they haven't been normalized


# Validating a Whatsapp message

![alt text](https://pbs.twimg.com/media/EA0BAMnW4AA8vhc?format=jpg&name=medium)

In [105]:
text_message = """
Don't think I mentioned this.
I'll be sending out the site to my list of beta users once we finish the PWA.
We want to be able to get feedback at that stage too before our VT launch.
""".replace("."," ").split(" ")

In [106]:
outside_source_checker(text_message)

Ham Email/SMS
Result Value : 0
Confidence Level of Ham : 0.5024047380295416
Confidence Level of Spam : 0.018384557032790103

        Note : These confidence scores are not percentages since they haven't been normalized
