## Import all necessary libraries

In [14]:
import numpy as np
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
dataframe = pd.read_csv('data.csv')
dataframe.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Function to clean the message

In [2]:
def process_message(dataframe = None, message = None):
    if dataframe is not None:
        dataframe['Processed'] = None
        for i in range(len(dataframe['Message'].values)):
            # lowering the texts
            temp_text = dataframe['Message'][i].lower()

            # removing punctuations 
            for letter in temp_text:
                if letter in punctuation:
                    temp_text = temp_text.replace(letter,'')

            # stop words
            for word in temp_text.split():
                if word in stopwords.words('english'):
                    temp_text = temp_text.replace(word,'')
                    temp_text = ' '.join(temp_text.split())

            # stemming
            stemmer = SnowballStemmer('english')
            result = ''
            for word in temp_text.split():
                result += stemmer.stem(word)
                result += ' '

            dataframe['Processed'][i] = result.split()
        return dataframe
    
    else:
        temp_text = message.lower()

        # removing punctuations
        for letter in temp_text:
            if letter in punctuation:
                temp_text = temp_text.replace(letter,'')

        # stop words
        for word in temp_text.split():
            if word in stopwords.words('english'):
                temp_text = temp_text.replace(word,'')
                temp_text = ' '.join(temp_text.split())

        # stemming
        stemmer = SnowballStemmer('english')
        result = ''
        for word in temp_text.split():
            result += stemmer.stem(word)
            result += ' '

        
        return result.split()


dataframe = process_message(dataframe = dataframe)

In [3]:
dataframe.shape

(5572, 3)

In [4]:
dataframe.head()

Unnamed: 0,Category,Message,Processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, pot, crazi, avail, bugi, n, great..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, 2, wkli, comp, w, f, cup, fl, tk..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, thnk, goe, usf, lves, around, re, ..."


## Number of Spam and Ham messages

In [5]:
print('Number of Spam: ' + str(dataframe[dataframe['Category']=='spam'].shape[0]))
print('Number of Ham: ' + str(dataframe[dataframe['Category']=='ham'].shape[0]))

Number of Spam: 747
Number of Ham: 4825


## Creating freqs dictionary that contains the (word,label) as key and its frequency as value

In [6]:
freqs = {}
for i in range(dataframe.shape[0]):
    if dataframe['Category'][i] == 'spam':
        for word in dataframe['Processed'][i]:
            if (word,0) in freqs.keys():
                freqs[(word,0)] += 1
            else:
                freqs[(word,0)] = 1
    else:
        for word in dataframe['Processed'][i]:
            if (word,1) in freqs.keys():
                freqs[(word,1)] += 1
            else:
                freqs[(word,1)] = 1

## Splitting datasets

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataframe['Message'], dataframe['Category'],random_state=42, test_size=0.1)

## Function to train the model

In [8]:
def train_naive_bayes(X_train, y_train, freqs):

    log_prior = 0
    log_likelihood = {}
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_spam = N_ham = 0
    for pair in freqs.keys():
        # if spam
        if pair[1] == 0:
            N_spam += freqs[pair]

        else:
            N_ham += freqs[pair]

    D = len(y_train)
    D_ham = len(y_train[y_train=='ham'])
    D_spam = len(y_train[y_train=='spam'])

    log_prior = np.log(D_ham/D) - np.log(D_spam/D)

    for word in vocab:
        freqs_ham = freqs.get((word,1),0)
        freqs_spam = freqs.get((word,0),0)

        p_w_ham = (freqs_ham + 1) / (N_ham + V)
        p_w_spam = (freqs_spam + 1) / (N_spam + V)

        log_likelihood[word] = np.log(p_w_ham/p_w_spam)
    
    return log_prior , log_likelihood



In [9]:
log_prior, log_likelihood = train_naive_bayes(X_train,y_train, freqs)
print(log_prior)
# print(log_likelihood)

1.8623995161821434


## Function to predict

In [10]:
def predict_naive_bayes(message, log_prior, log_likelihood):
    
    tokens = process_message(message=message)
    p=0

    # log_prior
    p += log_prior

    for word in tokens:
        p += log_likelihood.get(word,0)
    
    return p


In [11]:
p = predict_naive_bayes('I would like to give you a free vacation trip', log_prior, log_likelihood)
print(p)

8.145239039883805


## Function to test the model

In [12]:
def test_naive_bayes(X_test, y_test, log_prior, log_likelihood):
    accuracy = 0
    y_hat = []
    y = []
    for i in y_test.values:
        if i == 'ham':
            y.append(1)
        else:
            y.append(0)
    for doc in X_test:
        if predict_naive_bayes(doc,log_prior= log_prior, log_likelihood= log_likelihood) >0:
            # not a spam (ham)
            y_hat.append(1)
        else:
            # spam
            y_hat.append(0)
    
    error = np.mean(np.abs(np.array(y_hat)-np.array(y)))

    return 1-error

## Checking the accuracy

In [13]:
print('The accuracy of the model is: ' + str(test_naive_bayes(X_test,y_test,log_prior,log_likelihood)))

The accuracy of the model is: 0.989247311827957
