## Efficient Low Noise Naive Bayes

In [99]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from IPython.display import Image
from itertools import dropwhile
from sklearn import metrics
import pprint as pp
import random
import time
import sys
import os
try:
   import cPickle as cPickle
except:
   import pickle as cPickle

### Load data

In [100]:
data = pd.read_csv('dataset/spam.csv', encoding='latin-1')

In [101]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [137]:
# Drop some unwanted columns that do not provide insight
#data_modified = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data_modified = data.loc[:, ~data.columns.str.contains('^Unnamed')]

In [138]:
data_modified = data_modified.rename(columns={"v1":"Label", "v2":"Text"})
data_modified.loc[(data_modified.Label == "spam") , 'Label'] = "SPAM"
data_modified.loc[(data_modified.Label == "ham") , 'Label'] = "HAM"
# Show last 5 in dataset
data_modified.tail(5)

Unnamed: 0,Label,Text
5567,SPAM,This is the 2nd time we have tried 2 contact u...
5568,HAM,Will Ì_ b going to esplanade fr home?
5569,HAM,"Pity, * was in mood for that. So...any other s..."
5570,HAM,The guy did some bitching but I acted like i'd...
5571,HAM,Rofl. Its true to its name


In [139]:
data_modified.Label.value_counts()

HAM     4825
SPAM     747
Name: Label, dtype: int64

### Data Preparation

Training Data

In [1]:
# first 4500 reviews
training_data = data_modified[4500:4972].reset_index(drop=True)
training_data_length = training_data.shape[0]
training_data.head()

NameError: name 'data_modified' is not defined

In [141]:
training_data.Label.value_counts()

HAM     3893
SPAM     607
Name: Label, dtype: int64

Shape of training data

In [142]:
pp.pprint(training_data.shape)
pp.pprint(training_data.Label.shape)

(4500, 2)
(4500,)


Validation Data

In [143]:
# middle 972 reviews
validation_data = data_modified[4500:4972].reset_index(drop=True)
validation_data_length = validation_data.shape[0]
validation_data.head()

Unnamed: 0,Label,Text
0,HAM,So wat's da decision?
1,HAM,Wot is u up 2 then bitch?
2,HAM,Stupid.its not possible
3,HAM,She told to hr that he want posting in chennai...
4,SPAM,Mobile Club: Choose any of the top quality ite...


In [144]:
validation_data.Label.value_counts()

HAM     408
SPAM     64
Name: Label, dtype: int64

Shape of validation data

In [145]:
pp.pprint(validation_data.shape)
pp.pprint(validation_data.Label.shape)

(472, 2)
(472,)


Testing data

In [146]:
# last 600 reviews
test_data = data_modified[4972:5572].reset_index(drop=True)
test_data_length = test_data.shape[0]
test_data.head()

Unnamed: 0,Label,Text
0,HAM,Oops I was in the shower when u called. Hey a ...
1,HAM,Aiyo u so poor thing... Then u dun wan 2 eat? ...
2,HAM,Yar... I tot u knew dis would happen long ago ...
3,HAM,You are gorgeous! keep those pix cumming :) th...
4,HAM,A boy was late 2 home. His father: \POWER OF F...


In [147]:
test_data.Label.value_counts()

HAM     524
SPAM     76
Name: Label, dtype: int64

Shape of Testing data

In [148]:
pp.pprint(test_data.shape)
pp.pprint(test_data.Label.shape)

(600, 2)
(600,)


### Building the Naive Bayes Classifier

### Calculate the Priors

These are simply the propbability of text having "POSITIVE" or "NEGATIVE" sentiment

In [149]:
# total positive reviews in training set
r_spam = training_data.Label[training_data.Label == "SPAM"].count()
# total negative reviews in training set
r_ham = training_data.Label[training_data.Label == "HAM"].count()
# total reviews in training set
r_total = training_data.Label.count()

In [150]:
r_spam

607

In [151]:
r_ham

3893

In [152]:
r_total

4500

In [153]:
# Drop some unwanted columns that do not provide insight
training_data.head()

Unnamed: 0,Label,Text
0,HAM,"Go until jurong point, crazy.. Available only ..."
1,HAM,Ok lar... Joking wif u oni...
2,SPAM,Free entry in 2 a wkly comp to win FA Cup fina...
3,HAM,U dun say so early hor... U c already then say...
4,HAM,"Nah I don't think he goes to usf, he lives aro..."


In [154]:
# r_positive/r_total
r_spam = r_spam/r_total
# r_negative/r_total
r_ham = r_ham/r_total

### Calculate the Likelihood

In [157]:
training_array = training_data.values
test_array = test_data.values
validation_array = validation_data.values
test_array[4]

array(['HAM', 'A boy was late 2 home. His father: \\POWER OF FRNDSHIP\\""'], dtype=object)

In [158]:
np.unique(training_array[1][1])
training_array[:,1]

array([ 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ...,
       'Latest Nokia Mobile or iPOD MP3 Player +å£400 proze GUARANTEED! Reply with: WIN to 83355 now! Norcorp Ltd.å£1,50/Mtmsgrcvd18+',
       'SMS SERVICES. for your inclusive text credits, pls goto www.comuk.net login= 3qxj9 unsubscribe with STOP, no extra charge. help 08702840625.COMUK. 220-CM2 9AE',
       'Nvm take ur time.'], dtype=object)

In [203]:
class BinaryNBClassifier(object):
    def __init__(self):
        self.polar_cutoff = 0.2
        self.min_count = 5
        self.max_count = 50000
        # size of {POSITIVE: count, MEGATIVE: count} reviews
        self.label_instance_counts = Counter()
        # dict for pos/neg worrd counts
        self.feature_counts = {}
        # dict for word probabilities for each label
        
    def pre_process_data(self, training_XY):
        # training data size
        self.training_size = training_XY.shape[0]
        # size of NEGATIVE reviews
        positive_word_counts = Counter()
        # frequency of words in positive reviews
        positive_word_counts = Counter()
        # frequency of words in negative reviews
        negative_word_counts = Counter()
        # frequency of words in all reviews
        total_word_counts = Counter()
        
        # get the counts of words in each sentiment
        for i in range(self.training_size ):
            if(training_XY[i][0] == 'SPAM'):
                # unique, counts = np.unique(a, return_counts=True)
                self.label_instance_counts[training_XY[i][0]] += 1
                for word in training_XY[i][1].split(" "):
                    positive_word_counts[word] += 1
                    total_word_counts[word] += 1
            if(training_XY[i][0] == 'HAM'):
                self.label_instance_counts[training_XY[i][0]] += 1
                for word in training_XY[i][1].split(" "):
                    negative_word_counts[word] += 1
                    total_word_counts[word] += 1
        
        # eliminate noise words
        negative_word_counts = {key:value for key, value in negative_word_counts.items() if value > self.min_count and value < self.max_count}
        positive_word_counts = {key:value for key, value in positive_word_counts.items() if value > self.min_count and value < self.max_count}
       
        self.feature_counts["SPAM"] = positive_word_counts
        self.feature_counts["HAM"] = negative_word_counts
        #print(self.feature_counts["NEGATIVE"])
        

    def predict_probabilities(self, text):
        prediction_results = {}
        text_counts = Counter(text.split(" "))
        for label in self.feature_prior.keys():
            likelihood_prob = 1
            for word in text_counts:
                # For every word in the text, we get the number of times that word occured in the reviews for a given class
                occurence_count = text_counts.get(word)
                # avoid missing word occurances
                if word in self.feature_prior[label].keys():
                    likelihood_prob *=  occurence_count * self.feature_prior[label][word]
            # calculate probalities
            # Now we multiply by the probability of the class existing in the documents.
            prediction_results[label] = likelihood_prob * self.class_prior[label]
        return prediction_results
    
    def test(self, test_data_XY):
        df = pd.DataFrame(columns=['PREDICTION', 'LABEL'])
        verdict = ""
        for i in range(test_data_XY.shape[0]):
            predictions_dict = self.predict_probabilities(test_data_XY[i][1])
            if(predictions_dict["SPAM"] > predictions_dict["HAM"]):
                verdict = "SPAM"
            elif(predictions_dict["HAM"] > predictions_dict["SPAM"]):
                verdict = "HAM"
            else:
                verdict = "NEUTRAL"
            df = df.append({"PREDICTION": verdict, "LABEL":  test_data_XY[i][0]}, ignore_index=True)
        return df
       

    def fit(self, training_XY):
        # pre-process data
        self.pre_process_data(training_XY)
        # class prior
        self.class_prior = {key: self.label_instance_counts[key] / self.training_size for key in self.label_instance_counts.keys()}
        # feature prior
        self.feature_prior = {label: {word_key: self.feature_counts[label][word_key] / len(self.feature_counts[label]) for word_key in self.feature_counts[label].keys()} for label in self.feature_counts.keys()}
        # print probabilities for class and feature
        #print("Class Prior: {} \nFeature Prior: {}".format(np.array(self.class_prior), np.array(self.feature_prior)))

In [204]:
nBayes = BinaryNBClassifier()

In [205]:
nBayes.fit(training_array)

In [206]:
nBayes.predict_probabilities("Awesome, wonderfully, excellent delivered enjoyable product")

{'HAM': 0.005054906325357077, 'SPAM': 0.1348888888888889}

In [207]:
nBayes.predict_probabilities("Very poor quality. Product arrived while damaged with dents.I regret buying it")

{'HAM': 0.0004371256100457264, 'SPAM': 0.0009661008724682408}

In [208]:
data_test = nBayes.test(test_array)

In [209]:
data_correct = data_test.loc[(data_test.PREDICTION == data_test.LABEL) ]

In [210]:
test_accuracy = data_correct.LABEL.value_counts()[0] *100 / data_test.LABEL.value_counts()[0]
test_accuracy

13.931297709923664

In [211]:
data_correct.LABEL.value_counts()

HAM     73
SPAM    18
Name: LABEL, dtype: int64

In [176]:
data_test.LABEL.value_counts()

HAM     524
SPAM     76
Name: LABEL, dtype: int64