## Efficient Low Noise Naive Bayes

In [220]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from IPython.display import Image
from itertools import dropwhile
from sklearn import metrics
from nltk.corpus import stopwords
import pprint as pp
import random
import time
import sys
import os
try:
   import cPickle as cPickle
except:
   import pickle as cPickle

### Load data

In [221]:
data = pd.read_csv('dataset/spam.csv', encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Drop some unwanted columns that do not provide insight
#data_modified = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data_modified = data.loc[:, ~data.columns.str.contains('^Unnamed')]

In [5]:
data_modified = data_modified.rename(columns={"v1":"Label", "v2":"Text"})
data_modified.loc[(data_modified.Label == "spam") , 'Label'] = "SPAM"
data_modified.loc[(data_modified.Label == "ham") , 'Label'] = "HAM"
# Show last 5 in dataset
data_modified.tail(5)

Unnamed: 0,Label,Text
5567,SPAM,This is the 2nd time we have tried 2 contact u...
5568,HAM,Will Ì_ b going to esplanade fr home?
5569,HAM,"Pity, * was in mood for that. So...any other s..."
5570,HAM,The guy did some bitching but I acted like i'd...
5571,HAM,Rofl. Its true to its name


In [6]:
data_modified.Label.value_counts()

HAM     4825
SPAM     747
Name: Label, dtype: int64

### Data Preparation

Training Data

In [7]:
# first 4500 reviews
training_data = data_modified[4500:4972].reset_index(drop=True)
training_data_length = training_data.shape[0]
training_data.head()

Unnamed: 0,Label,Text
0,HAM,So wat's da decision?
1,HAM,Wot is u up 2 then bitch?
2,HAM,Stupid.its not possible
3,HAM,She told to hr that he want posting in chennai...
4,SPAM,Mobile Club: Choose any of the top quality ite...


In [8]:
training_data.Label.value_counts()

HAM     408
SPAM     64
Name: Label, dtype: int64

Shape of training data

In [9]:
pp.pprint(training_data.shape)
pp.pprint(training_data.Label.shape)

(472, 2)
(472,)


Validation Data

In [10]:
# middle 972 reviews
validation_data = data_modified[4500:4972].reset_index(drop=True)
validation_data_length = validation_data.shape[0]
validation_data.head()

Unnamed: 0,Label,Text
0,HAM,So wat's da decision?
1,HAM,Wot is u up 2 then bitch?
2,HAM,Stupid.its not possible
3,HAM,She told to hr that he want posting in chennai...
4,SPAM,Mobile Club: Choose any of the top quality ite...


In [11]:
validation_data.Label.value_counts()

HAM     408
SPAM     64
Name: Label, dtype: int64

Shape of validation data

In [12]:
pp.pprint(validation_data.shape)
pp.pprint(validation_data.Label.shape)

(472, 2)
(472,)


Testing data

In [13]:
# last 600 reviews
test_data = data_modified[4972:5572].reset_index(drop=True)
test_data_length = test_data.shape[0]
test_data.head()

Unnamed: 0,Label,Text
0,HAM,Oops I was in the shower when u called. Hey a ...
1,HAM,Aiyo u so poor thing... Then u dun wan 2 eat? ...
2,HAM,Yar... I tot u knew dis would happen long ago ...
3,HAM,You are gorgeous! keep those pix cumming :) th...
4,HAM,A boy was late 2 home. His father: \POWER OF F...


In [14]:
test_data.Label.value_counts()

HAM     524
SPAM     76
Name: Label, dtype: int64

Shape of Testing data

In [15]:
pp.pprint(test_data.shape)
pp.pprint(test_data.Label.shape)

(600, 2)
(600,)


### Building the Naive Bayes Classifier

### Calculate the Priors

These are simply the propbability of text having "POSITIVE" or "NEGATIVE" sentiment

In [16]:
# total positive reviews in training set
r_spam = training_data.Label[training_data.Label == "SPAM"].count()
# total negative reviews in training set
r_ham = training_data.Label[training_data.Label == "HAM"].count()
# total reviews in training set
r_total = training_data.Label.count()

In [17]:
r_spam

64

In [18]:
r_ham

408

In [19]:
r_total

472

In [20]:
# Drop some unwanted columns that do not provide insight
training_data.head()

Unnamed: 0,Label,Text
0,HAM,So wat's da decision?
1,HAM,Wot is u up 2 then bitch?
2,HAM,Stupid.its not possible
3,HAM,She told to hr that he want posting in chennai...
4,SPAM,Mobile Club: Choose any of the top quality ite...


In [21]:
# r_positive/r_total
r_spam = r_spam/r_total
# r_negative/r_total
r_ham = r_ham/r_total

### Calculate the Likelihood

In [258]:
clean_data = pd.DataFrame(columns=['Label', 'Text'])
for i in range(training_data.shape[0]):
    text = ' '.join([word for word in training_data.Text[i].split(" ") if word not in stopwords.words("english")])
    sys.stdout.write("\rProgress:" + str(100 * i/float(training_data.shape[0]))[:4] + " %")
    clean_data.loc[i] = {'Label':training_data.Label[i], 'Text': text}

Progress:99.7 %

In [259]:
training_data = clean_data
training_array = training_data.values
test_array = test_data.values
validation_array = validation_data.values
test_array[4]

array(['HAM', 'A boy was late 2 home. His father: \\POWER OF FRNDSHIP\\""'], dtype=object)

In [261]:
class BinaryNBClassifier(object):
    def __init__(self):
        self.polar_cutoff = 0.0
        self.min_count = 2
        self.max_count = 15
        # size of {POSITIVE: count, MEGATIVE: count} reviews
        self.label_instance_counts = Counter()
        # dict for pos/neg worrd counts
        self.feature_counts = {}
        # dict for word probabilities for each label
        
    def pre_process_data(self, training_XY):
        # training data size
        self.training_size = training_XY.shape[0]
        # size of NEGATIVE reviews
        positive_word_counts = Counter()
        # frequency of words in positive reviews
        positive_word_counts = Counter()
        # frequency of words in negative reviews
        negative_word_counts = Counter()
        # frequency of words in all reviews
        total_word_counts = Counter()
        
        # get the counts of words in each sentiment
        for i in range(self.training_size ):
            text = ' '.join([word for word in training_XY[i][1].split(" ") if word not in stopwords.words("english")])
            if(training_XY[i][0] == 'SPAM'):
                # unique, counts = np.unique(a, return_counts=True)
                self.label_instance_counts[training_XY[i][0]] += 1
                for word in text.split(" "):
                    positive_word_counts[word] += 1
                    total_word_counts[word] += 1
            if(training_XY[i][0] == 'HAM'):
                self.label_instance_counts[training_XY[i][0]] += 1
                for word in text.split(" "):
                    negative_word_counts[word] += 1
                    total_word_counts[word] += 1
        
        # eliminate noise words
        negative_word_counts = {key:value for key, value in negative_word_counts.items() if value > self.min_count and value < self.max_count}
        positive_word_counts = {key:value for key, value in positive_word_counts.items() if value > self.min_count and value < self.max_count}
       
        self.feature_counts["SPAM"] = positive_word_counts
        self.feature_counts["HAM"] = negative_word_counts
        #print(self.feature_counts["SPAM"])
        

    def predict_probabilities(self, text):
        prediction_results = {}
        text_counts = Counter(text.split(" "))
        for label in self.feature_prior.keys():
            likelihood_prob = 1
            for word in text_counts:
                # For every word in the text, we get the number of times that word occured in the reviews for a given class
                occurence_count = text_counts.get(word)
                # avoid missing word occurances
                if word in self.feature_prior[label].keys():
                    likelihood_prob *=  occurence_count * self.feature_prior[label][word]
                else:
                    likelihood_prob *=  1
            # calculate probalities
            # Now we multiply by the probability of the class existing in the documents.
            prediction_results[label] = np.abs(likelihood_prob * self.class_prior[label])
        return prediction_results
    
    def test(self, test_data_XY):
        df = pd.DataFrame(columns=['PREDICTION', 'LABEL'])
        verdict = ""
        for i in range(test_data_XY.shape[0]):
            predictions_dict = self.predict_probabilities(test_data_XY[i][1])
            if(predictions_dict["SPAM"] > predictions_dict["HAM"]):
                verdict = "SPAM"
            elif(predictions_dict["HAM"] > predictions_dict["SPAM"]):
                verdict = "HAM"
            else:
                verdict = "NEUTRAL"
            df = df.append({"PREDICTION": verdict, "LABEL":  test_data_XY[i][0]}, ignore_index=True)
        return df
       

    def fit(self, training_XY):
        # pre-process data
        self.pre_process_data(training_XY)
        # class prior
        self.class_prior = {key: self.label_instance_counts[key] / self.training_size for key in self.label_instance_counts.keys()}
        # feature prior
        self.feature_prior = {label: {word_key: self.feature_counts[label][word_key] / len(self.feature_counts[label]) for word_key in self.feature_counts[label].keys()} for label in self.feature_counts.keys()}
        # print probabilities for class and feature
        #print("Class Prior: {} \nFeature Prior: {}".format(np.array(self.class_prior), np.array(self.feature_prior)))

In [262]:
nBayes = BinaryNBClassifier()

In [263]:
nBayes.fit(training_array)

In [264]:
nBayes.predict_probabilities("Although i told u dat i'm into baig face watches now but i really like e watch u gave cos it's fr u. Thanx 4 everything dat u've done today, i'm touched")

{'HAM': 4.9083984405191211e-12, 'SPAM': 0.0014856817422094566}

In [265]:
nBayes.predict_probabilities("Thanks for your subscription to Ringtone UK your mobile will be charged å£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged")

{'HAM': 0.034438517117968803, 'SPAM': 0.014658726523133303}

In [266]:
nBayes.predict_probabilities("SIX chances to win CASH! From 100 to 20,000 pounds txt> ")

{'HAM': 0.86440677966101698, 'SPAM': 0.0054970224461749892}

In [241]:
data_test = nBayes.test(test_array)

In [242]:
data_correct = data_test.loc[(data_test.PREDICTION == data_test.LABEL) ]

In [243]:
test_accuracy = data_correct.LABEL.value_counts()[1] *100 / data_test.LABEL.value_counts()[1]
test_accuracy

13.157894736842104

In [244]:
data_correct.LABEL.value_counts()

HAM     166
SPAM     10
Name: LABEL, dtype: int64

In [210]:
data_test.LABEL.value_counts()

HAM     524
SPAM     76
Name: LABEL, dtype: int64

In [248]:
clean_data = pd.DataFrame(columns=['Text', 'Label'])
for i in range(training_data.shape[0]):
    text = ' '.join([word for word in training_data.Text[i].split(" ") if word not in stopwords.words("english")])
    sys.stdout.write("\rProgress:" + str(100 * i/float(training_data.shape[0]))[:4] + " %")
    clean_data.loc[i] = {'Text':text, 'Label': training_data.Label[i]}

Progress:99.7 %

In [249]:
clean_data.head(20)

Unnamed: 0,Text,Label
0,So wat's da decision?,HAM
1,Wot u 2 bitch?,HAM
2,Stupid.its possible,HAM
3,She told hr want posting chennai:)because i'm ...,HAM
4,Mobile Club: Choose top quality items mobile. ...,SPAM
5,When guys leaving?,HAM
6,He neva grumble sad lor... Hee... Buy tmr lor ...,HAM
7,Not able anything.,HAM
8,ÌÏ takin linear algebra today?,HAM
9,This weekend fine (an excuse much decorating),HAM
