## Efficient Low Noise Naive Bayes

In [966]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from IPython.display import Image
from itertools import dropwhile
from sklearn import metrics
import pprint as pp
import random
import time
import sys
import os
try:
   import cPickle as cPickle
except:
   import pickle as cPickle

### Load data

In [1029]:
data = pd.read_csv('dataset/processed_pos_neg_reviews.csv', encoding='latin-1')

In [1030]:
data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B004779XNW,5,Best K cup Coffee,I like almost all of the Green Mountain coffee...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B000JGLE0U,4,yummy cakes,These cakes were really good and small enough ...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B001VJ0B0I,5,Read before you buy,"First of all, it lists the ingredients right h...",POSITIVE


In [1031]:
data.Label.value_counts()

NEGATIVE    82037
POSITIVE    82037
Name: Label, dtype: int64

### Data Preparation

Training Data

In [1032]:
# first 70000 reviews
training_data = data[0:70000].reset_index(drop=True)
training_data_length = training_data.shape[0]
training_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B004779XNW,5,Best K cup Coffee,I like almost all of the Green Mountain coffee...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B000JGLE0U,4,yummy cakes,These cakes were really good and small enough ...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B001VJ0B0I,5,Read before you buy,"First of all, it lists the ingredients right h...",POSITIVE


Shape of training data

In [1033]:
pp.pprint(training_data.shape)
pp.pprint(training_data.Label.shape)

(70000, 5)
(70000,)


Validation Data

In [1034]:
# middle 35000 reviews
validation_data = data[70000:76019].reset_index(drop=True)
validation_data_length = validation_data.shape[0]
validation_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B001LGGH40,4,"Refreshing, but a tad too sweet",Nice refreshing beverage but with 120 cal a bi...,POSITIVE
1,B001VJ0B0I,2,"Gross by-products, sugar, and food colorings--...",I completely agree that the ingredients for th...,NEGATIVE
2,B004U7QQFC,5,My favorite coffeee,Just goes to show everyone has different taste...,POSITIVE
3,B001VJ0B0I,2,Read the Ingredient List,Make sure you read the list of ingredients bef...,NEGATIVE
4,B000EFHST2,5,Yummy!,The fastest and easiest Hamburger Helper I've ...,POSITIVE


Shape of validation data

In [1035]:
pp.pprint(validation_data.shape)
pp.pprint(validation_data.Label.shape)

(6019, 5)
(6019,)


Testing data

In [1036]:
# last 6018 reviews
test_data = data[76019:82037].reset_index(drop=True)
test_data_length = test_data.shape[0]
test_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B003NV2IG2,2,Inconsistent Taste,I first tasted Annie Chun's seaweed three year...,NEGATIVE
1,B000ODH4BG,5,"Not meant to be a meal, but a terrific filler",This soup is great tasting. It does contain q...,POSITIVE
2,B003NV2IG2,2,"Nothing like sushi, that's for sure","For me, these were a total bust. The texture s...",NEGATIVE
3,B005GIF5VU,4,The only treats my cat will eat,So my Mom often buys bags of kitty treats for ...,POSITIVE
4,B003NV2IG2,2,Mediocre for Seaweed Snacking,This is one of many varieties of seaweed snack...,NEGATIVE


Shape of Testing data

In [1037]:
pp.pprint(test_data.shape)
pp.pprint(test_data.Label.shape)

(6018, 5)
(6018,)


### Building the Naive Bayes Classifier

### Calculate the Priors

These are simply the propbability of text having "POSITIVE" or "NEGATIVE" sentiment

In [1038]:
# total positive reviews in training set
r_positive = training_data.Label[training_data.Label == "POSITIVE"].count()
# total negative reviews in training set
r_negative = training_data.Label[training_data.Label == "NEGATIVE"].count()
# total reviews in training set
r_total = training_data.Label.count()

In [1039]:
r_positive

35000

In [1040]:
r_negative

35000

In [1041]:
r_total

70000

In [1042]:
# Drop some unwanted columns that do not provide insight
training_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B004779XNW,5,Best K cup Coffee,I like almost all of the Green Mountain coffee...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B000JGLE0U,4,yummy cakes,These cakes were really good and small enough ...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B001VJ0B0I,5,Read before you buy,"First of all, it lists the ingredients right h...",POSITIVE


In [1043]:
# Drop some unwanted columns that do not provide insight
training_data_modified = training_data.drop(["ProductId", "Rating", "Summary"], axis=1)
# Drop some unwanted columns that do not provide insight
test_data_modified = test_data.drop(["ProductId", "Rating", "Summary"], axis=1)
# Drop some unwanted columns that do not provide insight
validation_data_modified = validation_data.drop(["ProductId", "Rating", "Summary"], axis=1)

In [1044]:
# r_positive/r_total
p_positive = r_positive/r_total
# r_negative/r_total
p_negative = r_negative/r_total

### Calculate the Likelihood

In [1045]:
training_array = training_data_modified.values
test_array = test_data_modified.values
validation_array = validation_data_modified.values
test_array[1]

array([ "This soup is great tasting.  It does contain quite a bit of sodium, but for those who think it is too salty can add more water to adjust the taste to their liking.  Also, as many others have previously stated, there is a severe lack in meat, vegetables, and even the noodles...what do you expect from a soup that came from a box?  A gourmet meal is not very likely.  Overall, this soup is great and quick to make.  Many can use the soup on the go as a snack to hold them over for an actual meal or even a late night snack for those who don't feel like eating a meal.  This product in my opinion, is great for busy bodies who sometimes have to skip or hold off a meal.",
       'POSITIVE'], dtype=object)

In [1046]:
np.unique(training_array[1][1])
training_array[:,1]

array(['POSITIVE', 'NEGATIVE', 'POSITIVE', ..., 'NEGATIVE', 'POSITIVE',
       'NEGATIVE'], dtype=object)

In [1047]:
class BinaryNBClassifier(object):
    def __init__(self):
        self.polar_cutoff = 0.2
        self.min_count = 50
        self.max_count = 30000
        # size of {POSITIVE: count, MEGATIVE: count} reviews
        self.label_instance_counts = Counter()
        # dict for pos/neg worrd counts
        self.feature_counts = {}
        # dict for word probabilities for each label
        
    def pre_process_data(self, training_XY):
        # training data size
        self.training_size = training_XY.shape[0]
        # size of NEGATIVE reviews
        positive_word_counts = Counter()
        # frequency of words in positive reviews
        positive_word_counts = Counter()
        # frequency of words in negative reviews
        negative_word_counts = Counter()
        # frequency of words in all reviews
        total_word_counts = Counter()
        
        # get the counts of words in each sentiment
        for i in range(self.training_size ):
            if(training_XY[i][1] == 'POSITIVE'):
                # unique, counts = np.unique(a, return_counts=True)
                self.label_instance_counts[training_XY[i][1]] += 1
                for word in training_XY[i][0].split(" "):
                    positive_word_counts[word] += 1
                    total_word_counts[word] += 1
            if(training_XY[i][1] == 'NEGATIVE'):
                self.label_instance_counts[training_XY[i][1]] += 1
                for word in training_XY[i][0].split(" "):
                    negative_word_counts[word] += 1
                    total_word_counts[word] += 1
        
        # eliminate noise words
        negative_word_counts = {key:value for key, value in negative_word_counts.items() if value > self.min_count and value < self.max_count}
        positive_word_counts = {key:value for key, value in positive_word_counts.items() if value > self.min_count and value < self.max_count}
       
        self.feature_counts["POSITIVE"] = positive_word_counts
        self.feature_counts["NEGATIVE"] = negative_word_counts
        #print(self.feature_counts["NEGATIVE"])
        

    def predict_probabilities(self, text):
        prediction_results = {}
        text_counts = Counter(text.split(" "))
        for label in self.feature_prior.keys():
            likelihood_prob = 1
            for word in text_counts:
                # For every word in the text, we get the number of times that word occured in the reviews for a given class
                occurence_count = text_counts.get(word)
                # avoid missing word occurances
                if word in self.feature_prior[label].keys():
                    likelihood_prob *=  occurence_count * self.feature_prior[label][word]
            # calculate probalities
            # Now we multiply by the probability of the class existing in the documents.
            prediction_results[label] = likelihood_prob * self.class_prior[label]
        return prediction_results
    
    def test(self, test_data_XY):
        df = pd.DataFrame(columns=['PREDICTION', 'LABEL'])
        verdict = ""
        for i in range(test_data_XY.shape[0]):
            predictions_dict = self.predict_probabilities(test_data_XY[i][0])
            if(predictions_dict["POSITIVE"] > predictions_dict["NEGATIVE"]):
                verdict = "POSITIVE"
            elif(predictions_dict["NEGATIVE"] > predictions_dict["POSITIVE"]):
                verdict = "NEGATIVE"
            else:
                verdict = "NEUTRAL"
            df = df.append({"PREDICTION": verdict, "LABEL":  test_data_XY[i][1]}, ignore_index=True)
        return df
       

    def fit(self, training_XY):
        # pre-process data
        self.pre_process_data(training_XY)
        # class prior
        self.class_prior = {key: self.label_instance_counts[key] / self.training_size for key in self.label_instance_counts.keys()}
        # feature prior
        self.feature_prior = {label: {word_key: self.feature_counts[label][word_key] / len(self.feature_counts[label]) for word_key in self.feature_counts[label].keys()} for label in self.feature_counts.keys()}
        # print probabilities for class and feature
        #print("Class Prior: {} \nFeature Prior: {}".format(np.array(self.class_prior), np.array(self.feature_prior)))

In [1048]:
nBayes = BinaryNBClassifier()

In [1049]:
nBayes.fit(training_array)

In [1050]:
nBayes.predict_probabilities("Awesome, wondefully, excellent delivered enjoyable product")

{'NEGATIVE': 3.9234629732563476e-05, 'POSITIVE': 0.0004153809161042203}

In [1051]:
nBayes.predict_probabilities("Very poor quality. Product arrived while damaged with dents.I regret buying it")

{'NEGATIVE': 5.480974290790589e-09, 'POSITIVE': 3.9498205449964203e-10}

In [1052]:
data_test = nBayes.test(test_array)

In [1053]:
data_correct = data_test.loc[(data_test.PREDICTION == data_test.LABEL) ]

In [1054]:
test_accuracy = data_correct.LABEL.value_counts()[0] *100 / data_test.LABEL.value_counts()[0]
test_accuracy

76.271186440677965