## Efficient Low Noise Naive Bayes

In [45]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

### Load data

In [37]:
data = pd.read_csv('dataset/processed_pos_neg_reviews.csv', encoding='latin-1')

In [38]:
data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B000GBOM0C,5,great treat,My pups love this chicken/rice treat(10lb Russ...,POSITIVE


In [39]:
data.Label.value_counts()

NEGATIVE    82037
POSITIVE    82037
Name: Label, dtype: int64

### Data Preparation

Training Data

In [43]:
# first 82037 reviews
training_data = data[0:82037].reset_index(drop=True)
training_data_length = training_data.shape[0]
training_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B000GBOM0C,5,great treat,My pups love this chicken/rice treat(10lb Russ...,POSITIVE


In [16]:
training_data.Label.value_counts()

POSITIVE    41019
NEGATIVE    41018
Name: Label, dtype: int64

Shape of training data

In [17]:
print(training_data.shape)
print(training_data.Label.shape)

(82037, 5)
(82037,)


### Building the Naive Bayes Classifier

### Calculate the Priors

These are simply the propbability of text having "POSITIVE" or "NEGATIVE" sentiment

### Calculate the Likelihood

These are simply the propability of text pointing to a particular class

### Clean Our Training Data

In [None]:
clean_data = pd.DataFrame(columns=['Label', 'Text'])[:5000]
for i in range(training_data.shape[0]):
    text = ' '.join([word for word in training_data.Text[i].split(" ") if word not in stopwords.words("english")])
    sys.stdout.write("\rProgress:" + str(100 * i/float(training_data.shape[0]))[:4] + " %")
    clean_data.loc[i] = {'Label':training_data.Label[i], 'Text': text}

In [70]:
# save our modified data to save time later on
#clean_data.to_csv("dataset/clean_training_reviews.csv", mode = 'w', index=False, encoding='latin-1')

In [69]:
# load our saved data
clean_data = pd.read_csv('dataset/clean_training_reviews.csv', encoding='latin-1')
training_data = training_data[:500]

In [70]:
# split into train and test
from sklearn import cross_validation

data_train, data_test, labels_train, labels_test = cross_validation.train_test_split(training_data.Text, training_data.Label, 
                                                                                     test_size=0.2, random_state=40)

# text vectorization for text to numbers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

vectorizer = CountVectorizer()
data_train_transformed = vectorizer.fit_transform(data_train)
data_test_transformed  = vectorizer.transform(data_test)

# slim the data for training and testing
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(data_train_transformed, labels_train)
data_train_transformed = selector.transform(data_train_transformed).toarray()
data_test_transformed  = selector.transform(data_test_transformed).toarray()

In [109]:
class BernoulliNBClassifier(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior_ = [self.log(len(i) / count_sample) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
        smoothing = 2 * self.alpha
        n_doc = np.array([len(i) + smoothing for i in separated])
        self.feature_prob_ = count / n_doc[np.newaxis].T
        return self

    def predict_log_proba(self, X):
        return [(self.log(self.feature_prob_) * x + self.log(1 - self.feature_prob_) * np.abs(x - 1))
                .sum(axis=1) + self.class_log_prior_ for x in X]

    def predict(self, X, num_label=True):
        score_mat = np.argmax(self.predict_log_proba(X), axis=1)
        # return vector of predictions
        return score_mat.reshape(X.shape[0])
        
    def digit_to_label(self, digits):
        # palette must be given in sorted order
        palette = [0, 1]
        # key gives the new values you wish palette to be mapped to
        key = np.array(["NEGATIVE", "POSITIVE"])
        labels = np.digitize(digits.ravel(), palette, right=True)
        # return umwrapped to labels
        return key[index]

    def accuracy_score(self, predictionsY, targetsY):
        data = {'Targets': labels_test, 'Predictions': predictions}
        df = pd.DataFrame(data)
        df.loc[(df.Predictions== 0) , 'Predictions'] = "NEGATIVE"
        df.loc[(df.Predictions == 1) , 'Predictions'] = "POSITIVE"
        # calculate and return accuracy as a percentage
        return df.loc[(df.Predictions== df.Targets)].shape[0] *100 / df.count()[0]
    
    def log(self, a):
        return np.log(a)

In [110]:
clf = BernoulliNBClassifier()
clf.fit(data_train_transformed, labels_train)
predictions = clf.predict(data_test_transformed, num_label=False)

print(clf.accuracy_score(predictions, labels_test))

49.0


