## Predictive Theory

In [16]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
import pprint as pp
import random
import time
import sys
import os
# Prefer Python 2's ``cPickle`` when it is available; otherwise expose the
# standard ``pickle`` module under the same name.
try:
    import cPickle as cPickle
except ImportError:  # only a missing module should trigger the fallback
    import pickle as cPickle

### Load Data

In [17]:
# Read the processed reviews CSV into a DataFrame, decoding the file as latin-1.
data = pd.read_csv('dataset/processed_pos_neg_reviews.csv', encoding='latin-1')

In [18]:
# Number of reviews per label, to check how balanced the two classes are.
data["Label"].value_counts()

NEGATIVE    82037
POSITIVE    82037
Name: Label, dtype: int64

In [19]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B000GBOM0C,5,great treat,My pups love this chicken/rice treat(10lb Russ...,POSITIVE


### Data Preparation

Training Data

In [20]:
# Training set: the first 70000 reviews, re-indexed from 0.
training_data = data.iloc[:70000].reset_index(drop=True)
training_data_length = len(training_data)
training_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B000GBOM0C,5,great treat,My pups love this chicken/rice treat(10lb Russ...,POSITIVE


Shape of Training Data

In [21]:
# Dimensions of the training frame, then of its Label column.
for dims in (training_data.shape, training_data.Label.shape):
    pp.pprint(dims)

(70000, 5)
(70000,)


### Develop a Predictive Theory

In [22]:
def pretty_print_text_and_label(input_data, i, max_chars=80):
    """Print one review as ``<label><TAB>:<TAB><text preview>...``.

    Parameters
    ----------
    input_data
        Object whose ``Label`` and ``Text`` attributes can both be indexed
        with ``i`` (in this notebook, a DataFrame of reviews).
    i
        Key of the review to print.
    max_chars : int, default 80
        Number of leading characters of the text to show. A trailing
        ``...`` is always appended, whether or not the text was cut.
    """
    label = input_data.Label[i]
    preview = input_data.Text[i][:max_chars]
    print(label + "\t:\t" + preview + "...")

In [23]:
print("labels \t : \t\t texts\n")
# Show six randomly chosen training reviews to get a feel for the data.
# random.randrange(stop) draws an integer from 0 up to, but not including, stop.
for _ in range(6):
    sample_index = random.randrange(len(training_data))
    pretty_print_text_and_label(training_data, sample_index)


labels 	 : 		 texts

NEGATIVE	:	If you didn't like the taste of cashews, you probably wouldn't eat them. But thi...
POSITIVE	:	If only they put more in the box. They put plenty of dried berries in the box, i...
POSITIVE	:	I saw this in a segment on a television show about foods to try if you want to c...
POSITIVE	:	My neighbor friend gave me this Hawaii Roasters Coffee as a gift for my Birthday...
NEGATIVE	:	I bought this thinking it would make a wonderful gift to my neighbor's young son...
NEGATIVE	:	We bought this for our Paramount 6 oz popper. We were disappointed by the taste....


It is very easy to distinguish a `NEGATIVE` review from a `POSITIVE` one. `NEGATIVE` reviews occasionally contain words like **worst**, **disgusting**, **inedible**, **fake**, **deceptive**, **refund**, **spoilt**, etc. We can therefore let our network learn some of the words associated with `NEGATIVE` reviews and, based on such criteria, classify a new review as either `POSITIVE` or `NEGATIVE`.

#### Theory Validation

In [24]:
# Word frequencies for POSITIVE reviews, NEGATIVE reviews, and both combined.
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()
# Per-word positive-to-negative score.
positive_negative_ratios = Counter()

In [25]:
# Reset the counters first so that re-running this cell does not double-count.
for counter in (positive_counts, negative_counts, total_counts):
    counter.clear()

# Each label feeds its own counter; rows with any other label are ignored.
counts_by_label = {"NEGATIVE": negative_counts, "POSITIVE": positive_counts}

# Walk the two columns together instead of indexing each Series row by row.
for label, text in zip(training_data.Label, training_data.Text):
    label_counts = counts_by_label.get(label)
    if label_counts is None:
        continue
    # NOTE: split(" ") keeps the empty strings produced by consecutive spaces,
    # so '' is counted as a token.
    words = text.split(" ")
    label_counts.update(words)
    total_counts.update(words)

In [26]:
# the ten most common tokens in POSITIVE reviews, with their counts
top_positive_words = positive_counts.most_common(10)
pp.pprint(top_positive_words)

[('the', 96560),
 ('I', 82725),
 ('and', 75810),
 ('a', 72108),
 ('to', 60274),
 ('', 59473),
 ('of', 47635),
 ('is', 44519),
 ('it', 38521),
 ('for', 32756)]


The overall count or frequency of words doesn't really tell us anything about the affinity of some words toward certain sentiments. One way to find the words that carry sentiment is to use the positive-to-negative ratio (the $+1$ in the denominator avoids division by zero for words that never appear in negative reviews):

$$ \text{positive negative ratio} = \frac{\text{frequency of word in positive reviews}}{\text{frequency of word in negative reviews} + 1} $$

In [13]:
# Tunable constants for the scoring below.
MIN_WORD_COUNT = 100      # a word must occur more than this many times overall to be scored
LOG_RATIO_OFFSET = 0.001  # added to ratios <= 1 so that a ratio of 0 still has a finite log

# Start from an empty Counter so that scores left over from an earlier run
# (e.g. after the word counts were rebuilt) cannot linger.
positive_negative_ratios.clear()

for word, count in total_counts.most_common():
    if count <= MIN_WORD_COUNT:
        continue

    # The +1 in the denominator avoids dividing by zero for words that never
    # appear in NEGATIVE reviews.
    ratio = float(positive_counts[word]) / float(negative_counts[word] + 1)

    # Store the log of the ratio so that scores above 0 lean POSITIVE and
    # scores below 0 lean NEGATIVE.
    if ratio > 1:
        positive_negative_ratios[word] = np.log(ratio)
    else:
        positive_negative_ratios[word] = -np.log(1 / (ratio + LOG_RATIO_OFFSET))

In [14]:
# the ten words with the highest positive-to-negative scores,
# i.e. the ones that lean most strongly towards the "POSITIVE" label
pp.pprint(positive_negative_ratios.most_common(10))

[('/>Highly', 4.6347289882296359),
 ('delicious!', 2.8371272433773522),
 ('Highly', 2.7273080177066245),
 ('Perfect', 2.7146947438208788),
 ('Excellent', 2.544498746990246),
 ('downside', 2.4541349911212467),
 ('amazing!', 2.2617630984737906),
 ('pleasantly', 2.2407096892759584),
 ('Great', 2.0373300563297092),
 ('beats', 2.0188169198634012)]


In [15]:
# the ten words with the lowest positive-to-negative scores, i.e. the ones that
# lean most strongly towards the "NEGATIVE" label: walk the descending ranking
# backwards from its end, lowest score first
pp.pprint(positive_negative_ratios.most_common()[:-11:-1])

[('NO<br', -6.9077552789821368),
 ('desk,', -4.299259050687561),
 ('deliberately', -3.9595188268977228),
 ('menadione', -3.5209466347720815),
 ('intentionally', -3.4833958767229474),
 ('garbage.', -3.4342372357403557),
 ('refund.', -3.25557390095137),
 ('nasty.', -3.2448429835246579),
 ('ripping', -3.2038896704313373),
 ('banned', -3.0692809615768031)]
