## Predictive Theory

In [17]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
import pprint as pp
import random
import time
import sys
import os
# Expose a pickle implementation under the name `cPickle`: use the dedicated
# cPickle module where it is installed (Python 2), otherwise the standard
# pickle module.
try:
    import cPickle
except ImportError:
    # Only a missing module triggers the fallback; any other error raised
    # during the import is allowed to surface instead of being swallowed.
    import pickle as cPickle

### Load Data

In [18]:
# Read the processed review dataset, decoding the file as Latin-1.
data = pd.read_csv(
    'dataset/processed_pos_neg_reviews.csv',
    encoding='latin-1',
)

In [19]:
# Number of reviews carrying each sentiment label.
data["Label"].value_counts()

POSITIVE    82037
NEGATIVE    82037
Name: Label, dtype: int64

In [20]:
# Preview the first five rows of the dataset.
data.head(5)

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B001NXHY8Y,5,Daughter Approved,I was very upset when I was unable to find an ...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B001BS4G6O,4,Pretty good but you can do better,Pill pockets usually work well though sometime...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B005HG9ET0,5,Great taste and refreshing,I'm an avid drinker of Smart Water because of ...,POSITIVE


### Data Preparation

Training Data

In [21]:
# Training split: the first 70000 rows of `data`, re-indexed from 0.
training_data = data.iloc[:70000].reset_index(drop=True)
training_data_length = len(training_data)
training_data.head()

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B001NXHY8Y,5,Daughter Approved,I was very upset when I was unable to find an ...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B001BS4G6O,4,Pretty good but you can do better,Pill pockets usually work well though sometime...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B005HG9ET0,5,Great taste and refreshing,I'm an avid drinker of Smart Water because of ...,POSITIVE


Shape of Training Data

In [22]:
# Dimensions of the training frame, followed by those of its Label column.
for dims in (training_data.shape, training_data.Label.shape):
    pp.pprint(dims)

(70000, 5)
(70000,)


### Develop a Predictive Theory

In [23]:
def pretty_print_text_and_label(input_data, i):
    """Print one review as ``<label>`` TAB ``:`` TAB ``<text preview>...``.

    Parameters
    ----------
    input_data : object exposing ``Label`` and ``Text`` attributes, each of
        which can be indexed with ``i`` (e.g. a pandas DataFrame).
    i : key used to look up the review in both columns.

    The preview is the first 80 characters of the text. The trailing ``...``
    is appended unconditionally, including for texts shorter than 80 characters.
    """
    label = input_data.Label[i]
    preview = input_data.Text[i][:80]
    print(label + "\t:\t" + preview + "...")

In [24]:
print("labels \t : \t\t texts\n")
# Show six reviews drawn at random from the training data.
# No seed is set in this cell, so the selection differs between runs.
row_count = training_data.shape[0]
for sample_number in range(6):
    pretty_print_text_and_label(training_data, random.randrange(row_count))


labels 	 : 		 texts

NEGATIVE	:	Maybe they were better 3-4 years ago but today in 2012 they have gone down hill....
NEGATIVE	:	Dogs will eat most anything reasonably edible, the ingredients in this dog food ...
NEGATIVE	:	Yes, dogs love these treats but educate yourself before you buy any dog treats m...
POSITIVE	:	The Grove Square Hot Coco K-Cups have a good flavor but I find that the k-cups d...
NEGATIVE	:	I wouldn't recommend this to anyone. After a few months, I've harvested about 5 ...
POSITIVE	:	I love this tea!<a href="http://www.amazon.com/gp/product/B006R0DOK0">Zhena's Gy...


It is very easy to distinguish a `NEGATIVE` review from a `POSITIVE` one. `NEGATIVE` reviews occasionally contain words like **worst**, **disgusting**, **inedible**, **fake**, **deceptive**, **refund**, **spoilt**, etc. This way we can let our network learn some of the words associated with `NEGATIVE` reviews, and based on such criteria we can classify a new review as either `POSITIVE` or `NEGATIVE`.

#### Theory Validation

In [25]:
# Word tallies used to validate the theory; every counter starts out empty.
total_counts = Counter()              # word -> occurrences across counted reviews
positive_counts = Counter()           # word -> occurrences in POSITIVE reviews
negative_counts = Counter()           # word -> occurrences in NEGATIVE reviews
positive_negative_ratios = Counter()  # word -> positive/negative ratio score

In [26]:
# Tally word frequencies per sentiment label.
#
# The counters are created in a separate cell, so they are emptied here first:
# re-running this cell then rebuilds the tallies from scratch instead of adding
# a second copy of every count on top of the previous run.
for counter in (positive_counts, negative_counts, total_counts):
    counter.clear()

counts_by_label = {"NEGATIVE": negative_counts, "POSITIVE": positive_counts}

# Walk both columns once, in row order, rather than performing two Series
# lookups per row.
for label, text in zip(training_data.Label, training_data.Text):
    label_counts = counts_by_label.get(label)
    if label_counts is None:
        continue  # rows with any other label are not counted
    # Splitting on a single space keeps the empty strings produced by
    # consecutive spaces, so '' is tallied as a token too.
    words = text.split(" ")
    label_counts.update(words)
    total_counts.update(words)

In [31]:
# Ten most frequent tokens in POSITIVE reviews, highest count first.
pp.pprint(positive_counts.most_common(10))

[('the', 97807),
 ('I', 82744),
 ('and', 76217),
 ('a', 72792),
 ('', 60985),
 ('to', 59880),
 ('of', 47331),
 ('is', 44501),
 ('it', 38054),
 ('for', 32750)]


The overall count or frequency of a word doesn't really tell us anything about its affinity toward a particular sentiment. One way to find the words that carry sentiment is to use the positive-to-negative ratio:

$$ \text{positive negative ratio} = \frac{\text{frequency of word in positive reviews}}{\text{frequency of word in negative reviews}} $$

In [32]:
# Pass 1: raw positive-to-negative frequency ratio, restricted to words that
# occur more than 100 times overall.
for word, count in total_counts.most_common():
    if count <= 100:
        continue
    numerator = float(positive_counts[word])
    denominator = float(negative_counts[word] + 1)  # +1 keeps the denominator non-zero
    positive_negative_ratio = numerator / denominator
    positive_negative_ratios[word] = positive_negative_ratio

# Pass 2: replace every stored ratio by a log-scaled score. Ratios above 1 map
# to positive scores; the rest are shifted by 0.001 before the logarithm, so a
# ratio of 0 still yields a finite (strongly negative) score.
for word, ratio in positive_negative_ratios.most_common():
    positive_negative_ratios[word] = (
        np.log(ratio) if ratio > 1 else -np.log((1/(ratio+0.001)))
    )

In [33]:
# Ten words with the highest ratio score, i.e. those skewed most strongly
# towards reviews labelled "POSITIVE".
pp.pprint(positive_negative_ratios.most_common(10))

[('hooked.', 3.5263605246161616),
 ('Perfect', 2.9601050959108397),
 ('delicious!', 2.8332133440562162),
 ('Highly', 2.7300291078209855),
 ('Excellent', 2.5236211619686908),
 ('downside', 2.1972245773362196),
 ('pleasantly', 2.1377104498038118),
 ('saves', 2.0541237336955462),
 ('Great', 2.0149030205422647),
 ('best!', 1.9993988340062996)]


In [34]:
# Ten words with the lowest ratio score, i.e. those skewed most strongly
# towards reviews labelled "NEGATIVE" (lowest score first).
ranked_ratios = positive_negative_ratios.most_common()
pp.pprint(ranked_ratios[::-1][:10])

[('NO<br', -6.9077552789821368),
 ('menadione', -3.7974196226891421),
 ('deliberately', -3.6850314455177626),
 ('Buyer', -3.3183849487575627),
 ('disgusting.', -3.1898372688882821),
 ('desk,', -3.1005338958061834),
 ('garbage.', -3.0391618314813269),
 ('gross.', -3.0283925629026656),
 ('nasty.', -2.9635959921316002),
 ('dust.', -2.9511021545373284)]
