## Data Preprocessing

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
import pprint as pp
import random
import time
import sys
import os
# Prefer cPickle when it is importable and fall back to the standard pickle
# module otherwise. Only ImportError is caught so that unrelated failures
# (e.g. KeyboardInterrupt) are not silently swallowed.
try:
   import cPickle as cPickle
except ImportError:
   import pickle as cPickle

### Data Loading

In [2]:
data = pd.read_csv('dataset/reviews.csv', encoding='latin-1')

In [3]:
pp.pprint(data.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [4]:
# Show first 3 in dataset
data.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [5]:
# Build the working frame in one pass:
#   1. drop the columns that do not provide insight
#   2. rename "Score" to the more meaningful "Rating"
data_modified = (
    data
    .drop(
        ["Id", "UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Time"],
        axis=1,
    )
    .rename(columns={"Score": "Rating"})
)

In [6]:
pp.pprint(data_modified.columns)

Index(['ProductId', 'Rating', 'Summary', 'Text'], dtype='object')


In [7]:
# Show last 10 in dataset
data_modified.tail(10)

Unnamed: 0,ProductId,Rating,Summary,Text
568444,B001EO7N10,5,Best Value for Chinese 5 Spice,"As a foodie, I use a lot of Chinese 5 Spice po..."
568445,B001EO7N10,5,Five Spice Powder,"You can make this mix yourself, but the Star A..."
568446,B001EO7N10,2,Mixed wrong,I had ordered some of these a few months back ...
568447,B001EO7N10,5,"If its all natural, this is like panacea of Sp...","Hoping there is no MSG in this, this tastes ex..."
568448,B001EO7N10,5,Very large ground spice jars.,My only complaint is that there's so much of i...
568449,B001EO7N10,5,Will not do without,Great for sesame chicken..this is a good if no...
568450,B003S1WTCU,2,disappointed,I'm disappointed with the flavor. The chocolat...
568451,B004I613EE,5,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,B004I613EE,5,Favorite Training and reward treat,These are the BEST treats for training and rew...
568453,B001LR2CU2,5,Great Honey,"I am very satisfied ,product is as advertised,..."


In [8]:
pp.pprint(data_modified.Rating.value_counts())

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Rating, dtype: int64


We have `363122` reviews with a 5-star rating, `80655` with a 4-star rating, and so on down to the 1-star rating.

Let's do a little bit of cleaning: a review with `Rating > 2.5` (3 stars or more) is labelled `POSITIVE`, and a review with `Rating < 2.5` (2 stars or fewer) is labelled `NEGATIVE`.

In [9]:
# Split the ratings around the 2.5 cut-off and tag each side with its label
is_positive = data_modified.Rating > 2.5
is_negative = data_modified.Rating < 2.5

data_modified.loc[is_positive, 'Label'] = "POSITIVE"
data_modified.loc[is_negative, 'Label'] = "NEGATIVE"

In [10]:
data_modified.head(10)

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B001E4KFG0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B000LQOCH0,4,"""Delight"" says it all",This is a confection that has been around a fe...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B006K2ZZ7K,5,Great taffy,Great taffy at a great price. There was a wid...,POSITIVE
5,B006K2ZZ7K,4,Nice Taffy,I got a wild hair for taffy and ordered this f...,POSITIVE
6,B006K2ZZ7K,5,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,POSITIVE
7,B006K2ZZ7K,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,POSITIVE
8,B000E7L2R4,5,Yay Barley,Right now I'm mostly just sprouting this so my...,POSITIVE
9,B00171APVA,5,Healthy Dog Food,This is a very healthy dog food. Good for thei...,POSITIVE


In [11]:
# save our modified data to save time later on
data_modified.to_csv("dataset/pre_processed_reviews.csv", mode = 'w', index=False, encoding='latin-1')

In [12]:
# load our saved data
processed_data = pd.read_csv('dataset/pre_processed_reviews.csv', encoding='latin-1')

In [13]:
# get the size of our dataset
pp.pprint(processed_data.count())

ProductId    568454
Rating       568454
Summary      568428
Text         568454
Label        568454
dtype: int64


In [14]:
processed_data.Label.value_counts()

POSITIVE    486417
NEGATIVE     82037
Name: Label, dtype: int64

In [15]:
# Look the counts up by label instead of by position: value_counts() orders
# its result by frequency, so positional access only works while POSITIVE
# happens to be the larger class.
label_counts = processed_data.Label.value_counts()
pos = label_counts["POSITIVE"]
neg = label_counts["NEGATIVE"]
# Ratio of NEGATIVE to POSITIVE reviews; convert before dividing so the
# result is a true (non-truncated) fraction.
neg_pos_frac = float(neg) / float(pos)

In [16]:
pp.pprint(neg_pos_frac)

0.16865570076703734


Our data is unbalanced (roughly one `NEGATIVE` review for every six `POSITIVE` ones), so we need to do a little more processing. Take the **82037** `NEGATIVE` reviews and alternately mix them with **82037** `POSITIVE` reviews.

In [17]:
# keep only the NEGATIVE reviews and renumber the rows from 0
negative_mask = processed_data.Label == "NEGATIVE"
data_negative = processed_data.loc[negative_mask].reset_index(drop=True)

In [18]:
# show sample
data_negative.head(2)

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
1,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE


In [19]:
data_negative.Label.value_counts()

NEGATIVE    82037
Name: Label, dtype: int64

In [20]:
# save our modified data to save time later on
data_negative.to_csv("dataset/processed_negative_reviews.csv", mode = 'w', index=False, encoding='latin-1')

In [21]:
# Randomly down-sample the POSITIVE reviews to exactly as many rows as there
# are NEGATIVE reviews, then renumber the rows from 0.
# - n=len(data_negative) pins the row count directly instead of relying on
#   how a float fraction rounds.
# - random_state is fixed so that re-running the notebook draws the same sample.
data_positive = (
    processed_data.loc[processed_data.Label == "POSITIVE"]
    .sample(n=len(data_negative), random_state=42)
    .reset_index(drop=True)
)
# show sample
data_positive.head(2)

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE


In [22]:
data_positive.Label.value_counts()

POSITIVE    82037
Name: Label, dtype: int64

In [23]:
# save our modified data to save time later on
data_positive.to_csv("dataset/processed_positive_reviews.csv", mode = 'w', index=False, encoding='latin-1')

#### Further processing.

In [24]:
pos_df = pd.read_csv("dataset/processed_positive_reviews.csv", encoding='latin-1')
neg_df = pd.read_csv("dataset/processed_negative_reviews.csv", encoding='latin-1')

Combine the positive and negative DataFrames into one, alternating rows from each so that the combined data set stays balanced.

In [25]:
# Interleave the two frames:
#   - concat stacks them under the outer keys "POSITIVE" / "NEGATIVE", keeping each frame's own row index as the inner level
#   - sort_index(level=1, sort_remaining=False) orders by that inner row index only, so row i of each frame ends up adjacent
#   - reset_index(drop=True) discards the MultiIndex in favour of a fresh 0..N-1 index
# NOTE(review): which of the two rows comes first within a pair depends on how the sort treats ties — confirm it stays POSITIVE-first
data_pos_neg = pd.concat([pos_df, neg_df], keys=["POSITIVE", "NEGATIVE"]).sort_index(level=1, sort_remaining=False).reset_index(drop=True)

Make sure it is a balanced data set.

In [26]:
data_pos_neg.Label.value_counts()

NEGATIVE    82037
POSITIVE    82037
Name: Label, dtype: int64

In [27]:
data_pos_neg.head(6)

Unnamed: 0,ProductId,Rating,Summary,Text,Label
0,B006CMVE7S,4,No taste with filtered bottle,I guess some of you may have guessed this befo...,POSITIVE
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,NEGATIVE
2,B00570H26I,4,"Delicious pasta, but not for peanut allergies!",I have to agree with the previous posters that...,POSITIVE
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,NEGATIVE
4,B000GBOM0C,5,great treat,My pups love this chicken/rice treat(10lb Russ...,POSITIVE
5,B0009XLVG0,1,My Cats Are Not Fans of the New Food,My cats have been happily eating Felidae Plati...,NEGATIVE


In [28]:
# save our modified data to save time later on
data_pos_neg.to_csv("dataset/processed_pos_neg_reviews.csv", mode = 'w', index=False, encoding='latin-1')