In [1]:
import pandas as pd
import pickle
import gzip
import json

In [2]:
#Code adapted from the datasource http://jmcauley.ucsd.edu/data/amazon/
def parse(path):
    """Yield one decoded JSON value per non-blank line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file that holds one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or closed, instead of leaving it open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no record; json.loads would raise on it.
            if not l.strip():
                continue
            yield json.loads(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the
    order in which the records are yielded.
    """
    # Key each record by its running position so that
    # from_dict(orient='index') turns every record into one row.
    rows_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(rows_by_position, orient='index')

In [3]:
# Load the gzip-compressed JSON review file into a pandas DataFrame
df = getDF(path='Home_and_Kitchen_5.json.gz')

In [4]:
# Print a summary of the frame: number of entries, column names, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6898955 entries, 0 to 6898954
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   overall         float64
 1   verified        bool   
 2   reviewTime      object 
 3   reviewerID      object 
 4   asin            object 
 5   reviewerName    object 
 6   reviewText      object 
 7   summary         object 
 8   unixReviewTime  int64  
 9   vote            object 
 10  style           object 
 11  image           object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 638.2+ MB


In [5]:
# Display the DataFrame as the cell output to inspect the loaded reviews
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 5, 2015",A8LUWTIPU9CZB,0560467893,Linda Fahner,"Great product, love it!!",Five Stars,1446681600,,,
1,3.0,True,"05 7, 2015",A3B6GKQQ1JJ167,0560467893,Harry Slaughter,"Pretty flimsy, but does the job. If your corne...",Meh,1430956800,2,,
2,5.0,True,"01 22, 2014",A3MCTN65BU7XRA,0681795107,luckyg,So much better than plastic mug types--keeps c...,Recommend,1390348800,,{'Color:': ' Brushed Stainless'},
3,1.0,True,"10 30, 2013",A7JVZFSXVY9RL,0681795107,Nickleen,I like my coffee hot; borderline scorching but...,Not keeping coffee hot for long enough,1383091200,,{'Color:': ' Brushed Stainless'},
4,1.0,True,"09 20, 2013",A2RQ7VLAK1SHPU,0681795107,Lacemaker427,This mug does only a fair job of keeping coffe...,Leaks like a waterfall when at an angle!,1379635200,,{'Color:': ' Red'},
...,...,...,...,...,...,...,...,...,...,...,...,...
6898950,5.0,True,"08 8, 2016",AB4CZUDHN52H5,B01HJEJDBQ,J. Mayer,I purchased these as decorations for my classr...,Beautiful Pom poms,1470614400,,{'Color:': ' Mint Green/Purple/White'},
6898951,2.0,True,"09 5, 2017",A12NA9GEGY6I3E,B01HJEOT2E,Loreli,getting the used tea leaves out of this is ext...,cute but not very functional,1504569600,,{'Color:': ' Blue'},
6898952,4.0,True,"07 30, 2017",A38PQCNDGGWSPQ,B01HJEOT2E,Becca,Good deal. It took a while to recieve so I kin...,Good deal. It took a while to recieve so ...,1501372800,,{'Color:': ' Blue'},
6898953,4.0,True,"07 28, 2018",A2AY2C5EW8VOO7,B01HJGJNWS,Dr. Quinzel,Great price and okay quality. I know it's not ...,Four Stars,1532736000,,{'Color:': ' Black'},


## Data Preparation and Cleaning
#### 1. Drop the columns 'vote', 'style', and 'image', which do not provide much information for this project, and 'reviewerName', which is not required since the dataset already contains 'reviewerID'
#### 2. Drop all the unverified reviews
#### 3. Convert the review time to date format
#### 4. Find all duplicate entries (i.e., if a reviewer has reviewed the same product more than once, keep only the latest review)
#### 5. Check for null values and remove entries where both summary and reviewText are NaN
#### 6. Find all the URLs in the summary text and replace them with ''
#### 7. Combine Review text and summary column and drop summary column and reset index
#### 8. Find all the URLs in the review text and remove them
#### 9. Check for null values
#### 10. Save pickle file

In [6]:
# Remove the columns that the analysis does not use
df.drop(['vote', 'style', 'image', 'reviewerName'], axis='columns', inplace=True)

In [7]:
# Preview the first five rows (head() returns five rows by default)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime
0,5.0,True,"11 5, 2015",A8LUWTIPU9CZB,560467893,"Great product, love it!!",Five Stars,1446681600
1,3.0,True,"05 7, 2015",A3B6GKQQ1JJ167,560467893,"Pretty flimsy, but does the job. If your corne...",Meh,1430956800
2,5.0,True,"01 22, 2014",A3MCTN65BU7XRA,681795107,So much better than plastic mug types--keeps c...,Recommend,1390348800
3,1.0,True,"10 30, 2013",A7JVZFSXVY9RL,681795107,I like my coffee hot; borderline scorching but...,Not keeping coffee hot for long enough,1383091200
4,1.0,True,"09 20, 2013",A2RQ7VLAK1SHPU,681795107,This mug does only a fair job of keeping coffe...,Leaks like a waterfall when at an angle!,1379635200


In [8]:
# List the distinct values that occur in the 'verified' column
df['verified'].unique()

array([ True, False])

In [9]:
# Remove every review whose 'verified' flag is False
drp_ind = df.loc[df['verified'].eq(False)].index
df.drop(index=drp_ind, inplace=True)

In [10]:
# Interpret 'unixReviewTime' as seconds since the Unix epoch and store the
# resulting timestamps in a new 'rev_date' column
df['rev_date'] = pd.to_datetime(df['unixReviewTime'], unit='s')

In [11]:
# Remove the columns that are no longer needed
df.drop(['reviewTime', 'unixReviewTime', 'verified'], axis='columns', inplace=True)

In [12]:
# Display the current state of the DataFrame
df

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,rev_date
0,5.0,A8LUWTIPU9CZB,0560467893,"Great product, love it!!",Five Stars,2015-11-05
1,3.0,A3B6GKQQ1JJ167,0560467893,"Pretty flimsy, but does the job. If your corne...",Meh,2015-05-07
2,5.0,A3MCTN65BU7XRA,0681795107,So much better than plastic mug types--keeps c...,Recommend,2014-01-22
3,1.0,A7JVZFSXVY9RL,0681795107,I like my coffee hot; borderline scorching but...,Not keeping coffee hot for long enough,2013-10-30
4,1.0,A2RQ7VLAK1SHPU,0681795107,This mug does only a fair job of keeping coffe...,Leaks like a waterfall when at an angle!,2013-09-20
...,...,...,...,...,...,...
6898950,5.0,AB4CZUDHN52H5,B01HJEJDBQ,I purchased these as decorations for my classr...,Beautiful Pom poms,2016-08-08
6898951,2.0,A12NA9GEGY6I3E,B01HJEOT2E,getting the used tea leaves out of this is ext...,cute but not very functional,2017-09-05
6898952,4.0,A38PQCNDGGWSPQ,B01HJEOT2E,Good deal. It took a while to recieve so I kin...,Good deal. It took a while to recieve so ...,2017-07-30
6898953,4.0,A2AY2C5EW8VOO7,B01HJGJNWS,Great price and okay quality. I know it's not ...,Four Stars,2018-07-28


In [13]:
# Order the rows chronologically by 'rev_date', oldest review first
df = df.sort_values('rev_date', ascending=True)