# Predicting product ratings from Amazon reviews using the Google Prediction API #

### Import the various libraries to be used ###

In [21]:
import re
import string
import urllib

import nltk
import numpy as np
from bs4 import BeautifulSoup
from nltk.stem.porter import *
from nltk.corpus import stopwords

### Create a function that reads a file containing one record per line and yields each record as a Python object ###

In [22]:
def parseData(fname):
  """Yield one parsed record for every non-blank line of `fname`.

  `fname` is handed to `urllib.urlopen`, so it may be a local path or a URL.
  Each line is expected to hold a single Python literal (typically a dict).
  The underlying handle is closed once the generator is exhausted or discarded.
  """
  source = urllib.urlopen(fname)
  try:
    for line in source:
      # A blank (e.g. trailing) line is not a record; eval would raise
      # SyntaxError on it, so skip it instead of aborting the whole read.
      if not line.strip():
        continue
      # SECURITY: eval executes whatever expression the line contains. Only
      # use this on files you trust; for untrusted input switch to
      # ast.literal_eval (or json.loads if the file is real JSON).
      yield eval(line)
  finally:
    source.close()


### Since the text submitted to the Google Prediction API has to be dense, we convert each raw product review into a bag of words by stripping HTML and punctuation, dropping stop words, and reducing each remaining word to its root form (stemming) ###

In [23]:
# Built lazily on first use and then shared by every call: loading the
# stop-word corpus and constructing a stemmer once per review is wasted work
# when the function is applied to every record of the dataset.
_REVIEW_TOOLS = {}


def _review_tools():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    if not _REVIEW_TOOLS:
        # A set gives constant-time membership tests for the stop-word filter.
        _REVIEW_TOOLS["stops"] = set(stopwords.words("english"))
        _REVIEW_TOOLS["stemmer"] = PorterStemmer()
    return _REVIEW_TOOLS["stops"], _REVIEW_TOOLS["stemmer"]


def review_to_words( raw_review ):
    """Convert a raw product review into a space-separated string of stemmed words.

    Steps:
      1. strip HTML markup,
      2. replace every non-letter character with a space (digits and
         punctuation are dropped),
      3. lower-case and split into words,
      4. remove English stop words,
      5. stem each remaining word with the Porter stemmer,
      6. join the stems back into one string separated by single spaces.

    The input is a single string (the raw review text); the output is a single
    string (the preprocessed review), possibly empty.
    """
    stops, stemmer = _review_tools()

    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4-6. Drop stop words, stem what is left, and join with spaces.
    return " ".join(stemmer.stem(w) for w in words if w not in stops)


In [24]:
def catArray_to_text(category):
    """Flatten a list of category paths into one whitespace-separated string.

    `category` is an iterable of paths, each path being an iterable of strings
    (e.g. [["Home & Kitchen", "Bedding"], ["Toys"]]). The levels of a path are
    joined with '-', spaces inside a path become '_', and the paths are joined
    with a single space, giving "Home_&_Kitchen-Bedding Toys".

    An empty `category` yields an empty string, and paths of different lengths
    are accepted.
    """
    paths = ["-".join(path).replace(' ', '_') for path in category]
    return " ".join(paths)

### Read the data ###

In [25]:
# Read every record into memory up front: parseData is a generator, and the
# train/test split below slices the result, which requires a real list.
data = list(parseData("data.json"))

### Split the data into a training set and a test set. Since the Prediction API recommends using as large a training set as possible, out of the 1 million data points I use the first 990,000 for training the model and the remaining 10,000 for testing it. ###

In [29]:
# Number of leading records used for training; the records after this index
# are the ones held out for testing.
TRAIN_SIZE = 990000
train = data[:TRAIN_SIZE]

In [30]:
# Everything that did not go into `train` is the test set. Deriving the
# boundary from len(train) keeps the two slices from drifting apart.
test = data[len(train):]

### Format the training and test files as required by the API. The first field of each record is the product rating (the label); the remaining fields are the features. ###

In [31]:
# Write the training records to train.json, one record per line, with the
# label ('rating') first and the feature fields after it.
# The `with` block closes the file even if a record fails part-way through.
with open("train.json", 'w') as train_file:
  for l in train:
    itemID, rating, helpful = l['itemID'], l['rating'], l['helpful']
    reviewText, reviewerID = l['reviewText'], l['reviewerID']
    unixReviewTime, category = l['unixReviewTime'], l['category']

    # Helpfulness is the fraction nHelpful / outOf. A review nobody voted on
    # (outOf == 0) has no defined fraction; record it as 0.0 rather than
    # aborting the whole export with ZeroDivisionError.
    outOf = int(helpful['outOf'])
    helpfulness = int(helpful['nHelpful'])*1.0/outOf if outOf else 0.0

    # pre-process the review text
    clean_review = review_to_words( reviewText )
    # for Approach 1, categories is not used: drop this call and the
    # 'categories' field below
    clean_cat = catArray_to_text(category)

    # Assemble the fields in output order; string values are wrapped in
    # double quotes, numeric values are written via str().
    fields = [
        "'rating': "         + str(rating),
        "'itemID': "         + '"' + itemID + '"',
        "'helpful': "        + str(helpfulness),
        "'text': "           + '"' + clean_review + '"',
        "'reviewerID': "     + '"' + reviewerID + '"',
        # for Approach 1, remove categories
        "'categories': "     + '"' + clean_cat + '"',
        "'unixReviewTime': " + str(unixReviewTime),
    ]
    # write the clean record to file
    train_file.write('{' + ', '.join(fields) + '}' + '\n')

In [32]:
# Write the test records to test.json, one record per line, with the label
# ('rating') first and the feature fields after it.
# The `with` block closes the file even if a record fails part-way through.
with open("test.json", 'w') as test_file:
  for l in test:
    itemID, rating, helpful = l['itemID'], l['rating'], l['helpful']
    reviewText, reviewerID = l['reviewText'], l['reviewerID']
    unixReviewTime, category = l['unixReviewTime'], l['category']

    # Helpfulness is the fraction nHelpful / outOf. A review nobody voted on
    # (outOf == 0) has no defined fraction; record it as 0.0 rather than
    # aborting the whole export with ZeroDivisionError.
    outOf = int(helpful['outOf'])
    helpfulness = int(helpful['nHelpful'])*1.0/outOf if outOf else 0.0

    # pre-process the review text and flatten the category paths
    clean_review = review_to_words( reviewText )
    clean_cat = catArray_to_text(category)

    # Assemble the fields in output order; string values are wrapped in
    # double quotes, numeric values are written via str().
    fields = [
        "'rating': "         + str(rating),
        "'itemID': "         + '"' + itemID + '"',
        "'helpful': "        + str(helpfulness),
        "'text': "           + '"' + clean_review + '"',
        "'reviewerID': "     + '"' + reviewerID + '"',
        "'categories': "     + '"' + clean_cat + '"',
        "'unixReviewTime': " + str(unixReviewTime),
    ]
    # write the clean record to file
    test_file.write('{' + ', '.join(fields) + '}' + '\n')

### Now that the data is prepared, upload the files to the Google Prediction API and run the command-line script to create a prediction file in which each line holds the actual rating followed by the predicted rating. ###

### After running the script and preparing the prediction file, calculate the accuracy of your predictions.###

In [36]:
# Tally the results stored in predictions.txt. Each non-blank line holds two
# whitespace-separated numbers: the actual rating, then the predicted rating.
num = 0      # predictions read
err = 0      # predictions that differ from the actual rating
sqErr = 0    # running sum of squared errors; divide by num for the MSE
# One row per actual rating (1..5); column i counts wrong predictions of
# rating i+1. Correct predictions are not counted, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 falls through to the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}

# The `with` block closes the file even if a malformed line raises.
with open("predictions.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        # Skip blank lines (e.g. a trailing newline) instead of failing on the
        # two-value unpack below.
        if not l.strip():
            continue
        num = num+1
        actual, pred = l.split()
        actual = float(actual)
        pred = float(pred)
        sqErr += (actual - pred) ** 2
        if actual != pred:
            err = err+1
            # NOTE: the column index assumes predictions lie in 1..5; a
            # prediction below 1 would wrap around to a later column.
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


### Print out the results: accuracy and the confusion matrix of misclassifications (the mean squared error is not reported by the cell below) ###

In [37]:
# Summary counts and overall accuracy (percentage of exact matches, rounded to
# two decimals). Each line is the label, one space, then the value.
print(' '.join(['Total number of predictions made: ', str(num)]))
print(' '.join(['Total number of incorrect predictions', str(err)]))
print(' '.join(['accuracy is ', str(round((1 - err*1.0/num)*100,2))]))
print('Below is the confusion matrix')
# One row per actual rating, from one star to five stars.
print('\n'.join(str(row) for row in [oneErr, twoErr, threeErr, fourErr, fiveErr]))

Total number of predictions made:  9999
Total number of incorrect predictions 3548
accuracy is  64.52
Below is the confusion matrix
[0, 41, 67, 41, 133]
[334, 0, 129, 98, 139]
[190, 57, 0, 262, 307]
[72, 22, 145, 0, 1082]
[83, 20, 48, 278, 0]


In [38]:
# Tally the results stored in predictions_withCat.txt. Each non-blank line
# holds two whitespace-separated numbers: the actual rating, then the
# predicted rating.
num1 = 0      # predictions read
err1 = 0      # predictions that differ from the actual rating
sqErr1 = 0    # running sum of squared errors; divide by num1 for the MSE
# One row per actual rating (1..5); column i counts wrong predictions of
# rating i+1. Correct predictions are not counted, so the diagonal stays 0.
oneErr1 =   [0,0,0,0,0]
twoErr1 =   [0,0,0,0,0]
threeErr1 = [0,0,0,0,0]
fourErr1 =  [0,0,0,0,0]
fiveErr1 =  [0,0,0,0,0]
# Any actual rating other than 1-4 falls through to the five-star row.
errRows1 = {1.0: oneErr1, 2.0: twoErr1, 3.0: threeErr1, 4.0: fourErr1}

# The `with` block closes the file even if a malformed line raises.
with open("predictions_withCat.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        # Skip blank lines (e.g. a trailing newline) instead of failing on the
        # two-value unpack below.
        if not l.strip():
            continue
        num1 = num1+1
        actual1, pred1 = l.split()
        actual1 = float(actual1)
        pred1 = float(pred1)
        sqErr1 += (actual1 - pred1) ** 2
        if actual1 != pred1:
            err1 = err1+1
            # NOTE: the column index assumes predictions lie in 1..5; a
            # prediction below 1 would wrap around to a later column.
            errRows1.get(actual1, fiveErr1)[int(pred1) - 1] += 1


In [39]:
# Summary counts and overall accuracy (percentage of exact matches, rounded to
# two decimals). Each line is the label, one space, then the value.
print(' '.join(['Total number of predictions made: ', str(num1)]))
print(' '.join(['Total number of incorrect predictions', str(err1)]))
print(' '.join(['accuracy is ', str(round((1 - err1*1.0/num1)*100,2))]))
print('Below is the confusion matrix')
# One row per actual rating, from one star to five stars.
print('\n'.join(str(row) for row in [oneErr1, twoErr1, threeErr1, fourErr1, fiveErr1]))

Total number of predictions made:  10000
Total number of incorrect predictions 3637
accuracy is  63.63
Below is the confusion matrix
[0, 60, 69, 45, 165]
[276, 0, 157, 83, 161]
[162, 79, 0, 214, 352]
[59, 43, 162, 0, 1133]
[66, 34, 76, 241, 0]
