#Predicting product ratings from Amazon reviews using Google Prediction API#

## Using the 10-fold cross-validation method

### Import the various libraries to be used ###

In [87]:
import urllib
from bs4 import BeautifulSoup
import nltk
import string
from nltk.stem.porter import *
import numpy as np
from nltk.corpus import stopwords
from sklearn.utils import shuffle

### Create a function to read a JSON-like file (one object per line), yielding each line as a Python object ###

In [2]:
def parseData(fname):
    """Yield one parsed record per non-blank line of ``fname``.

    Every line is expected to hold a single Python literal (the callers in
    this file index the result like a dict). Lines are parsed with
    ``ast.literal_eval`` instead of ``eval`` so that a data file can never
    execute arbitrary code.

    Args:
        fname: Local path, or URL, of the line-oriented data file.

    Yields:
        The Python object described by each line.

    Raises:
        ValueError, SyntaxError: If a line is not a plain Python literal.
    """
    import ast

    legacy_urlopen = getattr(urllib, "urlopen", None)
    if legacy_urlopen is not None:
        # Python 2: urllib.urlopen accepts both URLs and local paths.
        stream = legacy_urlopen(fname)
    elif "://" in fname:
        # Python 3: urlopen lives in urllib.request and only accepts URLs.
        from urllib.request import urlopen
        stream = urlopen(fname)
    else:
        stream = open(fname, "rb")

    try:
        for l in stream:
            # Python 3 streams yield bytes; Python 2 already yields str.
            if not isinstance(l, str):
                l = l.decode("utf-8")
            l = l.strip()
            if l:  # tolerate blank lines, e.g. a trailing newline
                yield ast.literal_eval(l)
    finally:
        # Runs when the generator is exhausted, closed, or a line fails to parse.
        stream.close()

### Since the text submitted to the Google Prediction API has to be dense, we convert the raw product reviews into a bag of words by removing punctuation and stop words and reducing each word to its root form (stemming) ###

In [3]:
# Filled on the first call to review_to_words and reused afterwards, so the
# stop-word list is loaded and the stemmer constructed only once.
_REVIEW_TOOLS = {}


def review_to_words( raw_review ):
    """Convert a raw product review into a string of stemmed keywords.

    Args:
        raw_review: A single string holding the raw review text (may
            contain HTML markup).

    Returns:
        A single string of lower-case, stemmed words separated by spaces,
        with markup, non-letter characters and English stop words removed.
    """
    # `re` is not imported at the top of this file, so import it here rather
    # than relying on it arriving through a wildcard import.
    import re

    if not _REVIEW_TOOLS:
        # A set makes the per-word stop-word test O(1).
        _REVIEW_TOOLS["stops"] = frozenset(stopwords.words("english"))
        _REVIEW_TOOLS["stemmer"] = PorterStemmer()
    stops = _REVIEW_TOOLS["stops"]
    stem = _REVIEW_TOOLS["stemmer"].stem

    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    # 2. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    # 4. Drop stop words, 5. stem what remains, 6. join back into one string
    return " ".join(stem(w) for w in words if w not in stops)


In [4]:
def catArray_to_text(category):
    """Flatten a list of category paths into one space-separated string.

    Each inner sequence is joined with '-' and any spaces inside it are
    replaced by '_', so every path becomes a single token; the tokens are
    then joined with single spaces.

    Args:
        category: Sequence of category paths, each a sequence of strings.
            Paths may have different lengths, and the sequence may be empty.

    Returns:
        The formatted string, or '' when ``category`` is empty.
    """
    # Iterating the nested lists directly (rather than via np.asarray) keeps
    # ragged paths working and avoids rebuilding the result on every pass.
    return " ".join("-".join(path).replace(' ', '_') for path in category)

### Read the data ###

In [5]:
# Materialise every record yielded by parseData into an in-memory list.
data = list(parseData("data.json"))

###Split the data into different ratings

In [9]:
# One bucket per star rating; reviews with any other rating value are skipped.
rate1, rate2, rate3, rate4, rate5 = [], [], [], [], []
ratingBuckets = {1.0: rate1, 2.0: rate2, 3.0: rate3, 4.0: rate4, 5.0: rate5}
for review in data:
    bucket = ratingBuckets.get(float(review['rating']))
    if bucket is not None:
        bucket.append(review)

###Create a new dataset with equal distribution of all ratings

In [16]:
# Take at most the first 20000 reviews of each rating so that no rating
# dominates the combined dataset.
newd = []
for ratingGroup in (rate1, rate2, rate3, rate4, rate5):
    newd.extend(ratingGroup[:20000])

In [20]:
# Shuffle the balanced dataset, which is otherwise still grouped by rating in
# the order it was appended. The fixed random_state keeps the order repeatable.
newd = shuffle(newd, random_state=2015)

In [70]:
#open the training/testing file in which the modified data has to be written

def formatFile(fileArray, out_path="put_appropriate_name_here.json"):
    """Write pre-processed review records to ``out_path``, one per line.

    For every record the review text is cleaned with ``review_to_words`` and
    the category list is flattened with ``catArray_to_text``. The helpfulness
    ratio is derived from the record's vote counts.

    Args:
        fileArray: Iterable of review dicts with the keys 'itemID', 'rating',
            'helpful' (a dict with 'nHelpful' and 'outOf'), 'reviewText',
            'reviewerID', 'unixReviewTime' and 'category'.
        out_path: File to (over)write. Pass a distinct name for every split;
            the default keeps the historical placeholder name.
    """
    # `with` guarantees the file is closed even if a record is malformed.
    with open(out_path, 'w') as train_file:
        for l in fileArray:
            helpful = l['helpful']
            # Helpfulness is nHelpful / outOf; a review nobody voted on
            # (outOf == 0) is given 0.0 instead of dividing by zero.
            votes = int(helpful['outOf'])
            helpfulness = int(helpful['nHelpful'])*1.0/votes if votes else 0.0
            # Pre-process the review text and the categories.
            clean_review = review_to_words( l['reviewText'] )
            clean_cat = catArray_to_text(l['category'])
            # Write the cleaned record: numbers bare, strings double-quoted.
            train_file.write("".join([
                "{",
                "'rating': ",         str(l['rating']),              ", ",
                "'itemID': ",         '"', l['itemID'], '"',         ", ",
                "'helpful': ",        str(helpfulness),              ", ",
                "'text': ",           '"', clean_review, '"',        ", ",
                "'reviewerID': ",     '"', l['reviewerID'], '"',     ", ",
                "'categories': ",     '"', clean_cat, '"',           ", ",
                "'unixReviewTime': ", str(l['unixReviewTime']),
                "}", "\n",
            ]))

In [22]:
# Build a 10-fold splitter over the indices of newd.
# NOTE(review): sklearn.cross_validation is the legacy module; newer
# scikit-learn releases provide KFold in sklearn.model_selection with a
# different signature - confirm against the installed version before upgrading.
from sklearn import cross_validation
kf = cross_validation.KFold(len(newd), n_folds=10)

In [29]:
# Materialise the records of every cross-validation fold:
# train[k] / test[k] hold the training / testing reviews of fold k.
train = []
test = []
i = 0  # number of folds processed
for train_ind, test_ind in kf:
    train.append([newd[pos] for pos in train_ind])
    test.append([newd[pos] for pos in test_ind])
    i += 1
    

In [33]:
# Export the training split of fold 0 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[0])

In [37]:
# Export the training split of fold 1 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[1])

In [39]:
# Export the training split of fold 2 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[2])

In [41]:
# Export the training split of fold 3 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[3])

In [43]:
# Export the training split of fold 4 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[4])

In [45]:
# Export the training split of fold 5 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[5])

In [47]:
# Export the training split of fold 6 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[6])

In [49]:
# Export the training split of fold 7 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[7])

In [51]:
# Export the training split of fold 8 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[8])

In [53]:
# Export the training split of fold 9 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(train[9])

In [35]:
# Export the testing split of fold 0 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[0])

In [55]:
# Export the testing split of fold 1 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[1])

In [57]:
# Export the testing split of fold 2 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[2])

In [59]:
# Export the testing split of fold 3 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[3])

In [61]:
# Export the testing split of fold 4 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[4])

In [63]:
# Export the testing split of fold 5 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[5])

In [65]:
# Export the testing split of fold 6 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[6])

In [67]:
# Export the testing split of fold 7 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[7])

In [69]:
# Export the testing split of fold 8 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[8])

In [71]:
# Export the testing split of fold 9 (edit formatFile's hard-coded output
# name first, or this overwrites the previous export).
formatFile(test[9])

### Format the training and the test file according to the required format of the API. The first column is the product ratings, i.e., the labels and the remaining columns are the different features###

### Once the data is prepared, upload the files to the Google Prediction API and run the command-line script to create a prediction file in which each line holds the actual rating in the first column and the predicted rating in the second column. ###

### After running the script and preparing the prediction file, calculate the accuracy of your predictions.###

###Print out the results: Accuracy, Mean Squared Error, Confusion Matrix###

In [173]:
# Score the fold-0 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
acc = []   # per-fold accuracies, appended to by the reporting cells
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_0.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [174]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5474
accuracy is  45.26
Below is the confusion matrix
[0, 365, 167, 128, 137]
[635, 0, 394, 227, 186]
[273, 388, 0, 526, 306]
[91, 164, 266, 0, 681]
[75, 71, 80, 314, 0]


In [175]:
# Score the fold-1 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_1.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [176]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5630
accuracy is  43.7
Below is the confusion matrix
[0, 353, 249, 127, 154]
[628, 0, 503, 277, 215]
[269, 278, 0, 567, 287]
[68, 120, 323, 0, 595]
[61, 48, 115, 393, 0]


In [177]:
# Score the fold-2 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_2.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [178]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  9054
Total number of incorrect predictions 5219
accuracy is  42.36
Below is the confusion matrix
[0, 374, 192, 139, 144]
[423, 0, 436, 261, 184]
[218, 314, 0, 487, 236]
[79, 140, 352, 0, 556]
[85, 72, 147, 380, 0]


In [179]:
# Score the fold-3 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_3.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [180]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  8340
Total number of incorrect predictions 4671
accuracy is  43.99
Below is the confusion matrix
[0, 300, 208, 95, 114]
[447, 0, 408, 235, 155]
[219, 252, 0, 516, 232]
[67, 79, 269, 0, 509]
[66, 45, 115, 340, 0]


In [181]:
# Score the fold-4 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_4.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [182]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5577
accuracy is  44.23
Below is the confusion matrix
[0, 424, 245, 131, 164]
[538, 0, 461, 223, 184]
[270, 350, 0, 478, 271]
[118, 138, 337, 0, 541]
[93, 93, 145, 373, 0]


In [183]:
# Score the fold-5 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_5.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [184]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  8704
Total number of incorrect predictions 5042
accuracy is  42.07
Below is the confusion matrix
[0, 386, 207, 132, 104]
[436, 0, 391, 218, 167]
[215, 323, 0, 457, 224]
[102, 137, 340, 0, 531]
[91, 83, 156, 342, 0]


In [185]:
# Score the fold-6 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_6.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [186]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  3610
Total number of incorrect predictions 2005
accuracy is  44.46
Below is the confusion matrix
[0, 151, 80, 50, 45]
[175, 0, 177, 90, 64]
[95, 133, 0, 191, 73]
[36, 56, 133, 0, 197]
[35, 24, 60, 140, 0]


In [187]:
# Score the fold-7 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_7.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [188]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5659
accuracy is  43.41
Below is the confusion matrix
[0, 353, 231, 123, 160]
[571, 0, 455, 303, 215]
[252, 306, 0, 615, 233]
[91, 116, 284, 0, 671]
[68, 42, 104, 466, 0]


In [189]:
# Score the fold-8 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_8.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [190]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5656
accuracy is  43.44
Below is the confusion matrix
[0, 314, 250, 153, 154]
[565, 0, 555, 270, 214]
[229, 252, 0, 601, 272]
[87, 96, 338, 0, 678]
[73, 42, 124, 389, 0]


In [191]:
# Score the fold-9 predictions: count total and incorrect predictions and
# tally every misclassification by (actual rating, predicted rating).
num = 0    # predictions read
err = 0    # predictions that differ from the actual rating
sqErr = 0  # running sum of squared rating errors (divide by num for the MSE)
# One row per actual rating 1-5; column j counts wrong predictions of rating
# j+1. Correct predictions are not tallied, so the diagonal stays 0.
oneErr =   [0,0,0,0,0]
twoErr =   [0,0,0,0,0]
threeErr = [0,0,0,0,0]
fourErr =  [0,0,0,0,0]
fiveErr =  [0,0,0,0,0]
# Any actual rating other than 1-4 is tallied in the five-star row.
errRows = {1.0: oneErr, 2.0: twoErr, 3.0: threeErr, 4.0: fourErr}
# `with` closes the file even if a malformed line raises inside the loop.
with open("predictions_9.txt", 'r') as predictionsFile:
    for l in predictionsFile:
        fields = l.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        num = num+1
        # Each line is "<actual> <predicted>".
        actual, pred = map(float, fields)
        if actual != pred:
            err = err+1
            sqErr += (actual - pred) ** 2
            errRows.get(actual, fiveErr)[int(pred) - 1] += 1


In [192]:
# Fold accuracy as a percentage (two decimals), recorded for the final average.
accuracy = round(100 * (1 - float(err) / num), 2)
acc.append(accuracy)
print('Total number of predictions made: ', num)
print('Total number of incorrect predictions', err)
print('accuracy is ', accuracy)
print('Below is the confusion matrix')
# Rows are actual ratings 1-5; columns are predicted ratings 1-5.
for errRow in (oneErr, twoErr, threeErr, fourErr, fiveErr):
    print(errRow)

Total number of predictions made:  10000
Total number of incorrect predictions 5530
accuracy is  44.7
Below is the confusion matrix
[0, 366, 243, 131, 117]
[574, 0, 466, 281, 168]
[254, 323, 0, 536, 233]
[92, 108, 343, 0, 600]
[75, 48, 167, 405, 0]


In [195]:
# Unweighted mean of the per-fold accuracies collected in acc; folds with
# fewer predictions count as much as full ones.
print ('The average accuracy of the model is' ,np.mean(acc))

The average accuracy of the model is 43.762
