In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import dateutil.parser as parser
import numpy as np
from matplotlib import pyplot as plt

In [2]:
def parse(path):
    """Lazily yield one record per non-blank line of the file at ``path``.

    Each line is evaluated as a Python expression (the notebook indexes the
    results like dicts). Whitespace-only lines carry no record and are
    skipped instead of crashing ``eval`` with a SyntaxError.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Yields
    ------
    object
        The value obtained by evaluating each non-blank line.
    """
    # The context manager guarantees the handle is closed once the generator
    # is exhausted or closed, instead of leaking it.
    with open(path, 'r') as g:
        for l in g:
            if not l.strip():
                continue
            # NOTE(security): eval() executes arbitrary code found in the
            # file. Only use this on trusted data; ast.literal_eval would be
            # the safer choice if every line is a plain literal.
            yield eval(l)

### Evaluation metrics:
def MSE(Y1, Y2):
    """Return the mean squared error between ``Y1`` and ``Y2``.

    Both arguments may be numpy arrays or plain Python sequences; they are
    converted with ``np.asarray`` so that lists no longer raise a TypeError
    on the subtraction.
    """
    Y1 = np.asarray(Y1)
    Y2 = np.asarray(Y2)
    return np.mean((Y1 - Y2) ** 2)

def binary_error_rate(Ypred, Ytest):
    """Print confusion counts, accuracy and balanced error rate (BER).

    Ratings are binarised at 4.0: a value >= 4.0 counts as positive and a
    value < 4.0 as negative (see report section 2).

    Parameters
    ----------
    Ypred : array-like
        Predicted ratings.
    Ytest : array-like
        True ratings, same shape as ``Ypred``.

    Returns
    -------
    None
        The results are only printed.

    Notes
    -----
    A rate whose denominator is zero (e.g. no negative labels at all, or
    empty input) is reported as ``nan`` rather than triggering a warning or
    a ZeroDivisionError.
    """
    # Accept plain lists as well as arrays.
    Ypred = np.asarray(Ypred)
    Ytest = np.asarray(Ytest)

    # Vectorised counting instead of the builtin sum(), which iterates the
    # boolean array element by element in Python.
    TP = int(np.count_nonzero(np.logical_and(Ypred >= 4.0, Ytest >= 4.0)))
    FP = int(np.count_nonzero(np.logical_and(Ypred >= 4.0, Ytest < 4.0)))
    TN = int(np.count_nonzero(np.logical_and(Ypred < 4.0, Ytest < 4.0)))
    FN = int(np.count_nonzero(np.logical_and(Ypred < 4.0, Ytest >= 4.0)))

    total = Ytest.size
    # Fails if some value is neither >= 4.0 nor < 4.0 (i.e. NaN).
    assert TP+FP+TN+FN == total

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as nan.
        return numerator / denominator if denominator else float('nan')

    Accuracy = _ratio(TP + TN, total)
    BER = 1 - 0.5*(_ratio(TP, TP + FN) + _ratio(TN, TN + FP))
    print(f"TP:{TP}, FP:{FP}, TN:{TN}, FN:{FN}")
    print(f"Accuracy:{Accuracy}, BER:{BER}")
    
    
def round_predictions(predictions):
    '''
    Round predictions to the nearest integer

    Returns a new array with the same dtype that ``np.asarray(predictions)``
    has. Exact halves are rounded to the nearest even integer (2.5 -> 2,
    1.5 -> 2), which matches what Python's ``round`` did in the previous
    element-by-element loop. NaN and infinite entries are passed through
    unchanged rather than raising.
    '''
    values = np.asarray(predictions)
    if values.dtype.kind in 'biu':
        # Boolean / integer inputs are already whole numbers.
        return values.copy()
    # Vectorised rounding; adding 0.0 turns a -0.0 result into 0.0 so small
    # negative inputs round to a plain zero.
    rounded = np.rint(values) + 0.0
    return rounded.astype(values.dtype, copy=False)

def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Returns 0 when the union is empty.
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    if not combined:
        return 0
    return len(shared) / len(combined)

In [3]:
# NOTE(review): absolute, machine-specific path; adjust it (or make it
# configurable) before running this notebook elsewhere.
dataDir = "/Users/cairui/Downloads/lthing_data/"
fileName = "reviews.json"

# -86400 seconds is one day before the Unix epoch; reviews carrying this
# timestamp are dropped below as having no valid date.
INVALID_UNIXTIME = -86400

# review data
data = []
for d in parse(dataDir + fileName):
    # filter review without rating (key missing or explicitly None)
    if d.get('stars') is None:
        continue
    # or without valid date
    if d.get('unixtime') is None or d['unixtime'] == INVALID_UNIXTIME:
        continue

    # for this moment, we don't need actual text:
    # keep only its length and drop the comment itself
    d['length'] = len(d.pop('comment'))

    # train and test
    data.append(d)

print(f"Total number of reviews with a rating: {len(data)}")
data[0]

Total number of reviews with a rating: 1383597


{'work': '3206242',
 'flags': [],
 'unixtime': 1194393600,
 'stars': 5.0,
 'nhelpful': 0,
 'time': 'Nov 7, 2007',
 'user': 'van_stef',
 'length': 83}

In [5]:
# Lookup tables over the filtered reviews, keyed by user id or work (item) id.
reviewsPerUser = defaultdict(list)   # user -> review dicts
reviewsPerItem = defaultdict(list)   # work -> review dicts
usersPerItem = defaultdict(set)      # work -> users who reviewed it
itemsPerUser = defaultdict(set)      # user -> works they reviewed
# Ratings go into lists rather than sets because equal values are very likely.
ratingsPerItem = defaultdict(list)
ratingsPerUser = defaultdict(list)

# Per-user counters, in order: nhelpful total, #abuse flags, #not_a_review flags
recordPerUser = defaultdict(lambda: [0, 0, 0])

for d in data:
    u, i, r, dt = d['user'], d['work'], d['stars'], d['unixtime']

    reviewsPerUser[u].append(d)
    ratingsPerUser[u].append(r)
    itemsPerUser[u].add(i)

    reviewsPerItem[i].append(d)
    ratingsPerItem[i].append(r)
    usersPerItem[i].add(u)

    # recordPerUser entries are only created for users with something to record.
    helpful = d['nhelpful']
    if helpful:
        recordPerUser[u][0] += helpful
    flags = d['flags']
    if flags and 'abuse' in flags:
        recordPerUser[u][1] += 1
    if flags and 'not_a_review' in flags:
        recordPerUser[u][2] += 1

# Two global fallbacks for cold-start users/books: the mean of the per-book
# (resp. per-user) mean ratings.
avgBookRating = sum(sum(rs) / len(rs) for rs in ratingsPerItem.values()) / len(ratingsPerItem)
avgUserRating = sum(sum(rs) / len(rs) for rs in ratingsPerUser.values()) / len(ratingsPerUser)
print(avgBookRating, avgUserRating)

3.7650352301523946 3.9681081288404285


In [56]:
# Per-user split: the first 80% of each user's reviews (in stored order) go
# to train, the remainder to test.
train_data, test_data = [], []
for u in reviewsPerUser:
    user_reviews = reviewsPerUser[u]
    cut = int(len(user_reviews) * 0.8)
    train_data.extend(user_reviews[:cut])
    test_data.extend(user_reviews[cut:])

In [57]:
# Number of reviews currently assigned to the training split.
len(train_data)

1069138

In [58]:
# Share of all reviews that landed in the training split. The total is
# derived from the two splits instead of a hard-coded review count, so it
# stays correct if the upstream filtering changes.
len(train_data) / (len(train_data) + len(test_data))

0.7727235604008971

In [59]:
# Distinct works (items) that appear in the training split.
itemsTrain = {review['work'] for review in train_data}

In [60]:
# Distinct works (items) that appear in the test split.
itemsTest = {review['work'] for review in test_data}

In [63]:
# Number of test items that never occur in the training split.
len(itemsTest - itemsTrain)

55178

In [64]:
# Make sure every test item is also present in train: the first test review
# of an item unseen in train is moved to train; all other reviews stay in test.
kept_for_test = []
for review in test_data:
    work_id = review['work']
    if work_id in itemsTrain:
        kept_for_test.append(review)
        continue
    train_data.append(review)
    itemsTrain.add(work_id)
test_data = kept_for_test
del kept_for_test

In [65]:
# Number of reviews currently left in the test split.
len(test_data)

259281

In [66]:
# Share of all reviews now in the training split. The total is derived
# from the two splits instead of a hard-coded review count, so it stays
# correct if the upstream filtering changes.
len(train_data) / (len(train_data) + len(test_data))

0.8126036699992845

In [67]:
# Rebuild the set of test items and count those still missing from train.
itemsTest = {review['work'] for review in test_data}
len(itemsTest - itemsTrain)

0