In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import dateutil.parser as parser
import numpy as np
from matplotlib import pyplot as plt

#### Helper functions:

In [2]:
def parse(path):
    g = open(path, 'r')
    for l in g:
        yield eval(l)

### Evaluation metrics:
def MSE(Y1, Y2):
    return np.mean((Y1-Y2)**2)

def binary_error_rate(Ypred, Ytest):
    # Check binary error rate, see report section 2
    TP = sum( np.logical_and(Ypred>=4.0, Ytest>=4.0) )
    FP = sum( np.logical_and(Ypred>=4.0, Ytest<4.0) )
    TN = sum( np.logical_and(Ypred<4.0, Ytest<4.0) )
    FN = sum( np.logical_and(Ypred<4.0, Ytest>=4.0) )

    assert TP+FP+TN+FN == len(Ytest)

    Accuracy = (TP + TN) / len(Ytest)
    BER = 1 - 0.5*(TP / (TP + FN) + TN / (TN + FP))
    print(f"TP:{TP}, FP:{FP}, TN:{TN}, FN:{FN}")
    print(f"Accuracy:{Accuracy}, BER:{BER}")
    
    
def round_predictions(predictions):
    '''
    Round predictions to the nearest integer
    '''
    rounded_predictions = np.zeros_like(predictions)
    for i, pred in enumerate(predictions):
        rounded_predictions[i] = int(round(pred))
    return rounded_predictions

#### Get data

In [3]:
data = []
for d in parse("./lthing_data/reviews.json"):
    # filter review without rating
    if 'stars' not in d: continue
    # There are also 90 reviews with no date
    if not d['time']: continue

    # for this moment, we don't need actual text
    d['length'] = len(d['comment']) # store the length
    del d['comment']
    
    # train and test
    data.append(d)

print(f"Total number of reviews with a rating: {len(data)}")
data[0]

Total number of reviews with a rating: 1387125


{'work': '3206242',
 'flags': [],
 'unixtime': 1194393600,
 'stars': 5.0,
 'nhelpful': 0,
 'time': 'Nov 7, 2007',
 'user': 'van_stef',
 'length': 83}

In [4]:
# 8:2 train-test split
cut = int(len(data) * 0.8)
train_data, test_data = data[:cut], data[cut:]
del data # save memory

### Populate useful data structures.

In [103]:
usersPerItem = defaultdict(list)
itemsPerUser = defaultdict(list)

# very likely to have same ratings, so use list
ratingsPerItem = defaultdict(list)
ratingsPerUser = defaultdict(list)

# Each User has a list of length 3: nhelpful, #abuse, #not_a_review
recordPerUser = defaultdict(lambda:[0,0,0])

for d in train_data:
    
    u, i, r, dt, dl = d['user'], d['work'], d['stars'], parser.parse(d['time']), d['length']
    usersPerItem[i].append( (dt, u, dl) )
    itemsPerUser[u].append( (dt, i, dl) )
    ratingsPerItem[i].append(r)
    ratingsPerUser[u].append(r)
    
    if d['nhelpful']:
        recordPerUser[u][0] += d['nhelpful']
    if d['flags']:
        if 'abuse' in d['flags']:
            recordPerUser[u][1] += 1
        if 'not_a_review' in d['flags']:
            recordPerUser[u][2] += 1

# calculate 2 global averages for cold-start user or book: average of each user's/book's average ratings
avgBookRating = sum( sum(ratingsPerItem[i])/len(ratingsPerItem[i]) for i in ratingsPerItem)/len(ratingsPerItem)
avgUserRating = sum( sum(ratingsPerUser[u])/len(ratingsPerUser[u]) for i in ratingsPerUser)/len(ratingsPerUser)

print(avgBookRating, avgUserRating)

3.7654177390812986 3.5


In [6]:
print(f"A total of {len(recordPerUser)} users out of {len(itemsPerUser)} have helpful votes, abuse review, or not-a-review")

A total of 26105 users out of 65768 have helpful votes, abuse review, or not-a-review


# baseline model

### A relevant simple baseline that predicts rating based on the average rating given by a user, and the average rating received by a book.

In [7]:
def feature_baseline(d):
    global avgBookRating, avgUserRating
    result = [1] # bias term
    u, i = d['user'], d['work']
    if u in ratingsPerUser:
        result.append( sum(ratingsPerUser[u]) / len(ratingsPerUser[u]) )
    else:
        result.append(avgUserRating)
    if i in ratingsPerItem:
        result.append( sum(ratingsPerItem[i]) / len(ratingsPerItem[i]) )
    else:
        result.append(avgBookRating)
    return result

In [8]:
Xtrain_baseline = np.array( [feature_baseline(d) for d in train_data] )
Xtest_baseline = np.array( [feature_baseline(d) for d in test_data] )

Ytrain = np.array( [d['stars'] for d in train_data] )
Ytest = np.array( [d['stars'] for d in test_data] )

In [20]:
model_baseline = linear_model.LinearRegression()
model_baseline.fit(Xtrain_baseline, Ytrain)

Ypred_baseline = model_baseline.predict(Xtest_baseline)
Ypred_baseline

array([4.04929319, 3.68309618, 3.59532967, ..., 3.66405694, 3.55441755,
       4.53077834])

In [21]:
print("Ypred baseline:")
print("MSE = ", MSE(Ypred_baseline, Ytest))
binary_error_rate(Ypred_baseline, Ytest)

Ypred baseline:
MSE =  0.8515584703996876
TP:78705, FP:17295, TN:86892, FN:94533
Accuracy:0.5969072722357394, BER:0.35584120736810165


In [22]:
# round Ypred to nearest integers
rounded_Ypred_baseline = round_predictions(Ypred_baseline)
print("Rounded Ypred baseline:")
print("MSE = ", MSE(rounded_Ypred_baseline, Ytest))
binary_error_rate(rounded_Ypred_baseline, Ytest)

Rounded Ypred baseline:
MSE =  0.9418248175182482
TP:147137, FP:62126, TN:42061, FN:26101
Accuracy:0.6819789132197891, BER:0.3734793809657283


In [23]:
# Compare it to a even more trivial baseline: always predict median 4.0
trivial_baseline = np.array([4.0]*len(Ytest))
print("Trivial baseline:")
print("MSE = ", MSE(trivial_baseline, Ytest))
binary_error_rate(trivial_baseline, Ytest)

Trivial baseline:
MSE =  1.0327899432278995
TP:173238, FP:104187, TN:0, FN:0
Accuracy:0.6244498513111651, BER:0.5


### Basic feature engineering design

In [104]:
""" Integrate features relavent to a user """
def featureU(u, t):
    global avgUserRating
    if u not in ratingsPerUser:
        # cold-start user, see below
        return [0, avgUserRating, 0,0,0, 0]
    
    f = []
    # How many books this user have read; if cold-start, 0.
    f.append( len(itemsPerUser[u]) )
    # average rating this user gives; average of all users' average ratings (see baseline)
    f.append( sum(ratingsPerUser[u]) / len(ratingsPerUser[u]) )
    
    # This user's number of 'not_a_review' and 'abuse' comments respectively; 0, 0
    # Let the model decide their effect on rating prediction
    f += recordPerUser[u]
    # nhelpful received; 0
    
    # ??? time (maybe in month?) since his last reading; 
    
    # number of books he has read till this time t; 0
    # General opinion (rating habit may change as one read more books)
    c = 0
    dt = parser.parse(t)
    for item in itemsPerUser[u]:
        if item[0] >= dt:
            c += 1
    f.append(c)
    
    return f

In [105]:
# test featureU
[featureU(d['user'], 'Apr 7, 2011') for d in test_data[:5]]

[[59, 3.940677966101695, 4, 0, 0, 38],
 [2, 3.5, 0, 0, 0, 0],
 [20, 3.85, 2, 0, 0, 3],
 [138, 3.6340579710144927, 8, 0, 0, 21],
 [19, 3.6842105263157894, 7, 4, 4, 0]]

In [110]:
""" Integrate features relavent to a book """
def featureI(i, t):
    global avgBookRating
    if i not in ratingsPerItem:
        # cold-start book
        return [0, avgBookRating, 0, 0, 0]
    
    f = []
    # How many users have read this book; if cold-start, 0.
    f.append( len(usersPerItem[i]) )
    
    # average rating this book receives; average of all books' average ratings (see baseline)
    f.append( sum(ratingsPerItem[i]) / len(ratingsPerItem[i]) )
    
    # length of time interval (in month) this book was read by people (t_last_read - t_first_read); 0
    all_times = [user[0] for user in usersPerItem[i]]
    f.append( (max(all_times) - min(all_times)).days / 30 )
    
    # average length of the comment it received; 0
    all_lengths = [user[2] for user in usersPerItem[i]]
    f.append( sum(all_lengths) / len(all_lengths) )
    
    # number of users who have read this book till time t; 0
    c = 0
    dt = parser.parse(t)
    for user in usersPerItem[i]:
        if user[0] >= dt:
            c += 1
    f.append(c)
    
    return f

In [111]:
# test featureI
[featureI(d['work'], 'Apr 7, 2011') for d in test_data[:5]]

[[52, 3.9903846153846154, 74.0, 778.2884615384615, 18],
 [10, 3.9, 58.1, 299.5, 4],
 [29, 3.5172413793103448, 79.06666666666666, 766.551724137931, 13],
 [16, 3.1875, 56.733333333333334, 864.875, 1],
 [7, 4.357142857142857, 64.0, 445.14285714285717, 2]]