In [11]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import dateutil.parser as parser
from datetime import date
import numpy as np
from matplotlib import pyplot as plt

In [2]:
f = open("./lthing_data/edges.txt", 'r')
count = 0
for line in f:
    if count < 10:
        print(line)
    count += 1

f.close()
count

Rodo anehan

Rodo sevilemar

Rodo dingsi

Rodo slash

RelaxedReader AnnRig

RelaxedReader bookbroke

RelaxedReader Bumpersmom

RelaxedReader DivaColumbus

RelaxedReader AnnRig

RelaxedReader bookbroke



219790

In [3]:
def parse(path):
    g = open(path, 'r')
    for l in g:
        yield eval(l)

In [4]:
count = 0
data = []

for d in parse("./lthing_data/reviews.json"):
    # filter review without rating
    if 'stars' not in d: continue
    # There are also 90 reviews with no date
    if not d['time']: continue

    # for this moment, we don't need actual text
    d['length'] = len(d['comment'])
    del d['comment']
    
    # train and test
    data.append(d)
    count += 1

print(f"Total number of reviews with a rating: {count}")
data[1]

Total number of reviews with a rating: 1387125


{'work': '12198649',
 'flags': [],
 'unixtime': 1333756800,
 'stars': 5.0,
 'nhelpful': 0,
 'time': 'Apr 7, 2012',
 'user': 'dwatson2',
 'length': 2582}

In [5]:
# 8:2 train-test split
cut = int(count * 0.8)
train_data, test_data = data[:cut], data[cut:]
del data # save memory

### Populate useful data structures.

In [6]:
usersPerItem = defaultdict(list)
itemsPerUser = defaultdict(list)

# very likely to have same ratings, so use list
ratingsPerItem = defaultdict(list)
ratingsPerUser = defaultdict(list)

# Each User has a list of length 3: nhelpful, #abuse, #not_a_review
recordPerUser = defaultdict(lambda:[0,0,0])

for d in train_data:
    
    u, i, r, dt = d['user'], d['work'], d['stars'], parser.parse(d['time'])
    usersPerItem[i].append( (dt, u) )
    itemsPerUser[u].append( (dt, i) )
    ratingsPerItem[i].append(r)
    ratingsPerUser[u].append(r)
    
    if d['nhelpful']:
        recordPerUser[u][0] += d['nhelpful']
    if d['flags']:
        if 'abuse' in d['flags']:
            recordPerUser[u][1] += 1
        if 'not_a_review' in d['flags']:
            recordPerUser[u][2] += 1

# calculate 2 global averages for cold-start user or book: average of each user's/book's average ratings
avgBookRating = sum( sum(ratingsPerItem[i])/len(ratingsPerItem[i]) for i in ratingsPerItem)/len(ratingsPerItem)
avgUserRating = sum( sum(ratingsPerUser[u])/len(ratingsPerUser[u]) for i in ratingsPerUser)/len(ratingsPerUser)

print(avgBookRating, avgUserRating)

3.7654177390812986 3.5


In [7]:
print(f"A total of {len(recordPerUser)} users out of {len(itemsPerUser)} have helpful votes, abuse review, or not-a-review")

A total of 26105 users out of 65768 have helpful votes, abuse review, or not-a-review


### Sort interaction data structures by time for later use

In [8]:
for i in usersPerItem:
    usersPerItem[i].sort()
    
for u in itemsPerUser:
    itemsPerUser[u].sort()

# baseline model

### A relevant simple baseline that predicts rating based on the average rating given by a user, and the average rating received by a book.

In [None]:
def feature_baseline(d):
    global avgBookRating, avgUserRating
    result = [1] # bias term
    u, i = d['user'], d['work']
    if u in ratingsPerUser:
        result.append( sum(ratingsPerUser[u]) / len(ratingsPerUser[u]) )
    else:
        result.append(avgUserRating)
    if i in ratingsPerItem:
        result.append( sum(ratingsPerItem[i]) / len(ratingsPerItem[i]) )
    else:
        result.append(avgBookRating)
    return result

In [None]:
Xtrain_baseline = np.array( [feature_baseline(d) for d in train_data] )
Xtest_baseline = np.array( [feature_baseline(d) for d in test_data] )

Ytrain = np.array( [d['stars'] for d in train_data] )
Ytest = np.array( [d['stars'] for d in test_data] )

In [None]:
model_baseline = linear_model.LinearRegression()
model_baseline.fit(Xtrain_baseline, Ytrain)

Ypred_baseline = model_baseline.predict(Xtest_baseline)
Ypred_baseline

In [None]:
def MSE(Y1, Y2):
    return np.mean( (Y1-Y2)**2 )

In [None]:
# Check MSE
MSE(Ypred_baseline, Ytest)

In [None]:
# Compare it to a even more trivial baseline: always predict median 4.0
MSE([4.0]*len(Ytest), Ytest)

In [None]:
# Check binary error rate, see report section 2
TP = sum( np.logical_and(Ypred_baseline>=4.0, Ytest>=4.0) )
FP = sum( np.logical_and(Ypred_baseline>=4.0, Ytest<4.0) )
TN = sum( np.logical_and(Ypred_baseline<4.0, Ytest<4.0) )
FN = sum( np.logical_and(Ypred_baseline<4.0, Ytest>=4.0) )

assert TP+FP+TN+FN == len(Ytest)

Accuracy = (TP + TN) / len(Ytest)
BER = 1 - 0.5*(TP / (TP + FN) + TN / (TN + FP))
print(f"TP:{TN}, FP:{FP}, TN:{TN}, FN:{FN}")
print(f"Accuracy:{Accuracy}, BER:{BER}")

## Basic feature engineering design

In [16]:
""" Integrate features relavent to a user """
def featureU(u, t):
    global avgUserRating
    if u not in ratingsPerUser:
        # cold-start user, see below
        return [0, avgUserRating, 0,0,0, 0]
    
    f = []
    # How many books this user have read; if cold-start, 0.
    f.append( len(itemsPerUser[u]) )
    # average rating this user gives; average of all users' average ratings (see baseline)
    f.append( sum(ratingsPerUser[u]) / len(ratingsPerUser[u]) )
    
    # This user's number of 'not_a_review' and 'abuse' comments respectively; 0, 0
    # Let the model decide their effect on rating prediction
    f += recordPerUser[u]
    # nhelpful received; 0
    
    # ??? time (maybe in month?) since his last reading; 
    
    # number of books he has read till this time t; 0
    # General opinion (rating habit may change as one read more books)
    c = 0
    dt = parser.parse(t)
    while c<len(itemsPerUser[u]) and dt <= itemsPerUser[u][c][0]: 
        c += 1
    f.append(c)
    
    return f

In [17]:
# test our function
[featureU(d['user'], 'Apr 7, 2012') for d in test_data][:10]

[[59, 3.940677966101695, 4, 0, 0, 0],
 [2, 3.5, 0, 0, 0, 0],
 [20, 3.85, 2, 0, 0, 0],
 [138, 3.6340579710144927, 8, 0, 0, 0],
 [19, 3.6842105263157894, 7, 4, 4, 0],
 [111, 4.045045045045045, 0, 0, 0, 111],
 [12, 3.5, 0, 0, 0, 12],
 [705, 3.248936170212766, 369, 1, 0, 0],
 [2, 4.0, 0, 0, 0, 0],
 [55, 4.1454545454545455, 0, 0, 3, 0]]

In [None]:
def featureI(i, t):
    # How many users have read this book; 0
    
    # average rating this book receives; average of all books' average ratings (see baseline)
    
    # length of time interval (in month) this book was read by people (t_last_read - t_first_read); 0
    
    # average length of the comment it received; 0
    
    # number of users who have read this book till time t; 0
    
    pass

In [None]:
def featureInter(u, i):
    # old-school book similarity (is this book similar to what I have read?); 0
    
    # AFTER make use of social networks: how often was this book read by the user's friends?
    # cold-start book:0; cold-start user: (total read/total user)
    
    pass

In [None]:
c = 0
for d in parse("./lthing_data/reviews.json"):
    if 'stars' not in d: continue
    if d['flags']:
        print(d)
        c += 1
    if c==5: break