In [None]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
import random
import gzip
import math

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory prefix (under the mounted Drive) that the dataset file name is appended to.
data_path = "/content/drive/MyDrive/ass2/"

In [None]:
# Read the gzip-compressed, line-delimited JSON file (one record per line)
# into a DataFrame.
df = pd.read_json(
    data_path + "Amazon_Fashion.jsonl.gz",
    compression='gzip',
    lines=True,
)


In [None]:
# Order rows by timestamp, then keep only the last row for each
# (user_id, asin) pair, i.e. the most recent one.
df = (
    df
    .sort_values('timestamp')
    .drop_duplicates(subset=['user_id', 'asin'], keep='last')
)


In [None]:
# Keep only rows whose user_id occurs more than once in the frame.
review_counts = df['user_id'].value_counts()
df = df[df['user_id'].map(review_counts).gt(1)]

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [None]:
# Index the training rows three ways:
#   reviewsPerUser[user]      -> list of (item, integer rating)
#   reviewsPerItem[item]      -> list of (user, integer rating)
#   ratingDict[(user, item)]  -> rating as stored in train_df
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
ratingDict = {}

# itertuples yields lightweight tuples instead of building a Series per row
# (as iterrows does), which matters at this data size and matches how cost()
# and derivative() walk train_df.
for row in train_df.itertuples(index=False):
    user, item, r = row.user_id, row.asin, row.rating
    reviewsPerUser[user].append((item, int(r)))
    reviewsPerItem[item].append((user, int(r)))
    ratingDict[(user, item)] = r

In [None]:
# Mean of all training ratings; alpha starts at this value and is later
# overwritten by unpack() during optimisation.
alpha = ratingMean = sum(ratingDict.values()) / len(ratingDict)
print(alpha)

4.03654636820466


In [None]:
# Number of distinct users present in the training split.
print(len(reviewsPerUser))

273434


In [None]:
# Freeze an ordering of users and items. unpack() relies on these lists to map
# positions of the flat parameter vector back to user / item ids.
users = list(reviewsPerUser)
items = list(reviewsPerItem)
nUsers, nItems = len(users), len(items)
# Number of training rows.
N = train_df.shape[0]

In [None]:
# Display the length of the ordered user list.
len(users)

273434

In [None]:
# Per-user and per-item bias terms. As defaultdicts, unseen keys read as 0.0;
# unpack() later rebinds both names to plain dicts built from the parameter vector.
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [None]:
# Latent-factor vectors keyed by user id and item id (two independent dicts).
userGamma, itemGamma = {}, {}

In [None]:
# Number of latent factors stored per user and per item.
K = 2


In [None]:
def _small_random_vector(size):
    """Return `size` values, each computed as random.random() * 0.1 - 0.05."""
    return [random.random() * 0.1 - 0.05 for _ in range(size)]


# Give every training user, then every training item, a small random
# K-dimensional factor vector.
for u in reviewsPerUser:
    userGamma[u] = _small_random_vector(K)

for i in reviewsPerItem:
    itemGamma[i] = _small_random_vector(K)

In [None]:
def MSE(predictions, labels):
    """Return the mean squared error between paired predictions and labels.

    Parameters
    ----------
    predictions, labels : iterables of numbers
        Paired element-by-element; both must have the same, non-zero length.

    Raises
    ------
    ValueError
        If the two inputs differ in length (zip would otherwise silently drop
        the unmatched tail and report a misleading score) or are empty.
    """
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length "
            "(got %d and %d)" % (len(predictions), len(labels))
        )
    if not predictions:
        raise ValueError("MSE is undefined for empty input")
    squared_errors = ((x - y) ** 2 for x, y in zip(predictions, labels))
    return sum(squared_errors) / len(predictions)

In [None]:
def unpack(theta):
    """Scatter the flat parameter vector `theta` into the model globals.

    Layout of theta, in order:
      [0]                      -> alpha
      next nUsers entries      -> userBiases, keyed in `users` order
      next nItems entries      -> itemBiases, keyed in `items` order
      next K entries per user  -> userGamma[u], in `users` order
      next K entries per item  -> itemGamma[i], in `items` order

    alpha, userBiases and itemBiases are rebound; userGamma and itemGamma
    are updated in place with slices of theta.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    alpha = theta[0]

    user_bias_end = 1 + nUsers
    item_bias_end = user_bias_end + nItems
    userBiases = dict(zip(users, theta[1:user_bias_end]))
    itemBiases = dict(zip(items, theta[user_bias_end:item_bias_end]))

    cursor = item_bias_end
    for u in users:
        userGamma[u] = theta[cursor:cursor + K]
        cursor += K
    for i in items:
        itemGamma[i] = theta[cursor:cursor + K]
        cursor += K

In [None]:
def inner(x, y):
    """Return the dot product of x and y, pairing elements up to the shorter length."""
    total = 0
    for a, b in zip(x, y):
        total += a * b
    return total

In [None]:
def prediction(user, item):
    """Predict the rating `user` would give `item`.

    Uses the full model (offset + both biases + factor inner product) when
    both ids have bias entries. If only one id is known, returns the offset
    plus that id's bias; if neither is known, returns the offset alone.
    """
    known_user = user in userBiases
    known_item = item in itemBiases

    if known_user and known_item:
        return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])
    if known_user:
        return alpha + userBiases[user]
    if known_item:
        return alpha + itemBiases[item]
    return alpha

In [None]:
def cost(theta, labels, lamb):
    """Regularised training objective for the parameter vector `theta`.

    Loads theta into the model globals, predicts every row of train_df,
    prints the plain MSE against `labels`, and returns that MSE plus
    lamb times the sum of squared biases and squared factor entries.
    `labels` must be in train_df row order.
    """
    unpack(theta)

    predictions = []
    for row in train_df.itertuples(index=False):
        predictions.append(prediction(row.user_id, row.asin))

    total = MSE(predictions, labels)
    print("MSE = " + str(total))

    # The penalty is accumulated term by term in a fixed order (users, then items).
    for u in users:
        total += lamb*userBiases[u]**2
        factors = userGamma[u]
        for k in range(K):
            total += lamb*factors[k]**2
    for i in items:
        total += lamb*itemBiases[i]**2
        factors = itemGamma[i]
        for k in range(K):
            total += lamb*factors[k]**2
    return total

In [None]:
import numpy
def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to `theta`.

    The returned array uses the same layout unpack() expects: alpha, user
    biases, item biases, user factors, item factors. `labels` is accepted so
    the signature matches cost(), but residuals are computed from the rating
    column of train_df.
    """
    unpack(theta)

    n_rows = len(train_df)
    data_scale = 2/n_rows   # factor applied to every residual term
    reg_scale = 2*lamb      # factor applied to every regulariser term

    grad_alpha = 0
    grad_user_bias = defaultdict(float)
    grad_item_bias = defaultdict(float)
    grad_user_gamma = {u: [0.0]*K for u in reviewsPerUser}
    grad_item_gamma = {i: [0.0]*K for i in reviewsPerItem}

    # Data term: accumulate each training row's residual into every parameter it touches.
    for row in train_df.itertuples(index=False):
        u, i = row.user_id, row.asin
        diff = prediction(u, i) - row.rating
        step = data_scale*diff
        grad_alpha += step
        grad_user_bias[u] += step
        grad_item_bias[i] += step
        user_vec, item_vec = userGamma[u], itemGamma[i]
        user_grad, item_grad = grad_user_gamma[u], grad_item_gamma[i]
        for k in range(K):
            user_grad[k] += data_scale*item_vec[k]*diff
            item_grad[k] += data_scale*user_vec[k]*diff

    # Regulariser term.
    for u in userBiases:
        grad_user_bias[u] += reg_scale*userBiases[u]
        for k in range(K):
            grad_user_gamma[u][k] += reg_scale*userGamma[u][k]
    for i in itemBiases:
        grad_item_bias[i] += reg_scale*itemBiases[i]
        for k in range(K):
            grad_item_gamma[i][k] += reg_scale*itemGamma[i][k]

    # Flatten in the order unpack() reads.
    flat = [grad_alpha]
    flat.extend(grad_user_bias[u] for u in users)
    flat.extend(grad_item_bias[i] for i in items)
    for u in users:
        flat.extend(grad_user_gamma[u])
    for i in items:
        flat.extend(grad_item_gamma[i])
    return numpy.array(flat)

In [None]:
import scipy.optimize

# cost() predicts in train_df row order, so take the labels from train_df
# itself rather than from dict iteration order.
labels = [int(r) for r in train_df['rating']]

# Regularisation strength passed to cost() and derivative() as `lamb`.
REG_LAMBDA = 0.000018

initial_theta = ([alpha] +                      # Initialize alpha
                 [0.0]*(nUsers+nItems) +        # Initialize beta
                 [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))])  # Gamma

# fmin_l_bfgs_b returns (best parameters, objective value there, info dict).
theta_opt, cost_opt, opt_info = scipy.optimize.fmin_l_bfgs_b(
    cost, initial_theta, derivative, args=(labels, REG_LAMBDA), maxiter=10)

# The optimiser may evaluate trial points after the one it finally returns, and
# every evaluation overwrites the model globals via unpack(). Load the returned
# optimum explicitly so prediction() uses it.
unpack(theta_opt)

(theta_opt, cost_opt, opt_info)

MSE = 1.8224548432113514
MSE = 1.8163992614676003
MSE = 1.793294703528695
MSE = 49.22237523382162
MSE = 1.7921412867693303
MSE = 1.662522863821909
MSE = 1.6476503982772897
MSE = 1.592747121964441
MSE = 1.3955590017923836
MSE = 1.348353907636786
MSE = 1.2779766614036527
MSE = 1.267441163036944
MSE = 1.2777695779947635
MSE = 1.2787404511803198


(array([ 4.02121757,  0.15843348,  0.13848546, ...,  0.01095544,
         0.00680693, -0.00743778]),
 1.500763220466869,
 {'grad': array([-5.52958883e-05, -1.14236484e-06, -3.50777097e-07, ...,
          3.76501082e-07,  2.45813031e-07, -2.68593824e-07]),
  'task': 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT',
  'funcalls': 14,
  'nit': 10,
  'warnflag': 1})

In [None]:
# Show only a few entries: userBiases holds one value per training user, so
# printing the whole mapping floods the cell output.
bias_preview = [entry for _, entry in zip(range(5), userBiases.items())]
print(dict(bias_preview))

In [None]:
# Map each held-out (user, item) pair to its rating.
# itertuples avoids the per-row Series construction of iterrows and matches
# how cost() and derivative() iterate train_df.
testRatingDict = {}
for row in test_df.itertuples(index=False):
    testRatingDict[(row.user_id, row.asin)] = row.rating

In [65]:
# Collect labels and predictions in one pass over test_df so they are
# guaranteed to line up row-for-row. itertuples is used instead of iterrows,
# which is far slower on a frame this size.
test_labels = []
predictions = []
for row in test_df.itertuples(index=False):
    test_labels.append(int(row.rating))
    predictions.append(prediction(row.user_id, row.asin))

print(f"MSE: {MSE(predictions, test_labels)}")

KeyboardInterrupt: 