In [2]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
import random
import gzip
import math

In [3]:
# Mount Google Drive at /content/drive. This needs the `google.colab` package,
# so the cell is expected to work only inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder holding the input data. File names are appended with plain string
# concatenation (`data_path + "<file>"`), so the trailing slash is required.
data_path = "/content/drive/MyDrive/ass2/"

In [36]:
def setup_dataset_values(i):
    """Filter reviews to active users, split train/test, and index the training set.

    Keeps only the rows of the global ``df`` whose ``user_id`` occurs more than
    ``i`` times, splits the result 80/20 into the globals ``train_df`` and
    ``test_df`` (fixed ``random_state`` so the split is repeatable), and then
    records every training row in the module-level lookup tables.

    Side effects:
        * Rebinds the globals ``df``, ``train_df`` and ``test_df``. ``df`` is
          replaced by its filtered version, so successive calls filter
          cumulatively.
        * Appends to ``reviewsPerUser`` / ``reviewsPerItem`` and writes to
          ``ratingDict``. These containers must already exist at module level
          (e.g. ``defaultdict(list)`` for the first two); they are mutated
          here, not created.

    Args:
        i: Activity threshold; a user is kept only when they have strictly
            more than ``i`` rows in ``df``.
    """
    # A `global` statement only affects the function that contains it, so the
    # names this function rebinds are declared right here.
    global df, train_df, test_df

    rows_per_user = df['user_id'].map(df['user_id'].value_counts())
    df = df[rows_per_user > i]
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Walk the three needed columns in lockstep instead of materialising a
    # Series per row with iterrows().
    for user, item, r in zip(train_df['user_id'], train_df['asin'], train_df['rating']):
        reviewsPerUser[user].append((item, int(r)))
        reviewsPerItem[item].append((user, int(r)))
        ratingDict[(user, item)] = r

In [18]:
def setup_k(K=2):
    """Give every known user and item a small random latent-factor vector.

    For each key of the global ``reviewsPerUser`` a list of ``K`` floats is
    stored in the global ``userGamma``; likewise ``reviewsPerItem`` keys get an
    entry in ``itemGamma``. Each value is ``random.random() * 0.1 - 0.05``,
    i.e. a small number centred on zero.

    Both gamma dictionaries are updated in place (existing entries for other
    keys are left alone), so they must already exist at module level.

    Args:
        K: Number of latent factors per user/item.
    """
    # No `global` declaration is needed: the dictionaries are only mutated,
    # never rebound.
    for u in reviewsPerUser:
        userGamma[u] = [random.random() * 0.1 - 0.05 for _ in range(K)]

    for i in reviewsPerItem:
        itemGamma[i] = [random.random() * 0.1 - 0.05 for _ in range(K)]

In [7]:
def MSE(predictions, labels):
    """Return the mean squared error between paired predictions and labels.

    The two sequences are paired with ``zip``, so any surplus elements in the
    longer one are ignored. An empty pairing raises ``ZeroDivisionError``.
    """
    squared_error_sum = 0
    pair_count = 0
    for predicted, actual in zip(predictions, labels):
        squared_error_sum += (predicted - actual) ** 2
        pair_count += 1
    return squared_error_sum / pair_count

In [8]:
def unpack(theta):
    """Scatter the flat parameter vector ``theta`` into the global model state.

    Layout of ``theta`` (as consumed here):
        ``[alpha]``, then ``nUsers`` user biases, then ``nItems`` item biases,
        then ``K`` gamma values per user (in ``users`` order), then ``K`` gamma
        values per item (in ``items`` order).

    ``alpha``, ``userBiases`` and ``itemBiases`` are rebound; ``userGamma`` and
    ``itemGamma`` are updated in place with slices of ``theta``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    alpha = theta[0]

    user_bias_start = 1
    item_bias_start = user_bias_start + nUsers
    cursor = item_bias_start + nItems

    userBiases = dict(zip(users, theta[user_bias_start:item_bias_start]))
    itemBiases = dict(zip(items, theta[item_bias_start:cursor]))

    # Users' factor vectors come first, then the items', K entries apiece.
    for keys, gamma_table in ((users, userGamma), (items, itemGamma)):
        for key in keys:
            gamma_table[key] = theta[cursor:cursor + K]
            cursor += K

In [9]:
def inner(x, y):
    """Return the dot product of ``x`` and ``y`` (paired with ``zip``, so extra
    elements of the longer input are ignored)."""
    total = 0
    for left, right in zip(x, y):
        total += left * right
    return total

In [10]:
def prediction(user, item):
    """Predict the rating ``user`` would give ``item`` from the global model.

    Falls back gracefully for unseen keys: with neither side known the global
    offset ``alpha`` is returned; with only one side known, ``alpha`` plus that
    side's bias; with both known, the full model including the latent-factor
    interaction term.
    """
    user_known = user in userBiases
    item_known = item in itemBiases

    if user_known and item_known:
        return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])
    if user_known:
        return alpha + userBiases[user]
    if item_known:
        return alpha + itemBiases[item]
    return alpha

In [11]:
def cost(theta, labels, lamb):
    """Regularised training objective for the parameter vector ``theta``.

    Loads ``theta`` into the global model via ``unpack``, predicts every row of
    the global ``train_df``, and returns the MSE against ``labels`` plus
    ``lamb`` times the sum of squared biases and gamma entries (``alpha`` is
    not penalised). The unregularised MSE is printed on every call.

    Args:
        theta: Flat parameter vector in the layout ``unpack`` expects.
        labels: Target ratings, paired positionally with the rows of ``train_df``.
        lamb: Regularisation strength.
    """
    unpack(theta)
    fitted = [prediction(rec.user_id, rec.asin) for rec in train_df.itertuples(index=False)]
    objective = MSE(fitted, labels)
    print("MSE = " + str(objective))
    # Users first, then items, so penalty terms are accumulated in a fixed order.
    for keys, biases, factors in ((users, userBiases, userGamma), (items, itemBiases, itemGamma)):
        for key in keys:
            objective += lamb*biases[key]**2
            for k in range(K):
                objective += lamb*factors[key][k]**2
    return objective

In [12]:
import numpy
def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to ``theta``.

    Loads ``theta`` into the global model via ``unpack``, accumulates the
    squared-error gradient over every row of the global ``train_df`` (using the
    row's own ``rating`` as the target), adds the L2 penalty gradient, and
    flattens the result in the same layout ``unpack`` reads: alpha, user
    biases, item biases, user gammas, item gammas.

    Args:
        theta: Flat parameter vector in the layout ``unpack`` expects.
        labels: Accepted so the signature matches ``cost``; not read here.
        lamb: Regularisation strength.

    Returns:
        ``numpy.ndarray`` with one gradient entry per element of ``theta``.
    """
    unpack(theta)
    n_rows = len(train_df)

    grad_alpha = 0
    grad_user_bias = defaultdict(float)
    grad_item_bias = defaultdict(float)
    grad_user_gamma = {u: [0.0] * K for u in reviewsPerUser}
    grad_item_gamma = {i: [0.0] * K for i in reviewsPerItem}

    # Data term: each training row contributes 2/N * residual to the
    # parameters that took part in its prediction.
    for rec in train_df.itertuples(index=False):
        u, i = rec.user_id, rec.asin
        residual = prediction(u, i) - rec.rating
        step = 2/n_rows*residual
        grad_alpha += step
        grad_user_bias[u] += step
        grad_item_bias[i] += step
        for k in range(K):
            grad_user_gamma[u][k] += 2/n_rows*itemGamma[i][k]*residual
            grad_item_gamma[i][k] += 2/n_rows*userGamma[u][k]*residual

    # Penalty term: derivative of lamb * param**2 for every bias and gamma entry.
    for u in userBiases:
        grad_user_bias[u] += 2*lamb*userBiases[u]
        for k in range(K):
            grad_user_gamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        grad_item_bias[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            grad_item_gamma[i][k] += 2*lamb*itemGamma[i][k]

    # Flatten in the exact order unpack() consumes.
    flat = [grad_alpha]
    flat.extend(grad_user_bias[u] for u in users)
    flat.extend(grad_item_bias[i] for i in items)
    for u in users:
        flat.extend(grad_user_gamma[u])
    for i in items:
        flat.extend(grad_item_gamma[i])
    return numpy.array(flat)

In [33]:
import scipy.optimize

def train_model(lam=0.000018, maxiter=15):
    """Fit the latent-factor model with L-BFGS and load the fitted parameters.

    Builds an initial parameter vector (current ``alpha``, zero biases, small
    random gammas), minimises ``cost`` with gradient ``derivative`` using
    ``scipy.optimize.fmin_l_bfgs_b``, and then unpacks the optimiser's returned
    solution into the global model state used by ``prediction``.

    Args:
        lam: L2 regularisation strength passed to ``cost`` / ``derivative``.
        maxiter: Maximum number of L-BFGS iterations.

    Returns:
        The ``(theta, final_cost, info)`` tuple returned by ``fmin_l_bfgs_b``.
    """
    labels = [int(r) for r in ratingDict.values()]
    initial_theta = (
        [alpha]                                   # global offset
        + [0.0] * (nUsers + nItems)               # user and item biases
        + [random.random() * 0.1 - 0.05           # user and item gammas
           for _ in range(K * (nUsers + nItems))]
    )
    best_theta, best_cost, info = scipy.optimize.fmin_l_bfgs_b(
        cost, initial_theta, derivative, args=(labels, lam), maxiter=maxiter
    )
    # cost()/derivative() leave the globals at whichever point was evaluated
    # last, and the line search also evaluates rejected trial points. Load the
    # solution the optimiser actually returned so prediction() uses it.
    unpack(best_theta)
    return best_theta, best_cost, info

In [30]:
def test_model(i):
    global_vars()
    testRatingDict = {}
    for index, row in test_df.iterrows():
        user, item, r = row['user_id'], row['asin'], row['rating']
        testRatingDict[(user, item)] = r
    test_labels = [int(r) for r in testRatingDict.values()]

    predictions = []
    for index, row in test_df.iterrows():
        user, item = row['user_id'], row['asin']
        predictions.append(prediction(user, item))

    print(f"MSE on Test Set >{i}: {MSE(predictions, test_labels)}")

In [27]:
def global_vars():
    """Declare the notebook's shared names as global inside this function only.

    NOTE(review): a ``global`` statement applies solely to the function body
    that contains it. This body makes no assignments, so calling the helper
    from another function (or at module level) declares nothing for the caller
    and has no effect. Functions that rebind these names need their own
    ``global`` statement.
    """
    global df, train_df, test_df, reviewsPerUser, reviewsPerItem, ratingDict
    global ratingMean, alpha, userBiases, itemBiases, userGamma, itemGamma
    global K, users, items, N, nUsers, nItems


In [34]:
# Load the gzip-compressed JSON Lines review file, then keep only the most
# recent row (by `timestamp`) for each (user_id, asin) pair.
# Names assigned at module level are already global, so no declaration helper
# needs to be called first.
df = (
    pd.read_json(data_path + "Amazon_Fashion.jsonl.gz", lines=True, compression='gzip')
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['user_id', 'asin'], keep='last')
)

In [35]:
# Seed Python's RNG so the random gamma initialisation (and therefore the
# reported MSEs) can be reproduced; 42 matches the train/test split's
# random_state.
SEED = 42
random.seed(SEED)

# Number of latent factors per user/item; read as a global by the model functions.
K = 2

# Train and evaluate once per activity threshold: iteration i keeps users with
# more than i reviews (setup_dataset_values narrows the global df each time).
for i in range(5):
    # Fresh lookup tables so nothing leaks over from the previous threshold.
    reviewsPerUser = defaultdict(list)
    reviewsPerItem = defaultdict(list)
    ratingDict = {}

    setup_dataset_values(i)

    # Start the global offset at the mean training rating.
    ratingMean = sum(ratingDict.values()) / len(ratingDict)
    alpha = ratingMean
    print(alpha)

    N = len(train_df)
    nUsers = len(reviewsPerUser)
    nItems = len(reviewsPerItem)
    users = list(reviewsPerUser.keys())
    items = list(reviewsPerItem.keys())

    userBiases = defaultdict(float)
    itemBiases = defaultdict(float)

    userGamma = {}
    itemGamma = {}

    setup_k(K)

    train_model()

    test_model(i)

3.969301534165928
MSE = 2.036334278549398
MSE = 2.0322129300920215
MSE = 3.5571874425534995
MSE = 2.0319361604910915
MSE = 2.024920971608158
MSE = 2.0229982359430614
MSE = 2.015637988236508
MSE = 1.9914880980177287
MSE = 1.9247077307288114
MSE = 1.8718348873453623
MSE = 1.8123948118015945
MSE = 1.78497509622038
MSE = 1.7771442791317196
MSE = 1.7745681922680299
MSE = 1.7711148203916114
MSE = 1.7665543615874901
MSE = 1.7647591110293763
MSE = 1.7653072016426552
MSE = 2.0010640978752816
MSE = 1.76530548816635
MSE = 1.7648668949944464
MSE on Test Set >0: 1.9470041345212992
4.03654636820466
MSE = 1.822457652361796
MSE = 1.8164019682959098
MSE = 1.7932960753366582
MSE = 63.70139010285385
MSE = 1.7924234182835677
MSE = 1.6625066678710019
MSE = 1.6400992422091958
MSE = 1.5610827893776549
MSE = 1.3986900838386163
MSE = 1.3481175955541347
MSE = 1.2796716280961113
MSE = 1.2646149288931114
MSE = 1.2767973633406398
MSE = 1.2783887128950888
MSE = 1.273345561342958
MSE = 1.267436576953906
MSE = 1.2588