In [1]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
import random
import gzip
import math

In [2]:
# Mount Google Drive inside the Colab runtime; data_path below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive; the loading cell appends the dataset file name to it.
data_path = "/content/drive/MyDrive/ass2/"

In [4]:
def setup_dataset_values(i):
    """Filter the global review frame, split it, and index the training rows.

    Keeps only the rows whose ``user_id`` occurs more than ``i`` times in the
    current global ``df``, rebinds ``df`` to that subset and splits it 80/20
    into the globals ``train_df`` / ``test_df`` (fixed ``random_state=42``).
    Each training row is then recorded in the global containers
    ``reviewsPerUser``, ``reviewsPerItem`` and ``ratingDict``; the caller is
    responsible for resetting those containers before calling.

    Note: ``df`` itself is overwritten, so a later call filters the frame left
    behind by the previous call, not the originally loaded data.

    Args:
        i: A user is kept only if they have strictly more than ``i`` reviews.
    """
    # Only the names rebound here need a ``global`` declaration; the dict
    # containers are mutated in place. The former ``global_vars()`` call was
    # dropped because ``global`` statements inside another function never
    # apply to this scope.
    global df, train_df, test_df
    df = df[df['user_id'].map(df['user_id'].value_counts()) > i]
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Walk just the three needed columns; ``iterrows`` would build a full
    # Series object for every row.
    for user, item, r in zip(train_df['user_id'], train_df['asin'], train_df['rating']):
        reviewsPerUser[user].append((item, int(r)))
        reviewsPerItem[item].append((user, int(r)))
        ratingDict[(user, item)] = r

In [5]:
def setup_k(K=2):
    """Give every indexed user and item a small random latent vector.

    Writes a fresh list of ``K`` values, each equal to
    ``random.random() * 0.1 - 0.05``, into the global ``userGamma`` for every
    key of ``reviewsPerUser`` and then into ``itemGamma`` for every key of
    ``reviewsPerItem``.

    Args:
        K: Number of latent dimensions per vector.
    """
    global_vars()

    def small_random_vector():
        # One draw per latent dimension, shifted by -0.05.
        return [random.random() * 0.1 - 0.05 for _ in range(K)]

    for user in reviewsPerUser:
        userGamma[user] = small_random_vector()

    for item in reviewsPerItem:
        itemGamma[item] = small_random_vector()

In [6]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: Iterable of predicted values.
        labels: Iterable of true values, aligned one-to-one with
            ``predictions``.

    Returns:
        The average of ``(prediction - label) ** 2`` over all pairs.

    Raises:
        ValueError: If the two inputs differ in length (``zip`` would
            otherwise silently drop the unmatched tail) or are empty.
    """
    # Materialise once so plain iterators are accepted and lengths can be compared.
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "MSE needs equally long inputs, got "
            + str(len(predictions)) + " predictions and "
            + str(len(labels)) + " labels"
        )
    if not predictions:
        raise ValueError("MSE is undefined for empty input")
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [7]:
def unpack(theta):
    """Spread a flat parameter vector over the model's global parameters.

    Layout of ``theta``, read left to right:
      * 1 value               -> ``alpha``
      * ``nUsers`` values     -> ``userBiases`` (paired with ``users`` in order)
      * ``nItems`` values     -> ``itemBiases`` (paired with ``items`` in order)
      * ``K`` values per user -> ``userGamma[u]`` for each ``u`` in ``users``
      * ``K`` values per item -> ``itemGamma[i]`` for each ``i`` in ``items``

    ``alpha``, ``userBiases`` and ``itemBiases`` are rebound; ``userGamma`` and
    ``itemGamma`` are updated in place with slices of ``theta``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    # Fixed offsets of the two bias segments.
    user_bias_start = 1
    item_bias_start = user_bias_start + nUsers
    gamma_start = item_bias_start + nItems

    alpha = theta[0]
    userBiases = dict(zip(users, theta[user_bias_start:item_bias_start]))
    itemBiases = dict(zip(items, theta[item_bias_start:gamma_start]))

    # Latent vectors follow as consecutive K-sized chunks: all users, then all items.
    cursor = gamma_start
    for keys, factors in ((users, userGamma), (items, itemGamma)):
        for key in keys:
            factors[key] = theta[cursor:cursor + K]
            cursor += K

In [8]:
def inner(x, y):
    """Return the dot product of ``x`` and ``y``.

    Elements are paired with ``zip``, so any surplus elements of the longer
    input are ignored.
    """
    return sum(left * right for left, right in zip(x, y))

In [9]:
def prediction(user, item):
    """Predict the rating ``user`` would give ``item`` from the global parameters.

    Falls back gracefully for unseen ids: an id missing from ``userBiases`` /
    ``itemBiases`` contributes neither a bias nor a latent-factor term, so the
    result degrades to ``alpha`` plus whichever bias is known.
    """
    user_known = user in userBiases
    item_known = item in itemBiases

    if user_known and item_known:
        return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])
    if user_known:
        return alpha + userBiases[user]
    if item_known:
        return alpha + itemBiases[item]
    return alpha

In [10]:
def cost(theta, labels, lamb):
    """Regularised training objective for the flat parameter vector ``theta``.

    Unpacks ``theta`` into the global parameters, predicts every row of
    ``train_df``, prints the resulting MSE and returns that MSE plus
    ``lamb`` times the sum of squares of all user/item biases and latent
    factors (``alpha`` is not penalised).

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: True ratings, aligned with the rows of ``train_df``.
        lamb: Regularisation strength.
    """
    unpack(theta)

    fitted = []
    for rec in train_df.itertuples(index=False):
        fitted.append(prediction(rec.user_id, rec.asin))

    objective = MSE(fitted, labels)
    print("MSE = " + str(objective))

    # Penalty terms are added one by one, users first and then items.
    for user in users:
        objective += lamb*userBiases[user]**2
        user_factors = userGamma[user]
        for k in range(K):
            objective += lamb*user_factors[k]**2
    for item in items:
        objective += lamb*itemBiases[item]**2
        item_factors = itemGamma[item]
        for k in range(K):
            objective += lamb*item_factors[k]**2
    return objective

In [11]:
import numpy
def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to ``theta``.

    Unpacks ``theta`` into the global parameters, accumulates the MSE gradient
    over every row of ``train_df``, adds the L2 penalty gradient and returns
    the result as a numpy array laid out exactly like ``theta``
    (alpha, user biases, item biases, user factors, item factors).

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: Not read here; ratings are taken from ``train_df`` directly.
        lamb: Regularisation strength.
    """
    unpack(theta)
    N = len(train_df)

    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in reviewsPerUser}
    dItemGamma = {i: [0.0] * K for i in reviewsPerItem}

    # Data term: every training row contributes to alpha, both biases and both factor vectors.
    for rec in train_df.itertuples(index=False):
        u, i = rec.user_id, rec.asin
        diff = prediction(u, i) - rec.rating
        bias_step = 2/N*diff
        dalpha += bias_step
        dUserBiases[u] += bias_step
        dItemBiases[i] += bias_step
        user_vec, item_vec = userGamma[u], itemGamma[i]
        user_grad, item_grad = dUserGamma[u], dItemGamma[i]
        for k in range(K):
            user_grad[k] += 2/N*item_vec[k]*diff
            item_grad[k] += 2/N*user_vec[k]*diff

    # Penalty term: derivative of lamb * p**2 for each bias and factor.
    shrink = 2*lamb
    for u in userBiases:
        dUserBiases[u] += shrink*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += shrink*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += shrink*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += shrink*itemGamma[i][k]

    # Flatten in the same order unpack() reads the vector.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [18]:
import scipy.optimize

def train_model(lam=0.000018, iter=15):
    """Fit the latent-factor model with L-BFGS and load the optimum into the globals.

    The starting vector is ``alpha`` followed by zero biases and freshly drawn
    random latent factors, in the layout ``unpack`` expects. After the
    optimiser returns, its reported optimum is unpacked explicitly: previously
    the result was discarded, leaving the global parameters at whichever
    ``theta`` the optimiser happened to evaluate last, which can be a
    rejected line-search trial point rather than the best one found.

    Args:
        lam: Regularisation strength forwarded to ``cost`` / ``derivative``.
        iter: Maximum number of L-BFGS iterations (name kept for
            compatibility although it shadows the builtin).

    Returns:
        The ``(theta, objective, info)`` tuple returned by
        ``scipy.optimize.fmin_l_bfgs_b``.
    """
    labels = [int(r) for r in ratingDict.values()]
    theta0 = ([alpha] +  # Initialize alpha
              [0.0]*(nUsers+nItems) +  # Initialize beta
              [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))])  # Gamma
    theta, objective, info = scipy.optimize.fmin_l_bfgs_b(
        cost, theta0, derivative, args=(labels, lam), maxiter=iter)
    # Make sure the model used for prediction is the returned optimum.
    unpack(theta)
    return theta, objective, info

In [19]:
def test_model(i):
    """Print and return the model's MSE on the global ``test_df``.

    Labels and predictions are collected in a single pass over the test rows
    so they stay aligned row by row. The earlier version collected labels
    through a ``(user, item)``-keyed dict, which would drop repeated pairs and
    shift labels against predictions.

    Args:
        i: Threshold label, used only in the printed message.

    Returns:
        The test-set mean squared error.
    """
    test_labels = []
    predictions = []
    for user, item, r in zip(test_df['user_id'], test_df['asin'], test_df['rating']):
        test_labels.append(int(r))
        predictions.append(prediction(user, item))

    mse = MSE(predictions, test_labels)
    print(f"MSE on Test Set >{i}: {mse}")
    return mse

In [93]:
def most_popular_items():
  """Return the mean training rating of every item in ``train_df``.

  Despite the name, no popularity ranking is computed: the result maps each
  ``asin`` seen in the training split to the average of its ratings.

  Returns:
      A ``defaultdict(list)`` whose stored values are float means; looking up
      an item that was never seen therefore yields an empty list.
  """
  items_ratings = defaultdict(list)
  # Only two columns are needed, so iterate them directly instead of using
  # ``iterrows`` (which builds a Series per row).
  for item, r in zip(train_df['asin'], train_df['rating']):
    items_ratings[item].append(r)

  # Replace each rating list by its mean (keys are unchanged while iterating).
  for item, ratings in items_ratings.items():
    items_ratings[item] = sum(ratings) / len(ratings)

  return items_ratings


In [103]:
# def test_binary_model(i, items_ratings, min_pred=4.5, pred_weight=0.8):
#     global_vars()
#     testRatingDict = {}
#     for index, row in test_df.iterrows():
#         user, item, r = row['user_id'], row['asin'], row['rating']
#         testRatingDict[(user, item)] = 1 if r >= 5 else 0
#     test_labels = [int(r) for r in testRatingDict.values()]

#     predictions = []
#     for index, row in test_df.iterrows():
#         user, item = row['user_id'], row['asin']
#         item_rating = items_ratings[item]
#         user_rating = prediction(user, item)
#         if item_rating == []:
#           item_rating = user_rating
#         predictions.append(1 if (float(prediction(user, item)) * pred_weight + item_rating * (1-pred_weight)) > 4.25 else 0)

#     correct_predictions = sum(p == t for p, t in zip(predictions, test_labels))
#     accuracy = correct_predictions / len(test_labels)
#     print(f"Accuracy on Test Set >{i}: {accuracy}")

In [109]:
def test_binary_model(i, min_pred=4.5):
    """Print and return the accuracy of thresholded predictions on ``test_df``.

    A test row counts as positive when its rating is at least 5, and the model
    predicts positive when ``prediction(user, item)`` exceeds ``min_pred``.
    Labels and predictions are gathered in one pass so they stay aligned row
    by row. The earlier version built labels through a ``(user, item)``-keyed
    dict, which would drop repeated pairs and misalign the comparison.

    Args:
        i: Threshold label, used only in the printed message.
        min_pred: Predicted rating above which a row is classified positive.

    Returns:
        The fraction of test rows classified correctly.
    """
    test_labels = []
    predictions = []
    for user, item, r in zip(test_df['user_id'], test_df['asin'], test_df['rating']):
        test_labels.append(1 if r >= 5 else 0)
        predictions.append(1 if prediction(user, item) > min_pred else 0)

    correct_predictions = sum(p == t for p, t in zip(predictions, test_labels))
    accuracy = correct_predictions / len(test_labels)
    print(f"Accuracy on Test Set >{i}: {accuracy}")
    return accuracy

In [39]:
def global_vars():
    """Declare the notebook's shared names as global inside this function only.

    A ``global`` statement applies solely to the function body that contains
    it, so calling this helper does not change how names are resolved in the
    caller. Any function that rebinds one of these names still needs its own
    ``global`` statement.
    """
    global df, train_df, test_df
    global reviewsPerUser, reviewsPerItem, ratingDict
    global ratingMean, alpha
    global userBiases, itemBiases, userGamma, itemGamma
    global K, users, items, N, nUsers, nItems


In [102]:
global_vars()
# Read the gzipped JSON-lines reviews, order them by timestamp and keep only
# the most recent row for each (user_id, asin) pair.
df = (
    pd.read_json(data_path + "Amazon_Fashion.jsonl.gz", lines=True, compression='gzip')
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['user_id', 'asin'], keep='last')
)

In [74]:
# Train and evaluate the latent-factor model once per user-activity threshold i.
# NOTE: setup_dataset_values(i) rebinds the global df to its filtered subset, so
# every iteration filters the frame left by the previous one, not the loaded data.
for i in range(5):
  # Fresh per-run indexes; setup_dataset_values fills them from the training split.
  reviewsPerUser = defaultdict(list)
  reviewsPerItem = defaultdict(list)
  ratingDict = {}

  setup_dataset_values(i)

  # Start the global offset alpha at the mean training rating.
  ratingMean = sum(ratingDict.values()) / len(ratingDict)
  alpha = ratingMean
  print(alpha)

  # Sizes and fixed key orderings that unpack() relies on to slice the flat parameter vector.
  N = len(train_df)
  nUsers = len(reviewsPerUser)
  nItems = len(reviewsPerItem)
  users = list(reviewsPerUser.keys())
  items = list(reviewsPerItem.keys())

  # Empty parameter containers; training replaces/fills them through unpack().
  userBiases = defaultdict(float)
  itemBiases = defaultdict(float)

  userGamma = {}
  itemGamma = {}

  # Number of latent dimensions per user/item vector.
  K=2
  setup_k(K)

  train_model()

  test_model(i)

4.302792321116929
MSE = 0.7485909219785619
MSE = 0.6557793956145602
MSE = 0.6856859678841623
MSE = 0.6438548633715457
MSE = 0.6358874478205441
MSE = 0.6248790218743709
MSE = 0.5998303027629743
MSE = 0.5631131866580048
MSE = 0.4507885065639781
MSE = 0.2816122574069517
MSE = 0.11628780580733528
MSE = 0.090087821823675
MSE = 0.055449658819668124
MSE = 0.053431803247919234
MSE = 0.051690472812926276
MSE = 0.0473536971363998
MSE = 0.043487433622815756
MSE on Test Set >0: 0.8280260375173287
4.302792321116929
MSE = 0.7486681085494382
MSE = 0.6558502685763842
MSE = 0.6857915956674386
MSE = 0.6439181099261089
MSE = 0.6359523158493781
MSE = 0.6249491317392294
MSE = 0.5999218316760455
MSE = 0.5632511720332779
MSE = 0.4512120742927437
MSE = 0.2823367783522124
MSE = 0.11637852612765753
MSE = 0.08938196467354244
MSE = 0.054759421299688894
MSE = 0.052674633912387116
MSE = 0.05067304768482212
MSE = 0.04587957018989647


KeyboardInterrupt: 

In [108]:
# Same pipeline as the threshold sweep, run for the single threshold i = 5 with
# a shorter optimisation (iter=6) and an additional binary-accuracy evaluation.
# NOTE: setup_dataset_values(i) filters whatever the global df currently holds.
for i in [5]:
  # Fresh per-run indexes; setup_dataset_values fills them from the training split.
  reviewsPerUser = defaultdict(list)
  reviewsPerItem = defaultdict(list)
  ratingDict = {}

  setup_dataset_values(i)

  # Start the global offset alpha at the mean training rating.
  ratingMean = sum(ratingDict.values()) / len(ratingDict)
  alpha = ratingMean
  print(alpha)

  # Sizes and fixed key orderings that unpack() relies on to slice the flat parameter vector.
  N = len(train_df)
  nUsers = len(reviewsPerUser)
  nItems = len(reviewsPerItem)
  users = list(reviewsPerUser.keys())
  items = list(reviewsPerItem.keys())

  # Empty parameter containers; training replaces/fills them through unpack().
  userBiases = defaultdict(float)
  itemBiases = defaultdict(float)

  userGamma = {}
  itemGamma = {}

  # Number of latent dimensions per user/item vector.
  K=2
  setup_k(K)

  train_model(iter=6)

  test_model(i)

  # Accuracy of "rating >= 5" classification using the default prediction cut-off.
  test_binary_model(i)

4.091264751154438
MSE = 1.5107765293965436
MSE = 1.4911956432941582
MSE = 1.4223045419726748
MSE = 18.73306612072364
MSE = 1.4168129005188979
MSE = 1.2940768356201795
MSE = 1.2610934499504882
MSE = 1.1516939257175234
MSE = 0.9679665701543252
MSE = 0.8931284399477152
MSE on Test Set >5: 1.3821747026996511
Accuracy on Test Set >5: 0.6128783991790662


In [107]:
# Plain-text preview of the first rows of the global review frame.
print(df.head())

         rating                                        title  \
317735        3                                     So so...   
1008946       4            Soft as a glove, warm and toasty!   
829044        4  Great compartmentalization, a little stress   
270950        5                             Fabulous design!   
270949        5                     Really, a ten-star item!   

                                                      text images        asin  \
317735   I bought this item because I dread toe shoes. ...     []  B000689EOY   
1008946  Feb 2006:<br />I'm the type of guy who has nev...     []  B000AS2IX4   
829044   Pros:<br /><br />1.great compartmentalization ...     []  B000E3F1SO   
270950   I love animal-inspired jewelry, and this is on...     []  B00064USCU   
270949   Of all the wonderful pieces I have found at Da...     []  B00064UU1Y   

        parent_asin                       user_id           timestamp  \
317735   B000689EOY  AG6BLQGKWPSBB4KW7O4LFYTN4ONA 2004-