In [26]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
import random
import gzip
import math

In [2]:
# Colab-only step: mount Google Drive so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder (inside the mounted Drive) that the data-loading cell reads the dataset from.
# NOTE(review): hard-coded absolute Colab path; adjust when running outside Colab.
data_path = "/content/drive/MyDrive/ass2/"

In [47]:
def setup_dataset_values(i):
    """Filter the reviews, split them into train/test and index the training rows.

    Steps:
      1. Add a ``year`` column derived from ``timestamp``.
      2. Keep only reviews written by users with more than ``i`` reviews.
      3. Split 90/10 into ``train_df`` / ``test_df`` (fixed ``random_state=42``).
      4. Append every training row to the module-level ``reviewsPerUser``,
         ``reviewsPerItem`` and ``ratingDict`` containers.

    Side effects:
        Rebinds the globals ``df`` (to the filtered frame), ``train_df`` and
        ``test_df``; mutates ``reviewsPerUser``, ``reviewsPerItem`` and
        ``ratingDict`` in place. The caller is expected to reset those three
        containers before calling, otherwise entries accumulate.

    Args:
        i: Minimum-activity threshold; users need strictly more than ``i``
            reviews to be kept.
    """
    global df, train_df, test_df

    # `assign` builds a new frame instead of writing into `df`, which after a
    # previous call is itself a filtered slice (avoids SettingWithCopyWarning).
    df = df.assign(year=pd.to_datetime(df['timestamp']).dt.year)

    reviews_per_user = df['user_id'].map(df['user_id'].value_counts())
    # Explicit copy so later column writes on `df` never target a view.
    df = df[reviews_per_user > i].copy()

    train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

    # itertuples avoids building a Series per row (as iterrows does).
    for row in train_df.itertuples(index=False):
        user, item, r, year = row.user_id, row.asin, row.rating, row.year
        reviewsPerUser[user].append((item, int(r), year))
        reviewsPerItem[item].append((user, int(r), year))
        ratingDict[(user, item, year)] = r

In [21]:
def setup_k(K=2):
    """Give every known user and item a fresh random latent vector of length K.

    Each component is drawn uniformly from [-0.05, 0.05). Users (keys of
    ``reviewsPerUser``) are processed first, then items (keys of
    ``reviewsPerItem``); results are stored in the module-level ``userGamma``
    and ``itemGamma`` dicts.
    """
    global_vars()

    def small_random_vector():
        # One uniform draw per latent dimension, centred on zero.
        return [random.random() * 0.1 - 0.05 for _ in range(K)]

    for user in reviewsPerUser:
        userGamma[user] = small_random_vector()

    for item in reviewsPerItem:
        itemGamma[item] = small_random_vector()

In [22]:
def MSE(predictions, labels):
    """Return the mean squared error between paired predictions and labels.

    Pairs are formed with ``zip``, so the longer sequence is truncated.
    Raises ZeroDivisionError when there are no pairs.
    """
    squared_error_total = 0
    pair_count = 0
    for predicted, actual in zip(predictions, labels):
        squared_error_total += (predicted - actual) ** 2
        pair_count += 1
    return squared_error_total / pair_count

In [52]:
def accuracy(predictions, labels):
    """Return the fraction of labels matched by the rounded predictions.

    A prediction counts as correct when ``round(pred) == label``. The
    denominator is ``len(labels)``, even if ``predictions`` is shorter.
    """
    hits = 0
    for pred, label in zip(predictions, labels):
        if round(pred) == label:
            hits += 1
    return hits / len(labels)

In [20]:
def unpack(theta):
    """Scatter the flat parameter vector ``theta`` into the model globals.

    Layout of ``theta`` (in order):
        [alpha]
        [one bias per user in ``users``]
        [one bias per item in ``items``]
        [one bias per year in ``years``]
        [K latent values per user, users in order]
        [K latent values per item, items in order]

    ``alpha``, ``userBiases``, ``itemBiases`` and ``timeBias`` are rebound;
    ``userGamma`` and ``itemGamma`` are updated in place with slices of theta.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma, timeBias

    # Segment boundaries of the bias section.
    user_start = 1
    item_start = user_start + nUsers
    year_start = item_start + nItems
    gamma_start = year_start + nYears

    alpha = theta[0]
    userBiases = dict(zip(users, theta[user_start:item_start]))
    itemBiases = dict(zip(items, theta[item_start:year_start]))
    timeBias = dict(zip(years, theta[year_start:gamma_start]))

    # Latent factors: consecutive K-sized chunks, all users then all items.
    offset = gamma_start
    for user in users:
        userGamma[user] = theta[offset:offset + K]
        offset += K
    for item in items:
        itemGamma[item] = theta[offset:offset + K]
        offset += K


In [24]:
def inner(x, y):
    """Return the dot product of two sequences (truncated to the shorter one)."""
    total = 0
    for a, b in zip(x, y):
        total += a * b
    return total

In [12]:
def prediction(user, item, year):
    """Predict the rating ``user`` would give ``item`` in ``year``.

    The estimate starts from the global offset ``alpha`` and adds every
    component that is known for the inputs:

      * ``userBiases[user]`` if the user was seen,
      * ``itemBiases[item]`` if the item was seen,
      * ``timeBias[year]`` if the year was seen,
      * the latent interaction ``inner(userGamma[user], itemGamma[item])``
        if both the user and the item were seen.

    Unknown users, items or years simply contribute nothing. The previous
    if/elif chain raised ``KeyError`` for an unseen user combined with an
    unseen item in a known year, and dropped the latent term when only the
    year was unknown.

    Returns:
        The predicted rating as a float-like number.
    """
    known_user = user in userBiases
    known_item = item in itemBiases

    pred = alpha
    if known_user:
        pred += userBiases[user]
    if known_item:
        pred += itemBiases[item]
    if year in timeBias:
        pred += timeBias[year]
    # The interaction term needs both embeddings to be available.
    if known_user and known_item:
        pred += inner(userGamma[user], itemGamma[item])
    return pred

In [14]:
def cost(theta, labels, lamb):
    """Regularised squared-error objective for the rating model.

    Loads ``theta`` into the model globals via ``unpack``, predicts every row
    of ``train_df``, prints the resulting MSE against ``labels`` and returns
    that MSE plus ``lamb`` times the squared user biases, item biases, latent
    factors and time biases (``alpha`` is not penalised).

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: Target ratings, aligned with the row order of ``train_df``.
        lamb: L2 regularisation strength.
    """
    unpack(theta)

    predictions = []
    for row in train_df.itertuples(index=False):
        predictions.append(prediction(row.user_id, row.asin, row.year))

    objective = MSE(predictions, labels)
    print("MSE = " + str(objective))

    # L2 penalty, accumulated user by user, then item by item, then per year.
    for user in users:
        objective += lamb * userBiases[user]**2
        user_factors = userGamma[user]
        for k in range(K):
            objective += lamb * user_factors[k]**2
    for item in items:
        objective += lamb * itemBiases[item]**2
        item_factors = itemGamma[item]
        for k in range(K):
            objective += lamb * item_factors[k]**2
    for year in years:
        objective += lamb * timeBias[year]**2

    return objective

In [72]:
def squash_prediction(pred, threshold=4.5, scale=5, steepness=10):
    """
    Map a raw rating prediction to a (0, 1) score with a piecewise-linear logit.

    The logit is 0 at ``threshold`` (so the output there is 0.5), falls
    linearly to ``-steepness`` at ``pred == 0`` and rises linearly to
    ``+steepness`` at ``pred == scale``; a sigmoid is then applied.

    Args:
        pred (float): Raw prediction, nominally in [0, scale].
        threshold (float): Prediction value mapped to 0.5.
        scale (float): Upper end of the nominal prediction range.
        steepness (float): Magnitude of the logit at both ends of the range.

    Returns:
        float: Sigmoid of the piecewise logit.
    """
    if pred > threshold:
        # Upper segment: [threshold, scale] -> [0, steepness]
        logit = steepness * ((pred - threshold) / (scale - threshold))
    else:
        # Lower segment: [0, threshold] -> [-steepness, 0]
        logit = -steepness * (1 - pred / threshold)

    return 1 / (1 + np.exp(-logit))


def cost_binary(theta, labels, lamb):
    """Binary classification objective: log loss plus L2 regularisation.

    Loads ``theta`` via ``unpack``, converts the rating prediction of every
    ``train_df`` row to a probability with ``squash_prediction``, prints the
    mean log loss against ``labels`` and returns it plus ``lamb`` times the
    squared biases and latent factors.

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: Targets aligned with ``train_df`` rows.
            NOTE(review): the log-loss formula is only meaningful for 0/1
            labels; confirm the caller does not pass raw star ratings.
        lamb: L2 regularisation strength.

    Returns:
        float: log loss + regularisation term.
    """
    unpack(theta)

    # Predictions as probabilities. (A debugging print of this whole list was
    # removed: it emitted one value per training row on every optimizer step.)
    predictions = [
        squash_prediction(prediction(row.user_id, row.asin, row.year))
        for row in train_df.itertuples(index=False)
    ]

    # Clip away exact 0/1 so the logarithms below stay finite.
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    labels = np.array(labels)
    log_loss = -np.mean(labels * np.log(predictions) + (1 - labels) * np.log(1 - predictions))

    print("Log Loss = " + str(log_loss))

    # L2 penalty on every bias and latent factor (alpha is not penalised).
    regularization = 0
    for u in users:
        regularization += lamb * userBiases[u]**2
        for k in range(K):
            regularization += lamb * userGamma[u][k]**2
    for i in items:
        regularization += lamb * itemBiases[i]**2
        for k in range(K):
            regularization += lamb * itemGamma[i][k]**2
    for year in years:
        regularization += lamb * timeBias[year]**2

    return log_loss + regularization


In [73]:
def derivative(theta, labels, lamb):
    """Gradient of the regularised squared-error objective w.r.t. ``theta``.

    Loads ``theta`` via ``unpack`` and accumulates, over every row of
    ``train_df``, the derivative of the mean squared error between
    ``prediction(...)`` and ``row.rating``; then adds the derivative of the L2
    penalty (``2 * lamb * parameter``) for every bias and latent factor.
    ``alpha`` receives no regularisation term.

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: Unused here; targets are read from ``train_df`` directly.
        lamb: L2 regularisation strength.

    Returns:
        np.ndarray laid out like ``theta``: alpha, user biases, item biases,
        time biases, user latent factors, item latent factors.

    NOTE(review): this is the gradient of the squared-error objective only; it
    does not differentiate the log loss computed by ``cost_binary``.
    """
    unpack(theta)
    N = len(train_df)  # local row count used to average the squared error
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dTimeBias = defaultdict(float)  # Gradient for timeBias
    dUserGamma = {}
    dItemGamma = {}
    # Zero-initialise one K-vector of latent-factor gradients per user / item.
    for u in reviewsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in reviewsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    # Data term: d/dp mean((pred - rating)^2) = 2/N * (pred - rating) * d(pred)/dp.
    for row in train_df.itertuples(index=False):
        u, i, year = row.user_id, row.asin, row.year
        pred = prediction(u, i, year)
        diff = pred - row.rating
        dalpha += 2/N * diff
        dUserBiases[u] += 2/N * diff
        dItemBiases[i] += 2/N * diff
        dTimeBias[year] += 2/N * diff  # Update timeBias gradient
        for k in range(K):
            # The interaction term is a product, so each side's gradient is the other side's factor.
            dUserGamma[u][k] += 2/N * itemGamma[i][k] * diff
            dItemGamma[i][k] += 2/N * userGamma[u][k] * diff
    # Regularisation term: d/dp (lamb * p^2) = 2 * lamb * p.
    for u in userBiases:
        dUserBiases[u] += 2 * lamb * userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2 * lamb * userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2 * lamb * itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2 * lamb * itemGamma[i][k]
    for year in timeBias:  # Regularization for timeBias
        dTimeBias[year] += 2 * lamb * timeBias[year]
    # Flatten in exactly the order `unpack` reads the parameters back.
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for year in years:
        dtheta.append(dTimeBias[year])  # Add timeBias gradient
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return np.array(dtheta)

In [74]:
import scipy.optimize

def train_model(lam=0.000018, iter=15, cost=cost):
    """Fit the model parameters with L-BFGS-B and load the optimum into the globals.

    The start point is the current ``alpha``, zero biases for every user, item
    and year, and small uniform random latent factors in [-0.05, 0.05).

    Args:
        lam: L2 regularisation strength forwarded to ``cost`` / ``derivative``.
        iter: Maximum number of L-BFGS-B iterations (name kept for callers
            that pass ``iter=...``; it shadows the builtin only locally).
        cost: Objective ``f(theta, labels, lam)`` to minimise.
            NOTE(review): the gradient is always ``derivative``; confirm it
            matches any non-default ``cost`` passed here.

    Returns:
        The ``(theta_opt, final_cost, info)`` tuple returned by
        ``scipy.optimize.fmin_l_bfgs_b``.
    """
    labels = [int(r) for r in ratingDict.values()]

    initial_theta = (
        [alpha]  # Initialize alpha
        + [0.0] * (nUsers + nItems + nYears)  # Initialize biases
        + [random.random() * 0.1 - 0.05 for _ in range(K * (nUsers + nItems))]  # Gamma
    )

    theta_opt, final_cost, info = scipy.optimize.fmin_l_bfgs_b(
        cost, initial_theta, derivative, args=(labels, lam), maxiter=iter
    )

    # The globals otherwise hold whichever theta the optimizer evaluated last,
    # which may be a line-search trial point rather than the returned optimum.
    unpack(theta_opt)
    return theta_opt, final_cost, info

In [63]:
def test_model(i):
    global_vars()
    testRatingDict = {}
    for index, row in test_df.iterrows():
        user, item, year, r = row['user_id'], row['asin'], row['year'], row['rating']
        testRatingDict[(user, item, year)] = r
    test_labels = [int(r) for r in testRatingDict.values()]

    predictions = []
    for index, row in test_df.iterrows():
        user, item, year = row['user_id'], row['asin'], row['year']
        predictions.append(prediction(user, item, year))

    print(f"MSE on Test Set >{i}: {MSE(predictions, test_labels)}")

In [70]:
def test_binary_model(i, min_pred=4.5):
    global_vars()
    testRatingDict = {}
    for index, row in test_df.iterrows():
        user, item, r = row['user_id'], row['asin'], row['rating']
        testRatingDict[(user, item)] = 1 if r >= 5 else 0
    test_labels = [int(r) for r in testRatingDict.values()]

    predictions = []
    for index, row in test_df.iterrows():
        user, item = row['user_id'], row['asin']
        predictions.append(1 if prediction(user, item) > min_pred else 0)

    correct_predictions = sum(p == t for p, t in zip(predictions, test_labels))
    accuracy = correct_predictions / len(test_labels)
    print(f"Accuracy on Test Set >{i}: {accuracy}")

In [None]:
def most_popular_items():
    """Return the mean training rating of every item seen in ``train_df``.

    Returns:
        A ``defaultdict(list)`` mapping item id (``asin``) to its average
        rating. Looking up an item that never appeared yields (and stores)
        an empty list.
    """
    global_vars()

    ratings_by_item = defaultdict(list)
    for _, row in train_df.iterrows():
        ratings_by_item[row['asin']].append(row['rating'])

    # Collapse each item's list of ratings into its mean, in place.
    for item in ratings_by_item:
        item_scores = ratings_by_item[item]
        ratings_by_item[item] = sum(item_scores) / len(item_scores)

    # sorted_items = sorted(items_ratings.items(), key=lambda x: x[1], reverse=True)

    # num_items = int(threshold * len(sorted_items))
    # for i in range(num_items):
    #   item, num_items = sorted_items[i]
    #   sorted_items = item, (num_items - i)/num_items * 5

    return ratings_by_item


In [None]:
# def test_binary_model(i, items_ratings, min_pred=4.5, pred_weight=0.8):
#     global_vars()
#     testRatingDict = {}
#     for index, row in test_df.iterrows():
#         user, item, r = row['user_id'], row['asin'], row['rating']
#         testRatingDict[(user, item)] = 1 if r >= 5 else 0
#     test_labels = [int(r) for r in testRatingDict.values()]

#     predictions = []
#     for index, row in test_df.iterrows():
#         user, item = row['user_id'], row['asin']
#         item_rating = items_ratings[item]
#         user_rating = prediction(user, item)
#         if item_rating == []:
#           item_rating = user_rating
#         predictions.append(1 if (float(prediction(user, item)) * pred_weight + item_rating * (1-pred_weight)) > 4.25 else 0)

#     correct_predictions = sum(p == t for p, t in zip(predictions, test_labels))
#     accuracy = correct_predictions / len(test_labels)
#     print(f"Accuracy on Test Set >{i}: {accuracy}")

In [11]:
def global_vars():
    """List the module-level names shared by the notebook's functions.

    The ``global`` statements below only affect name binding inside this
    function's own body. Calling ``global_vars()`` from another function (or
    from a cell) therefore has no effect on that caller: a function that
    needs to rebind one of these names must declare it ``global`` itself.
    This function has no other statements, so it does nothing at runtime.
    """
    global df, train_df, test_df, reviewsPerUser, reviewsPerItem, ratingDict
    global ratingMean, alpha, userBiases, itemBiases, userGamma, itemGamma
    global timeBias  # Add timeBias
    global K, users, items, years, N, nUsers, nItems, nYears

In [64]:
# Load the gzip-compressed JSON-lines review file into `df`.
# The global_vars() call is not needed at cell level: names assigned in a cell are already module-level.
global_vars()
df = pd.read_json(data_path + "Amazon_Fashion.jsonl.gz", lines=True, compression='gzip')
# Keep a single review per (user_id, asin) pair: after sorting by timestamp, the latest one wins.
df = df.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'asin'], keep='last')

In [49]:
# Binary-target variant of the data: work on a copy so `df` keeps the original star ratings.
df_fiveStar = df.copy()
# 1 for ratings of five or more, 0 otherwise.
df_fiveStar['rating'] = df_fiveStar['rating'].map(lambda rating: int(rating >= 5))

In [51]:
# Preview the first rows of the review table.
# NOTE(review): the saved output below already shows a `year` column, which is only
# added by later code, so this cell was presumably last run out of order.
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,year
317735,3,So so...,I bought this item because I dread toe shoes. ...,[],B000689EOY,B000689EOY,AG6BLQGKWPSBB4KW7O4LFYTN4ONA,2004-11-23 16:49:07,9,True,2004
1008946,4,"Soft as a glove, warm and toasty!",Feb 2006:<br />I'm the type of guy who has nev...,[],B000AS2IX4,B000AS2IX4,AGI3EZWWHBQJAOF3UP2USMJQHIIQ,2006-02-20 16:15:32,15,True,2006
829044,4,"Great compartmentalization, a little stress",Pros:<br /><br />1.great compartmentalization ...,[],B000E3F1SO,B000E3F1SO,AGMWACNMAG74AXBF7IJ22IOZSZPA,2007-01-10 20:06:21,7,True,2007
270950,5,Fabulous design!,"I love animal-inspired jewelry, and this is on...",[],B00064USCU,B00064USCU,AF52LUUKLAOPKZCIH7N5OY5FVSVQ,2007-02-21 00:15:43,6,False,2007
270949,5,"Really, a ten-star item!",Of all the wonderful pieces I have found at Da...,[],B00064UU1Y,B00064UU1Y,AF52LUUKLAOPKZCIH7N5OY5FVSVQ,2007-02-21 00:45:14,11,False,2007


In [9]:
# Add the review year as a column (the `.dt` accessor needs `timestamp` to be a datetime column).
df['year'] = df['timestamp'].dt.year
# Mean star rating per calendar year.
average_rating_by_year = df.groupby('year')['rating'].mean()
print(average_rating_by_year)

year
2002    4.833333
2003    4.500000
2004    3.941176
2005    3.753623
2006    4.026316
2007    4.187135
2008    4.268503
2009    4.259286
2010    4.165227
2011    4.106914
2012    4.173492
2013    4.126679
2014    4.095253
2015    4.101071
2016    4.059618
2017    3.988711
2018    3.971940
2019    3.999919
2020    3.980213
2021    3.822763
2022    3.778959
2023    3.921193
Name: rating, dtype: float64


In [28]:
# Train and evaluate the rating model for activity thresholds 0..4
# (users must have more than `i` reviews to be kept).
# Note: setup_dataset_values rebinds the global `df` to the filtered frame,
# so each iteration filters the result of the previous one.
for i in range(5):
  # Fresh containers for this run; setup_dataset_values fills them.
  reviewsPerUser = defaultdict(list)
  reviewsPerItem = defaultdict(list)
  ratingDict = {}

  setup_dataset_values(i)

  # Start the global offset at the mean training rating.
  ratingMean = sum(ratingDict.values()) / len(ratingDict)
  alpha = ratingMean
  print(alpha)

  N = len(train_df)
  nUsers = len(reviewsPerUser)
  nItems = len(reviewsPerItem)
  users = list(reviewsPerUser.keys())
  items = list(reviewsPerItem.keys())

  # Year bookkeeping required by train_model/unpack/cost; without it this
  # cell only worked when a later cell had already defined these names.
  timeBias = defaultdict(float)
  years = df['year'].unique()
  nYears = len(years)

  userBiases = defaultdict(float)
  itemBiases = defaultdict(float)

  userGamma = {}
  itemGamma = {}

  K=2
  setup_k(K)

  train_model()

  test_model(i)

4.091264751154438
MSE = 1.5107585858058121
MSE = 1.5698315397763944
MSE = 1.5033516927663244
MSE = 1.5061404153886362
MSE = 1.5026883567866838
MSE = 1.5020493085760187
MSE = 1.4997711825636613
MSE = 1.488364308627418
MSE = 1.4762649171138706
MSE = 1.4269424519362037
MSE = 1.3588835326980806
MSE = 1.6606406954335793
MSE = 1.3344245844923641
MSE = 1.2680038558560565
MSE = 1.220486652591355
MSE = 1.1791252005039692
MSE = 1.1406472674571968
MSE = 1.129418657873728
MSE = 1.109238299759471
MSE = 1.0714052553971485
MSE = 1.0563574734442958
MSE on Test Set >0: 1.439021118376767
4.091264751154438
MSE = 1.5107922041404631
MSE = 1.5698247129403464
MSE = 1.5033833241108974
MSE = 1.5061327816128394
MSE = 1.5027200586684362
MSE = 1.5020771312309769
MSE = 1.4997870370745383


KeyboardInterrupt: 

In [58]:
# Single run at activity threshold 5 (users with more than 5 reviews), trained for up
# to 26 optimizer iterations, then evaluated as both a rating regressor and a five-star classifier.
for i in [5]:
  # Fresh containers for this run; setup_dataset_values fills them.
  reviewsPerUser = defaultdict(list)
  reviewsPerItem = defaultdict(list)
  ratingDict = {}

  setup_dataset_values(i)

  # Start the global offset at the mean training rating.
  ratingMean = sum(ratingDict.values()) / len(ratingDict)
  alpha = ratingMean
  print(alpha)

  N = len(train_df)
  nUsers = len(reviewsPerUser)
  nItems = len(reviewsPerItem)
  users = list(reviewsPerUser.keys())
  items = list(reviewsPerItem.keys())

  # One time-bias parameter per distinct year in the filtered `df` (train and test rows).
  timeBias = defaultdict(float)
  years = df['year'].unique()
  nYears = len(years)

  userBiases = defaultdict(float)
  itemBiases = defaultdict(float)

  userGamma = {}
  itemGamma = {}

  K=2
  setup_k(K)

  train_model(iter=26)

  test_model(i)

  test_binary_model(i)
  # NOTE(review): the trailing comment below is presumably a hand-recorded earlier result; confirm its meaning.
 #25 1.354

KeyboardInterrupt: 

In [None]:
# Plain-text preview of the first rows of `df`.
print(df.head())

         rating                                        title  \
317735        3                                     So so...   
1008946       4            Soft as a glove, warm and toasty!   
829044        4  Great compartmentalization, a little stress   
270950        5                             Fabulous design!   
270949        5                     Really, a ten-star item!   

                                                      text images        asin  \
317735   I bought this item because I dread toe shoes. ...     []  B000689EOY   
1008946  Feb 2006:<br />I'm the type of guy who has nev...     []  B000AS2IX4   
829044   Pros:<br /><br />1.great compartmentalization ...     []  B000E3F1SO   
270950   I love animal-inspired jewelry, and this is on...     []  B00064USCU   
270949   Of all the wonderful pieces I have found at Da...     []  B00064UU1Y   

        parent_asin                       user_id           timestamp  \
317735   B000689EOY  AG6BLQGKWPSBB4KW7O4LFYTN4ONA 2004-

In [None]:
# Same run as the threshold-5 cell, but optimising the log-loss objective `cost_binary`.
# NOTE(review): train_model always passes `derivative` (the squared-error gradient) to the
# optimizer and builds its labels from the raw ratings in `ratingDict`; with the
# `df_fiveStar` swap below commented out, the objective and gradient do not match.
for i in [5]:
  #df = df_fiveStar.copy()
  # Fresh containers for this run; setup_dataset_values fills them.
  reviewsPerUser = defaultdict(list)
  reviewsPerItem = defaultdict(list)
  ratingDict = {}

  setup_dataset_values(i)

  # Start the global offset at the mean training rating.
  ratingMean = sum(ratingDict.values()) / len(ratingDict)
  alpha = ratingMean
  print(alpha)

  N = len(train_df)
  nUsers = len(reviewsPerUser)
  nItems = len(reviewsPerItem)
  users = list(reviewsPerUser.keys())
  items = list(reviewsPerItem.keys())

  # One time-bias parameter per distinct year in the filtered `df` (train and test rows).
  timeBias = defaultdict(float)
  years = df['year'].unique()
  nYears = len(years)

  userBiases = defaultdict(float)
  itemBiases = defaultdict(float)

  userGamma = {}
  itemGamma = {}

  K=2
  setup_k(K)

  train_model(iter=26, cost=cost_binary)

  test_model(i)

  test_binary_model(i)
  # NOTE(review): the trailing comment below is presumably a hand-recorded earlier result; confirm its meaning.
 #25 1.354