## Resources

- https://towardsdatascience.com/introduction-to-latent-matrix-factorization-recommender-systems-8dfc63b94875
- https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/
- julian mccauley amazon dataset

In [1]:
import os
import gzip
import json
import pandas as pd
import random
import collections
from collections import defaultdict
from tqdm import tqdm



In [2]:
# Define a “pretty print” function pprint for dict objects and dataframes.
pprint = lambda x: print(json.dumps(x, indent=2)) if isinstance(x, dict) else display(x)

## Loading in Data

In [3]:
reviews = []
items = []

In [4]:
file = "data/Software.jsonl.gz"

with gzip.open(file, 'rt') as fp:
    for line in fp:
        reviews.append(json.loads(line.strip()))


In [5]:
file = "data/meta_Software.jsonl.gz"

with gzip.open(file, 'rt') as fp:
    for line in fp:
        items.append(json.loads(line.strip()))

## Exploratory Analysis

In [None]:
reviews_df = pd.read_json("data/Software.jsonl.gz", lines=True, compression='gzip')

In [None]:
items_df = pd.read_json("data/meta_Software.jsonl.gz", lines=True, compression='gzip')

#### Display basic summary stats of the data

In [7]:
# reviews summary stats
print("############################## DATA INFO ############################## ")
print(reviews_df.info())      # DataFrame info

print("############################## DATA DESCRIBE ############################## ")
print(reviews_df.describe())  # Descriptive statistics

print("############################## DATA HEAD ############################## ")
print(reviews_df.head())      # First 5 rows

print("############################## DATA SHAPE ############################## ")
print(reviews_df.shape)       # Shape of DataFrame

# items summary stats
print("############################## DATA INFO ############################## ")
print(items_df.info())        # DataFrame info

print("############################## DATA DESCRIBE ############################## ")
print(items_df.describe())    # Descriptive statistics

print("############################## DATA HEAD ############################## ")
print(items_df.head())        # First 5 rows

print("############################## DATA SHAPE ############################## ")
print(items_df.shape)         # Shape of DataFrame

############################## DATA INFO ############################## 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4880181 entries, 0 to 4880180
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   rating             int64         
 1   title              object        
 2   text               object        
 3   images             object        
 4   asin               object        
 5   parent_asin        object        
 6   user_id            object        
 7   timestamp          datetime64[ns]
 8   helpful_vote       int64         
 9   verified_purchase  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 339.7+ MB
None
############################## DATA DESCRIBE ############################## 
             rating                      timestamp  helpful_vote
count  4.880181e+06                        4880181  4.880181e+06
mean   3.935087e+00  2016-08-11 20:56:15.320964864  4.

## Pre-Processing Dataset

In [None]:
print("######## NaNs ######## ")
print("Items: ")
print(items_df.isna().sum())
print("\nReviews: ")
print(reviews_df.isna().sum())

print("######## Duplicates ########")
print(reviews_df.duplicated(subset=['asin', 'user_id', 'timestamp']).sum())

######## NaNs ######## 
Items: 
main_category           0
title                   0
average_rating          0
rating_number           0
features                0
description             0
price               94886
images                  0
videos                  0
store               11331
categories              0
details                 0
parent_asin             0
bought_together    112590
dtype: int64

Reviews: 
rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64
######## Duplicates ########
7276


701528

#### K-core Filtering (k=5)
- Retain users with ≥5 reviews and items with ≥5 reviews.
- Remove duplicate reviews, keeping the earliest for each `{userID, parent_asin}`.

In [6]:
# Helper Functions

def load_ratings(rev):
    inters = []
    for review in rev:
        item, user, rating, time = review['parent_asin'], review['user_id'], review['rating'], review['timestamp']
        inters.append((user, item, float(rating), int(time)))
    return inters

def get_user2count(inters):
    user2count = collections.defaultdict(int)
    for unit in inters:
        user2count[unit[0]] += 1
    return user2count


def get_item2count(inters):
    item2count = collections.defaultdict(int)
    for unit in inters:
        item2count[unit[1]] += 1
    return item2count


def generate_candidates(unit2count, threshold):
    cans = set()
    for unit, count in unit2count.items():
        if count >= threshold:
            cans.add(unit)
    return cans, len(unit2count) - len(cans)

In [7]:
# Make the interactions in order and remove duplicate reviews

def make_inters_in_order(inters):
    user2inters, new_inters = collections.defaultdict(list), []
    for inter in inters:
        user, item, rating, timestamp = inter
        user2inters[user].append((user, item, rating, timestamp))
    for user in user2inters:
        user_inters = user2inters[user]
        user_inters.sort(key=lambda d: d[3])
        his_items = set()
        for inter in user_inters:
            user, item, rating, timestamp = inter
            if item in his_items:
                continue
            his_items.add(item)
            new_inters.append(inter)
    return new_inters

# filter by k-core (5 in this case)
def filter_inters(inters, user_k_core_threshold=0, item_k_core_threshold=0):
    new_inters = []
    # filter by k-core
    if user_k_core_threshold or item_k_core_threshold:
        print('\nFiltering by k-core:')
        idx = 0
        user2count = get_user2count(inters)
        item2count = get_item2count(inters)

        while True:
            new_user2count = collections.defaultdict(int)
            new_item2count = collections.defaultdict(int)
            users, n_filtered_users = generate_candidates(
                user2count, user_k_core_threshold)
            items, n_filtered_items = generate_candidates(
                item2count, item_k_core_threshold)
            if n_filtered_users == 0 and n_filtered_items == 0:
                break
            for unit in inters:
                if unit[0] in users and unit[1] in items:
                    new_inters.append(unit)
                    new_user2count[unit[0]] += 1
                    new_item2count[unit[1]] += 1
            idx += 1
            inters, new_inters = new_inters, []
            user2count, item2count = new_user2count, new_item2count
            print('    Epoch %d The number of inters: %d, users: %d, items: %d'
                    % (idx, len(inters), len(user2count), len(item2count)))
    return inters


In [8]:
# Preprocessing step

def preprocess_rating(inters):
    print('Process rating data: ')
    print(' Dataset: reviews',)

    # load ratings
    rating_inters = load_ratings(inters)


    # Sort and remove repeated reviews
    rating_inters = make_inters_in_order(rating_inters)

    # K-core filtering;
    print('The number of raw inters: ', len(rating_inters))
    kcore_rating_inters = filter_inters(rating_inters,
                                        user_k_core_threshold=5,
                                        item_k_core_threshold=5)

    # return: list of (user_ID, item_ID, rating, timestamp)
    return kcore_rating_inters, rating_inters

In [9]:
inters,_ = preprocess_rating(reviews)


Process rating data: 
 Dataset: reviews
The number of raw inters:  4828480

Filtering by k-core:
    Epoch 1 The number of inters: 1353435, users: 157062, items: 32850
    Epoch 2 The number of inters: 1302721, users: 152319, items: 18486
    Epoch 3 The number of inters: 1281240, users: 146980, items: 18143
    Epoch 4 The number of inters: 1278612, users: 146779, items: 17654
    Epoch 5 The number of inters: 1277242, users: 146453, items: 17635
    Epoch 6 The number of inters: 1277020, users: 146436, items: 17596
    Epoch 7 The number of inters: 1276876, users: 146402, items: 17594
    Epoch 8 The number of inters: 1276864, users: 146399, items: 17594
    Epoch 9 The number of inters: 1276852, users: 146399, items: 17591
    Epoch 10 The number of inters: 1276840, users: 146396, items: 17591


#### Last-out Split
We split our dataset into training, validation, and test using the "leave-last-out data split" method. 

- Training part: the first N-2 items;
- Validation part: the (N-1)-th item;
- Testing part: the N-th item.



In [10]:
# helper method

def make_inters_in_order(inters):
    user2inters, new_inters = collections.defaultdict(list), collections.defaultdict(list)
    for inter in inters:
        user, item, rating, timestamp = inter
        user2inters[user].append((user, item, rating, timestamp))
    for user in user2inters:
        user_inters = user2inters[user]
        user_inters.sort(key=lambda d: d[3])
        his_items = set()
        for inter in user_inters:
            user, item, rating, timestamp = inter
            if item in his_items:
                continue
            his_items.add(item)
            new_inters[user].append(inter)
    return new_inters


In [11]:
def last_out_split(inters):
    train_data = []
    valid_data = []
    test_data = []

    # Order the inters
    ordered_inters = make_inters_in_order(inters=inters)


    for user in tqdm(ordered_inters, desc='Creating train/valid/test lists'):
        cur_inter = ordered_inters[user]
        # Add the last interaction to the test set
        test_data.append((cur_inter[-1][0], cur_inter[-1][1], cur_inter[-1][2], cur_inter[-1][3]))
        
        if len(cur_inter) > 1:
            # Add the second-to-last interaction to the validation set
            valid_data.append((cur_inter[-2][0], cur_inter[-2][1], cur_inter[-2][2], cur_inter[-2][3]))
        
        if len(cur_inter) > 2:
            # Add the remaining interactions to the training set
            for i in range(len(cur_inter) - 2):
                train_data.append((cur_inter[i][0], cur_inter[i][1], cur_inter[i][2], cur_inter[i][3]))
    return train_data, valid_data, test_data


In [12]:
train_data, valid_data, test_data = last_out_split(inters)

Creating train/valid/test lists: 100%|██████████| 146396/146396 [00:03<00:00, 38528.43it/s]


## Purchasing Prediction
In this section, I will develop several baseline models and iteratively improve upon them using advanced techniques discussed in class, as well as deep learning methods. These models will include Most Popular, item-to-item based collaborative filtering, Bayesian Personalized Ranking (BPR), and a deep learning model implemented with PyTorch.


#### Most Popular Model

In [127]:
from collections import Counter

def mostPopularClassification(data, threshold=0.75):
    """
    Identifies the most popular items based on their frequency in the data.
    
    Parameters:
        data (list): A list of tuples or dictionaries where the first element/key represents the item.
        threshold (float): The proportion of total occurrences to include in the popular set.

    Returns:
        set: A set of the most popular items.
    """
    # Count occurrences of each item
    item_counts = Counter(entry[1] for entry in data)
    total_occurrences = sum(item_counts.values())

    # Sort items by frequency in descending order
    sorted_items = sorted(item_counts.items(), key=lambda x: x[1], reverse=True)

    # Select items until the threshold is reached
    popular_items = set()
    cumulative_count = 0
    for item, count in sorted_items:
        cumulative_count += count
        popular_items.add(item)
        if cumulative_count >= threshold * total_occurrences:
            break

    return popular_items

In [128]:
popular_items = mostPopularClassification(train_data, 0.75)

In [129]:
user_set = set()
item_set = set()
purchased_set = set()

# Collect from train and validation data
for u, b, r,_ in train_data + valid_data:  # Avoid test_data
    user_set.add(u)
    item_set.add(b)
    purchased_set.add((u, b))  # Add observed interactions

In [130]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,i,r,_ in tqdm(train_data):
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))

100%|██████████| 984048/984048 [00:03<00:00, 323031.38it/s]


In [21]:
# Make validation dataset include negative sets at an equal rate as positive set

def generate_negatives(valid_data, item_set):
    """
    Generate one negative sample for each positive interaction in valid_data.
    
    Args:
        valid_data (list): List of tuples (user, item, ...) representing positive interactions.
        item_set (set): Set of all possible items.
        purchased_set (set): Set of all (user, item) pairs that are positive interactions.
    
    Returns:
        list: List of negative samples corresponding to each positive interaction.
    """
    negatives = []
    for user, item, rating, time in tqdm(valid_data, desc= "Creating negative set"):  # Unpack the valid_data tuple
        # Get items the user has not purchased
        non_purchased_items = item_set - {i for i, r in ratingsPerUser[user]}
        
        # Randomly sample one non-purchased item
        if non_purchased_items:
            sampled_item = random.choice(list(non_purchased_items))
            negatives.append((user, sampled_item, rating, time, 0))  # Label as 0 for negative sample
    
    return negatives

In [23]:
neg_valid_data = generate_negatives(valid_data, item_set)

Creating negative set: 100%|██████████| 146396/146396 [02:36<00:00, 936.67it/s] 


In [24]:
pos_valid_data = [(u, i, r, t, 1) for u, i, r, t in valid_data]
valid_data_full = pos_valid_data + neg_valid_data

In [None]:
# Measuring performance of model for Most Popular model
def mostPopular_accuracy(valid_data):
    accurate = 0
    for u, i,_,_, l  in valid_data:
        prediction = 1 if i in popular_items else 0  # Predicted label
        if(prediction == l):
            accurate += 1
        return (accurate / len(valid_data))

def precision_at_k(recommendations, relevant_items):
    return len(recommendations & relevant_items) / len(recommendations)

def recall_at_k(most_popular, relevant_items):
    if len(relevant_items) == 0:
        return 0  # Avoid division by zero
    return len(set(most_popular) & set(relevant_items)) / len(relevant_items)

In [26]:
# Testing against validation dataset to optimize threshold
valid_data_set = {x[1] for x in valid_data}

precision = precision_at_k(popular_items, valid_data_set)
recall = recall_at_k(popular_items, valid_data_set)
accuracy = mostPopular_accuracy(valid_data_full)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"accuracy: {accuracy}")


Precision: 1.0
Recall: 0.15030674846625766
accuracy: 0.0


#### Item-to-Item Collaborative Filtering

In [132]:
import math

In [None]:
user_set = set()
item_set = set()
purchased_set = set()

# Collect from train and validation data
for u, b, r,_ in train_data + valid_data:  # Avoid test_data
    user_set.add(u)
    item_set.add(b)
    purchased_set.add((u, b))  # Add observed interactions

In [None]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set) 
ratingDict = defaultdict(list)

for u,b,r in train_data:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    itemsPerUser[u].add(b)
    ratingDict[(u, b)] = int(r)
    usersPerItem[b].add(u)


In [133]:
def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0


In [134]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

def pearson(i1, i2):
    # Between two items
    iBar1 = itemAverages[i1]
    iBar2 = itemAverages[i2]
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

NameError: name 'itemsPerUser' is not defined

In [135]:
correct = 0
jaccard_threshold = 0.10  
cosine_threshold = 0.10  

def predictPurchase(u,b):
    maxJaccardSim = 0
    maxCosineSim = 0
    users = set(ratingsPerItem[b])
    
    # Compute similarities for all items rated by the user
    for b2, _ in ratingsPerUser[u]:
        # Jaccard similarity
        jaccardSim = Jaccard(users, set(ratingsPerItem[b2]))
        if jaccardSim > maxJaccardSim:
            maxJaccardSim = jaccardSim
        
        # Cosine similarity
        cosineSim = Cosine(b, b2)
        if cosineSim > maxCosineSim:
            maxCosineSim = cosineSim
    
    # Prediction logic
    pred = 0
    if (
        maxJaccardSim > jaccard_threshold 
        or maxCosineSim > cosine_threshold 
        or (b in popular_items) 
        or len(ratingsPerItem[b]) > 40
    ):
        pred = 1

    return pred

#### Latent Factor Model (PyTorch)

They aim to map users and items to a shared latent space where their interactions can be represented as the dot product of their latent representations. Latent factors represent categories that are present in the data. With a higher k, you have more specific categories. Whats going is we are trying to predict a user u’s rating of item i. Therefore, we look at P to find a vector representing user u, and their preferences or “affinity” toward all of the latent factors. Then, we look at Q to find a vector representing item i and it’s “affinity” toward all the latent factors.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

In [15]:
user_set = set()
item_set = set()
purchased_set = set()

# Collect from train and validation data
for u, b, r,_ in train_data + valid_data + test_data:
    user_set.add(u)
    item_set.add(b)
    purchased_set.add((u, b))  # Add observed interactions

In [16]:
user_IDs = {}
item_IDs = {}
interactions = []

# Could adapt to any dataset, this one is from
# https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home
for u, i, r,_ in train_data + valid_data + test_data:
    if not u in user_IDs: user_IDs[u] = len(user_IDs)
    if not i in item_IDs: item_IDs[i] = len(item_IDs)
    interactions.append((u,i,r))

In [35]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,i,r,_ in tqdm(train_data + valid_data + test_data):
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))

100%|██████████| 1276840/1276840 [00:02<00:00, 429933.44it/s]


In [38]:
def generate_negatives(data, item_set, purchased_set, dataset_name=""):
    """
    Generate one negative sample for each positive interaction in the dataset.

    Args:
        data (list): List of tuples (user, item, ...) representing positive interactions.
        item_set (set): Set of all possible items.
        purchased_set (set): Set of all (user, item) pairs that are positive interactions.
        dataset_name (str): Name of the dataset for logging.

    Returns:
        list: List of negative samples corresponding to each positive interaction.
    """
    negatives = []
    for user, item, rating, time in tqdm(data, desc=f"Creating negative set for {dataset_name}"):
        # Get items the user has not purchased
        non_purchased_items = item_set - {i for i, r in ratingsPerUser[user]}
        
        # Randomly sample one non-purchased item
        if non_purchased_items:
            sampled_item = random.choice(list(non_purchased_items))
            negatives.append((user, sampled_item, rating, time, 0))  # Label as 0 for negative sample
    
    return negatives

In [39]:
# Generate negative samples for each dataset
neg_train_data = generate_negatives(train_data, item_set, purchased_set, "train_data")
neg_valid_data = generate_negatives(valid_data, item_set, purchased_set, "valid_data")
neg_test_data = generate_negatives(test_data, item_set, purchased_set, "test_data")

Creating negative set for train_data: 100%|██████████| 984048/984048 [22:32<00:00, 727.74it/s]  
Creating negative set for valid_data: 100%|██████████| 146396/146396 [04:19<00:00, 563.72it/s]
Creating negative set for test_data: 100%|██████████| 146396/146396 [03:41<00:00, 662.05it/s] 


In [40]:
# Combine positive and negative samples for each dataset
pos_train_data = [(u, i, r, t, 1) for u, i, r, t in train_data]
train_data_full = pos_train_data + neg_train_data

pos_valid_data = [(u, i, r, t, 1) for u, i, r, t in valid_data]
valid_data_full = pos_valid_data + neg_valid_data

pos_test_data = [(u, i, r, t, 1) for u, i, r, t in test_data]
test_data_full = pos_test_data + neg_test_data

In [None]:
# Put into csv
train_data_full_df = pd.DataFrame(train_data_full, columns=["user_ID", "parent_asin", "rating", "timestamp", "label"])
train_data_full_df.to_csv("data/train_data_full.csv", index=False)  # Set index=False to avoid writing row numbers

valid_data_full_df = pd.DataFrame(valid_data_full, columns=["user_ID", "parent_asin", "rating", "timestamp", "label"])
valid_data_full_df.to_csv("data/valid_data_full.csv", index=False)  # Set index=False to avoid writing row numbers

test_data_full_df = pd.DataFrame(test_data_full, columns=["user_ID", "parent_asin", "rating", "timestamp", "label"])
test_data_full_df.to_csv("data/test_data_full.csv", index=False)  # Set index=False to avoid writing row numbers


In [112]:
train_data_full_df = pd.read_csv("data/train_data_full.csv")
valid_data_full_df = pd.read_csv("data/valid_data_full.csv")
test_data_full_df = pd.read_csv("data/test_data_full.csv")

In [113]:
train_data_full_df.head()

Unnamed: 0,user_ID,parent_asin,rating,timestamp,label
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B00466BGS4,5.0,1284039832000,1
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B004074Y6U,2.0,1365629921000,1
2,AFSKPY37N3C43SOI5IEXEK5JSIYA,B005ZKC4FO,4.0,1365671303000,1
3,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B00FWRNW1A,5.0,1382845563000,1
4,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B00EEDJHXA,5.0,1391875795000,1


In [51]:
# Convert user and item IDs to indices
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train_data_full_df['user_idx'] = user_encoder.fit_transform(train_data_full_df['user_ID'])
train_data_full_df['item_idx'] = item_encoder.fit_transform(train_data_full_df['parent_asin'])

# Labels (1 for purchase, 0 for no purchase)
labels = train_data_full_df['label']

In [52]:
class LatentFactorModel(nn.Module):
    def __init__(self, num_users, num_items, num_factors):
        super(LatentFactorModel, self).__init__()
        self.user_factors = nn.Embedding(num_users, num_factors)  # User latent factors
        self.item_factors = nn.Embedding(num_items, num_factors)  # Item latent factors
        self.user_biases = nn.Embedding(num_users, 1)  # User biases
        self.item_biases = nn.Embedding(num_items, 1)  # Item biases
        self.global_bias = nn.Parameter(torch.zeros(1))  # Global bias

    def forward(self, user_idx, item_idx):
        # Dot product of user and item latent factors
        dot_product = (self.user_factors(user_idx) * self.item_factors(item_idx)).sum(dim=1)
        # Add biases
        prediction = dot_product + self.user_biases(user_idx).squeeze() + self.item_biases(item_idx).squeeze() + self.global_bias
        return torch.sigmoid(prediction)  # Sigmoid for binary classification

In [59]:
# Initialize model, loss, and optimizer
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
num_factors = 20

model = LatentFactorModel(num_users, num_items, num_factors)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare training data
user_idx = torch.tensor(train_data_full_df['user_idx'].values)
item_idx = torch.tensor(train_data_full_df['item_idx'].values)
labels = torch.tensor(train_data_full_df['label'].values, dtype=torch.float32)

# Training loop
epochs = 20
for epoch in tqdm(range(epochs), desc="Training Progress"):
    model.train()
    optimizer.zero_grad()
    predictions = model(user_idx, item_idx)
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Training Progress:   5%|▌         | 1/20 [00:00<00:11,  1.71it/s]

Epoch 1/20, Loss: 2.0141


Training Progress:  10%|█         | 2/20 [00:01<00:10,  1.78it/s]

Epoch 2/20, Loss: 2.0091


Training Progress:  15%|█▌        | 3/20 [00:01<00:10,  1.57it/s]

Epoch 3/20, Loss: 2.0041


Training Progress:  20%|██        | 4/20 [00:02<00:09,  1.61it/s]

Epoch 4/20, Loss: 1.9991


Training Progress:  25%|██▌       | 5/20 [00:02<00:08,  1.68it/s]

Epoch 5/20, Loss: 1.9942


Training Progress:  30%|███       | 6/20 [00:03<00:09,  1.53it/s]

Epoch 6/20, Loss: 1.9893


Training Progress:  35%|███▌      | 7/20 [00:04<00:09,  1.32it/s]

Epoch 7/20, Loss: 1.9843


Training Progress:  40%|████      | 8/20 [00:05<00:10,  1.19it/s]

Epoch 8/20, Loss: 1.9794


Training Progress:  45%|████▌     | 9/20 [00:06<00:09,  1.12it/s]

Epoch 9/20, Loss: 1.9744


Training Progress:  50%|█████     | 10/20 [00:08<00:10,  1.04s/it]

Epoch 10/20, Loss: 1.9695


Training Progress:  55%|█████▌    | 11/20 [00:09<00:10,  1.18s/it]

Epoch 11/20, Loss: 1.9645


Training Progress:  60%|██████    | 12/20 [00:10<00:09,  1.18s/it]

Epoch 12/20, Loss: 1.9596


Training Progress:  65%|██████▌   | 13/20 [00:11<00:08,  1.16s/it]

Epoch 13/20, Loss: 1.9548


Training Progress:  70%|███████   | 14/20 [00:13<00:07,  1.17s/it]

Epoch 14/20, Loss: 1.9499


Training Progress:  75%|███████▌  | 15/20 [00:14<00:05,  1.11s/it]

Epoch 15/20, Loss: 1.9451


Training Progress:  80%|████████  | 16/20 [00:15<00:04,  1.07s/it]

Epoch 16/20, Loss: 1.9402


Training Progress:  85%|████████▌ | 17/20 [00:16<00:03,  1.05s/it]

Epoch 17/20, Loss: 1.9352


Training Progress:  90%|█████████ | 18/20 [00:16<00:02,  1.01s/it]

Epoch 18/20, Loss: 1.9304


Training Progress:  95%|█████████▌| 19/20 [00:17<00:00,  1.02it/s]

Epoch 19/20, Loss: 1.9256


Training Progress: 100%|██████████| 20/20 [00:18<00:00,  1.06it/s]

Epoch 20/20, Loss: 1.9208





In [58]:
valid_data_full_df['user_idx'] = user_encoder.transform(valid_data_full_df['user_ID'])
valid_data_full_df['item_idx'] = item_encoder.transform(valid_data_full_df['parent_asin'])

model.eval()
with torch.no_grad():
    valid_user_idx = torch.tensor(valid_data_full_df['user_idx'].values)
    valid_item_idx = torch.tensor(valid_data_full_df['item_idx'].values)
    valid_labels = torch.tensor(valid_data_full_df['label'].values, dtype=torch.float32)
    predictions = model(valid_user_idx, valid_item_idx)
    predictions = (predictions > 0.5).float()
    accuracy = (predictions == valid_labels).float().mean()
    print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.5004


#### Complete Latent Factor Model (Numpy, Scipy)

In [86]:
import scipy
import numpy

In [87]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [88]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,i,r,_ in tqdm(train_data):
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))

100%|██████████| 984048/984048 [00:03<00:00, 317572.11it/s]


In [89]:
N = len(train_data)
nUsers = len(ratingsPerUser)
nItems = len(ratingsPerItem)
users = list(ratingsPerUser.keys())
items = list(ratingsPerItem.keys())


In [91]:
ratingMean = sum(r for _, _, r,_ in train_data) / len(train_data)
ratingMean

4.0345318521047755

In [None]:
alpha = ratingMean
labels = [d[2] for d in train_data]
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
userGamma = {}
itemGamma = {}
K = 2

for u in ratingsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
for i in ratingsPerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [95]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        userGamma[u] = theta[index:index+K]
        index += K
    for i in items:
        itemGamma[i] = theta[index:index+K]
        index += K

In [96]:
def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

def prediction(user, item):
    user_bias = userBiases.get(user, 0.0)
    item_bias = itemBiases.get(item, 0.0)
    user_gamma = userGamma.get(user, [0.0] * K)
    item_gamma = itemGamma.get(item, [0.0] * K)
    
    return alpha + user_bias + item_bias + inner(user_gamma, item_gamma)

In [97]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in train_data]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb*userBiases[u]**2
        for k in range(K):
            cost += lamb*userGamma[u][k]**2
    for i in items:
        cost += lamb*itemBiases[i]**2
        for k in range(K):
            cost += lamb*itemGamma[i][k]**2
    return cost


In [98]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(train_data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in ratingsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in ratingsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    for d in train_data:
        u,i = d[0], d[1]
        pred = prediction(u, i)
        diff = pred - d[2]
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)


In [99]:
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0]*(nUsers+nItems) + # Initialize beta
                                   [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 0.001))

MSE = 1.7948607568590096
MSE = 1.7825268761593958
MSE = 4.937117337940384
MSE = 1.7901026378080709
MSE = 1.758678996833904
MSE = 1.7559860318138723
MSE = 1.746190872633137
MSE = 1.7072620050547018
MSE = 1.7058387808708848
MSE = 1.7079095155464608
MSE = 1.7085188643196114
MSE = 1.7091300243197818
MSE = 1.7097001460331234
MSE = 1.7099404022262072
MSE = 1.7100464782445248
MSE = 1.7100736247394632
MSE = 1.7100776752401614
MSE = 1.7100756733588656
MSE = 1.7100764681832057
MSE = 1.710081923555373
MSE = 1.7100850825657585


(array([ 3.96753697e+00, -9.64337059e-04, -1.79492485e-02, ...,
         1.18062141e-06,  7.93900829e-07, -1.64548958e-06]),
 1.7412779714577473,
 {'grad': array([-8.19462171e-07,  6.84519973e-10, -9.36256818e-09, ...,
          2.35676681e-09,  1.58770706e-09, -3.29115959e-09]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 21,
  'nit': 17,
  'warnflag': 0})