In [9]:
import pickle
import numpy as np
import pandas as pd
import torch

In [2]:
# Load the restaurant DataFrame from a local pickle (the file name suggests it
# was filtered upstream -- TODO confirm provenance).
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
restaurants_df = pd.read_pickle('dataset/dataframe_restaurant_filtered.pkl')

In [3]:
# Keep only the three columns used to build the rating matrix.
rating_columns = ["User_id","Business_id","Rating"]
restaurants_df = restaurants_df.loc[:, rating_columns]

In [4]:
# Preview the first rows (pandas shows five by default) as a quick sanity check.
restaurants_df.head()

Unnamed: 0,User_id,Business_id,Rating
2440314,1987824,45676,5.0
1588099,1987824,6663,4.0
2878432,1987824,48299,4.0
2692698,1987824,45044,5.0
2081843,1987824,23498,5.0


In [5]:
import scipy.sparse as sp

# Assuming restaurants_df has columns: 'User_id', 'Business_id', 'Rating'

# Map unique user and business IDs to contiguous integer indices, numbered in
# order of first appearance in the DataFrame.
user_mapper = {user_id: index for index, user_id in enumerate(restaurants_df['User_id'].unique())}
business_mapper = {business_id: index for index, business_id in enumerate(restaurants_df['Business_id'].unique())}

# Row/column index of every DataFrame row (same length and order as restaurants_df).
row_indices = restaurants_df['User_id'].map(user_mapper)
col_indices = restaurants_df['Business_id'].map(business_mapper)

# csr_matrix SUMS values that share the same (row, col) position, so a user who
# rated the same business more than once would end up with an inflated rating
# (and could overflow int8). Keep exactly one entry per (user, business) pair:
# the last occurrence in DataFrame order.
keep_mask = (~restaurants_df.duplicated(subset=['User_id', 'Business_id'], keep='last')).to_numpy()

# Create the sparse matrix using CSR format. Ratings are cast to int8, so any
# fractional part would be truncated.
ratings = sp.csr_matrix(
    (
        restaurants_df['Rating'].to_numpy()[keep_mask],
        (row_indices.to_numpy()[keep_mask], col_indices.to_numpy()[keep_mask]),
    ),
    shape=(len(user_mapper), len(business_mapper)),
    dtype=np.int8,
)


To look up a rating, use `ratings[user_mapper[user_id], business_mapper[restaurant_id]]`; a pair with no stored rating returns 0.

The data is very sparse.


A grid search over the number of latent factors (e.g., 5–20) might prove useful.

In [6]:
# Matrix dimensions: rows of `ratings` are users, columns are items.
num_users, num_items = ratings.shape

# Model and optimisation hyperparameters.
latent_dim = 5    # number of latent factors per user/item
lr = 0.001        # optimiser learning rate
l2_reg = 0.05     # weight of the norm penalty added to the loss
epochs = 100      # full passes over the rating matrix

In [7]:
# Randomly initialize user and item matrices.
# A seeded generator is used so that Restart Kernel -> Run All starts from the
# same factors every time (the unseeded global RNG did not).
rng = np.random.default_rng(42)
user_matrix = rng.normal(scale=1./latent_dim, size=(num_users, latent_dim))
item_matrix = rng.normal(scale=1./latent_dim, size=(num_items, latent_dim))

Train the PMF model with hinge loss and the Adam optimizer.

In [10]:
def CSR_to_chunks(data, batchsize):
    """Yield consecutive row slices of ``data``, each with at most ``batchsize`` rows.

    Every row is yielded exactly once, in order. The final chunk is smaller
    than ``batchsize`` when the row count is not an exact multiple (floor
    division previously dropped those trailing rows entirely).

    Args:
        data: 2-D object exposing ``shape`` and ``data[a:b, :]`` slicing
            (e.g. a SciPy CSR matrix or a NumPy array).
        batchsize: Maximum number of rows per chunk; must be >= 1.

    Yields:
        Row slices ``data[start:start + batchsize, :]``.

    Raises:
        ValueError: If ``batchsize`` is smaller than 1.
    """
    if batchsize < 1:
        raise ValueError("batchsize must be a positive integer")
    num_rows = data.shape[0]
    # Stepping by batchsize covers the trailing partial batch; the slice end
    # is clamped to the number of rows by the slicing itself.
    for start in range(0, num_rows, batchsize):
        yield data[start : start + batchsize, :]

In [17]:
# Hinge-loss helper
import torch.nn.functional as F

def compute_hinge_loss(prediction, actual):
    """Return the mean hinge-embedding loss between two tensors.

    Both tensors are flattened to 1-D and handed to
    ``F.hinge_embedding_loss`` with a margin of 1.0 and mean reduction.

    NOTE(review): ``hinge_embedding_loss`` is documented for targets equal to
    1 or -1, while the training loop in this notebook passes the dense rating
    matrix as ``actual`` -- confirm this is the intended objective.
    """
    flat_prediction = prediction.view(-1)
    flat_target = actual.view(-1)
    return F.hinge_embedding_loss(flat_prediction, flat_target, margin=1.0, reduction='mean')


In [18]:
def train_PMF(data, user_matrix, item_matrix, epochs, lr, l2_reg, batch_size=1000):
    """Fit user and item factor matrices with Adam on mini-batches of users.

    For each batch of consecutive rows of ``data``, the prediction is the
    product of the matching user factors with all item factors. The batch
    loss is ``compute_hinge_loss`` plus ``l2_reg`` times the sum of the norms
    of the batch's user factors and of the full item matrix.

    Args:
        data: Sparse rating matrix (rows = users, columns = items) supporting
            row slicing and ``toarray()``.
        user_matrix: Initial user factors, array-like of shape (num_users, k).
        item_matrix: Initial item factors, array-like of shape (num_items, k).
        epochs: Number of passes over ``data``.
        lr: Adam learning rate.
        l2_reg: Weight of the norm penalty.
        batch_size: Number of user rows per optimisation step.

    Returns:
        Tuple ``(user_matrix, item_matrix)`` of trained factors as detached
        tensors. ``torch.tensor`` copies its inputs, so the arrays passed in
        are NOT updated in place; use the returned values.
    """
    user_matrix = torch.tensor(user_matrix, requires_grad=True)
    item_matrix = torch.tensor(item_matrix, requires_grad=True)

    optim = torch.optim.Adam([user_matrix, item_matrix], lr=lr)

    for epoch in range(epochs):
        idx = 0            # first user row of the current batch
        epoch_loss = 0.0   # running sum of batch losses for reporting
        num_batches = 0
        for batch in CSR_to_chunks(data, batch_size):
            optim.zero_grad()
            shape = batch.shape[0]
            users = user_matrix[idx:idx + shape]
            prediction = torch.mm(users, item_matrix.t())
            loss = compute_hinge_loss(prediction, torch.tensor(batch.toarray()))
            total_loss = loss + l2_reg * (torch.norm(users) + torch.norm(item_matrix))
            total_loss.backward()
            optim.step()
            idx += shape
            epoch_loss += total_loss.item()
            num_batches += 1

        # Report the mean over all batches rather than only the last batch;
        # guard against an empty data set so the print cannot hit an unbound name.
        mean_loss = epoch_loss / num_batches if num_batches else float("nan")
        print("Epoch: {}, Loss: {}".format(epoch, mean_loss))

    return user_matrix.detach(), item_matrix.detach()


In [19]:
# Run training with the hyperparameters configured above.
train_PMF(
    data=ratings,
    user_matrix=user_matrix,
    item_matrix=item_matrix,
    epochs=epochs,
    lr=lr,
    l2_reg=l2_reg,
)


KeyboardInterrupt: 