In [2]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from math import floor

In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


In [4]:
path = '../data/ml-1m/'

movies = pd.read_csv(path + 'movies.dat', sep = '::', engine = 'python', encoding = 'latin-1', names = ['movie_id', 'title', 'genres'])
ratings = pd.read_csv(path + 'ratings.dat', sep = '::', engine = 'python', encoding = 'latin-1', names = ['user_id', 'movie_id', 'rating', 'time'])
users = pd.read_csv(path + 'users.dat', sep = '::', engine = 'python', encoding = 'latin-1', names = ['user_id', 'gender', 'age', 'occupation', 'zipcode'])


In [5]:
pivot_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')
original_rating_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')

n_users, n_movies = pivot_matrix.values.shape
pivot_notna = pivot_matrix.notna()

In [6]:
def train_test_split(matrix, validation_ratio, test_ratio, random_seed = 42):
    random.seed(random_seed)
    n_rows, _ = matrix.shape
    train_dataset, test_dataset, valid_dataset = np.zeros_like(matrix), np.zeros_like(matrix), np.zeros_like(matrix)
    
    
    for i in range(n_rows):
        true_indices = np.argwhere(matrix[i, :])
        np.random.shuffle(true_indices)

        num_valid = int(len(true_indices) * validation_ratio)
        num_test = int(len(true_indices) * test_ratio)
        
        row_test_indices = true_indices[:num_test].tolist()
        row_valid_indices = true_indices[num_test:num_test + num_valid].tolist()
        row_train_indices = true_indices[num_test + num_valid:].tolist()

        train_dataset[i, [row_train_indices]] = 1
        test_dataset[i, [row_test_indices]] = 1
        valid_dataset[i, [row_valid_indices]] = 1
    
    return train_dataset, test_dataset, valid_dataset

bin_train_data, bin_test_data, bin_valid_data = train_test_split(pivot_notna.values, 0.2, 0.2)

# CML without F (Not included Item-Features)

In [36]:
class CML(nn.Module):
    def __init__(self, n_users, n_items, dim = 100, lambda_f = 0.5, lambda_c = 10):
        super().__init__()
        self.users = nn.Parameter(torch.randn(n_users, dim))
        self.items = nn.Parameter(torch.randn(n_items, dim))
        self.lambda_f = lambda_f
        self.lambda_c = lambda_c

    def forward(self, i, j):
        dist = (self.users[i] - self.items[j]).norm(2)
        return dist

    def calculate_dist(self, i):
        diff = self.items - self.users[i]
        return torch.norm(diff, p=2, dim = 1)

In [37]:
cml = CML(n_users, n_movies)
optimizer = torch.optim.Adagrad(cml.parameters(), lr=0.001)

In [38]:
pivot_matrix[pivot_matrix.isnull()] = 0

dataset = pivot_matrix.values


In [39]:
train_indices = np.argwhere(dataset)

NUM_EPOCHS = 10
BATCH_SIZE = 32
U_SAMPLE_SIZE = 32
MARGIN = 5

for epoch in range(NUM_EPOCHS): # EPOCHS : Until Converge
    random.shuffle(train_indices)
    positive_samples = train_indices[:BATCH_SIZE]

    ranking_matrix = np.zeros((n_users, n_movies))
    mini_batch_loss = 0

    optimizer.zero_grad()

    for sample in positive_samples: # MINI_BATCH
        sample_loss, num_impostor = 0, 0 
        row_idx, col_idx = sample
        dist_ij = cml(row_idx, col_idx)
        negative_samples = np.argwhere(dataset[row_idx, :] == 0)
        random.shuffle(negative_samples)
        u_negative_samples = negative_samples[:U_SAMPLE_SIZE]
        
        for negative_sample in u_negative_samples:
            dist_ik = cml(row_idx, negative_sample[0])
            
            hinge_loss = MARGIN + dist_ij - dist_ik 

            if hinge_loss:
                num_impostor += 1
                sample_loss += hinge_loss

        rank_ij = floor(n_movies * num_impostor / U_SAMPLE_SIZE) # floor(J * M / U)
        ranking_matrix[row_idx, col_idx] =  rank_ij 
        mini_batch_loss += rank_ij * sample_loss

    mini_batch_loss.backward()
    optimizer.step()
    with torch.no_grad():
        for param in cml.parameters():
            if param.requires_grad:
                # Normalize the parameter
                param.copy_(F.normalize(param, p=2, dim=0))


In [40]:
from collections import defaultdict

recommendations = defaultdict(list)
relevant_items = defaultdict(list)


In [41]:
positive_dataset = pivot_matrix[pivot_matrix >= 4].values

for i in range(n_users):
    relevant_items[i] = np.argwhere(positive_dataset[i, :]).squeeze(1)
    recommendations[i] = cml.calculate_dist(i).tolist()


In [42]:
def calculate_recall_at_k(recommendations, relevant_items, k):
    recall_scores = []

    for user_id, recommended_items in recommendations.items():
        if user_id in relevant_items:
            top_k_recommended = set(np.argsort(recommended_items)[::-1][:k])
            user_relevant_items = relevant_items[user_id]

            recall = len(top_k_recommended.intersection(user_relevant_items)) / len(user_relevant_items)
            recall_scores.append(recall)

    average_recall = sum(recall_scores) / len(recall_scores)
    return average_recall


In [43]:
calculate_recall_at_k(recommendations, relevant_items, 50)

0.013491635186184675

In [44]:
calculate_recall_at_k(recommendations, relevant_items, 100)

0.02698327037236935