In [None]:
# Make the Colab notebooks folder the working directory so the relative paths
# used later in this notebook (the `%run` notebook includes and the 'NCF.pt'
# checkpoint) resolve inside it.
# NOTE(review): assumes Google Drive is already mounted at /content/drive --
# no mount call is visible in this notebook; confirm.
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks")

In [None]:
import torch
import random 
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np


# import import_ipynb
# from importlib import reload
%run MLP.ipynb
%run GMF.ipynb
%run Dataset.ipynb
%run NCF.ipynb



In [None]:
def train_test_split(ratings_df):
    """Leave-one-out split: each user's most recent rating becomes the test row.

    Parameters
    ----------
    ratings_df : pd.DataFrame
        Must contain the columns 'userId', 'movieId', 'rating' and 'timestamp'.
        The frame is not modified (no helper column is written into it).

    Returns
    -------
    (train_set, test_set) : tuple of pd.DataFrame
        Both restricted to the columns ['userId', 'movieId', 'rating'] and
        keeping the original index labels. `test_set` holds one row per user
        (the one with the largest timestamp); `train_set` holds the other rows.
    """
    columns = ['userId', 'movieId', 'rating']

    # Rank each user's ratings by time, newest first (rank 1 == latest rating).
    # method='first' breaks timestamp ties by row order, so exactly one row per
    # user receives rank 1. The ranks are kept in a local Series instead of a
    # new 'rank_time' column so the caller's DataFrame is left untouched.
    rank_time = ratings_df.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

    test_set = ratings_df.loc[rank_time == 1, columns]
    train_set = ratings_df.loc[rank_time > 1, columns]

    return train_set, test_set


def get_negative_instances(ratings_df) -> pd.DataFrame:
    """List, for every user, the movies that user never interacted with.

    `ratings_df` needs the columns 'userId' and 'movieId'. The result has one
    row per user with the columns ['userId', 'negative_instances'], where
    'negative_instances' is a list of every movieId present in `ratings_df`
    that does not appear in that user's own rows.
    """
    all_movies = set(ratings_df['movieId'].unique())

    def _not_seen(seen_movies):
        # keep the catalogue's iteration order, dropping what the user has seen
        return [movie for movie in all_movies if movie not in seen_movies]

    # one row per user, holding the set of movies that user interacted with
    seen_per_user = ratings_df.groupby('userId')['movieId'].apply(set)
    per_user_df = seen_per_user.reset_index()
    per_user_df = per_user_df.rename(columns={'movieId': 'interacted_items'})

    per_user_df['negative_instances'] = per_user_df['interacted_items'].apply(_not_seen)

    return per_user_df[['userId', 'negative_instances']]


    
# sample negative instances
def sample_neg(negative_instances_df, num_neg_samples):
    """Draw `num_neg_samples` negatives per user, without replacement.

    Parameters
    ----------
    negative_instances_df : pd.DataFrame
        Columns 'userId' and 'negative_instances' (a list of candidate movieIds
        per row). The frame is not modified.
    num_neg_samples : int
        How many candidates to draw for each row.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns ['userId', 'negative_samples'] and the
        same index as the input; 'negative_samples' is a list of length
        `num_neg_samples` drawn with `random.sample`.

    Raises
    ------
    ValueError
        Propagated from `random.sample` when a row has fewer candidates than
        `num_neg_samples`.
    """
    negative_samples = negative_instances_df['negative_instances'].apply(
        lambda candidates: random.sample(candidates, num_neg_samples))

    # Build a fresh frame instead of writing a 'negative_samples' column into
    # the caller's DataFrame, which is shared and reused across calls.
    return negative_instances_df[['userId']].assign(negative_samples=negative_samples)



# df columns = {userId, movieId, rating}
# negative_instances_df columns = {userId, negative_instances}
# return Dataset object
def get_dataset(df, negative_instances_df, num_neg_samples=4):
    """Expand positive interactions into a labelled dataset with sampled negatives.

    Every row of `df` contributes one example labelled with its own 'rating'
    value, followed by `num_neg_samples` examples labelled 0 whose movies are
    drawn (without replacement) from that user's 'negative_instances' list.
    The negatives are drawn independently for each positive row, so two
    positives of the same user do not share one fixed set of negatives.

    Parameters
    ----------
    df : pd.DataFrame
        Columns 'userId', 'movieId', 'rating'.
    negative_instances_df : pd.DataFrame
        Columns 'userId', 'negative_instances' (list of candidate movieIds).
        Any other columns are ignored and the frame is not modified.
    num_neg_samples : int, default 4
        Number of negatives generated per positive row.

    Returns
    -------
    RatingsDataset
        Built from three parallel lists: users, movies, labels.

    Raises
    ------
    ValueError
        Propagated from `random.sample` when a user has fewer candidates than
        `num_neg_samples`.
    """
    # Attach each user's candidate list to every one of their positive rows.
    # The inner join drops rows of `df` whose user has no candidate list.
    candidates_df = negative_instances_df[['userId', 'negative_instances']]
    merged_df = pd.merge(df, candidates_df, on='userId')

    users, movies, labels = [], [], []
    for row in merged_df.itertuples(index=False):
        # the observed (positive) interaction
        users.append(row.userId)
        movies.append(row.movieId)
        labels.append(row.rating)
        # a fresh draw of negatives for this particular positive
        negatives = random.sample(row.negative_instances, num_neg_samples)
        users.extend([row.userId] * num_neg_samples)
        movies.extend(negatives)
        labels.extend([0] * num_neg_samples)

    # create custom Dataset
    return RatingsDataset(users, movies, labels)


    

# test_df holds the held-out rating of each user (columns include userId, movieId)
def evaluate(users, movies, predictions, test_df, topk=10):
    """Compute hit ratio and NDCG at `topk` for scored (user, movie) candidates.

    Parameters
    ----------
    users, movies, predictions : 1-D torch tensors (or array-likes) of equal length
        Entry i says that `movies[i]` was scored `predictions[i]` for `users[i]`.
    test_df : pd.DataFrame
        Ground truth with at least the columns 'userId' and 'movieId'.
    topk : int, default 10
        Only the `topk` highest-scored movies of each user count as recommended.

    Returns
    -------
    (hit_ratio, ncdg) : tuple of float
        hit_ratio -- number of ground-truth rows whose movie is among the
        user's top-k, divided by len(test_df).
        ncdg -- sum of 1 / log2(rank + 1) over those hits, divided by
        len(test_df). With one held-out item per user the ideal DCG is 1, so
        this average is the NDCG.
        Both are 0.0 when `test_df` is empty.
    """
    def _to_numpy(values):
        # torch tensors expose .detach(); anything else is handed to numpy as is
        if hasattr(values, 'detach'):
            values = values.detach().cpu().numpy()
        return np.asarray(values).reshape(-1)

    n = len(test_df)
    if n == 0:
        # nothing to score against; avoid dividing by zero below
        return 0.0, 0.0

    # Convert to numpy with explicit dtypes at construction time so the
    # groupby/rank/merge below operate on numeric columns, rather than relying
    # on a later `.loc[:, col] = ...astype(...)` to change a column's dtype.
    pred_df = pd.DataFrame({
        'userId': _to_numpy(users).astype('int64'),
        'pred_movieId': _to_numpy(movies).astype('int64'),
        'pred_rating': _to_numpy(predictions).astype('float64'),
    })

    # rank 1 == highest predicted score for that user; ties broken by row order
    pred_df['rank'] = pred_df.groupby('userId')['pred_rating'].rank(method='first', ascending=False)
    top_df = pred_df.loc[pred_df['rank'] <= topk]

    # keep only the top-k rows that match the user's ground-truth movie
    pred_ground_df = pd.merge(top_df, test_df, on='userId')
    hits_df = pred_ground_df.loc[pred_ground_df['pred_movieId'] == pred_ground_df['movieId']]

    # hit ratio
    hit_ratio = len(hits_df) / n

    # NDCG: positional discount of each hit, averaged over all ground-truth rows
    ncdg = (1.0 / np.log2(hits_df['rank'] + 1)).sum() / n

    return hit_ratio, ncdg






In [None]:
def train(model, train_df, test_df, negative_instances, epochs, batch_size, lr, path):
    """Train `model` with binary cross-entropy and checkpoint the best epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as `model(users, movies)`; its output is compared against
        `ratings.unsqueeze(1)` with BCELoss, so it is expected to produce one
        probability per example in a column -- TODO confirm against the models.
    train_df, test_df : pd.DataFrame
        Positive interactions (columns userId, movieId, rating) for training
        and for evaluation.
    negative_instances : pd.DataFrame
        Per-user candidate negatives (columns userId, negative_instances); used
        for both the training and the evaluation sets.
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Training mini-batch size.
    lr : float
        Adam learning rate; it stays fixed for the whole run.
    path : str
        Where `model.state_dict()` is saved whenever both metrics improve.

    Returns
    -------
    The model as it is after the last epoch (not necessarily the checkpointed
    best epoch).
    """
    # Fall back to the CPU so the notebook still runs on a runtime without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.BCELoss()

    # Build the evaluation set once (1 positive + 99 sampled negatives per test
    # row) so every epoch is scored on the same candidates and the "best"
    # comparison below is meaningful. It uses the `negative_instances` argument,
    # not a notebook global.
    test_dataset = get_dataset(test_df, negative_instances, 99)
    test_size = len(test_dataset.users)
    # a single batch holding the whole test set, in order
    test_loader = DataLoader(test_dataset, test_size, shuffle=False)

    # best metrics seen so far; kept across epochs
    best_hit_ratio, best_ncdg = 0, 0

    for e in range(epochs):
        # negatives for training are re-sampled every epoch
        train_dataset = get_dataset(train_df, negative_instances, 4)
        train_loader = DataLoader(train_dataset, batch_size, shuffle=True)

        running_loss = 0
        # set model to training mode
        model.train()
        for users, movies, ratings in train_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            # zero gradient
            optimizer.zero_grad()
            # calculate output
            output = model(users, movies)
            # BCELoss needs float targets shaped like the output
            ratings = ratings.float()
            loss = criterion(output, ratings.unsqueeze(1))
            # calculate gradient
            loss.backward()
            # update weights
            optimizer.step()
            running_loss += loss.item()

        # set to evaluation mode (turn off dropout)
        model.eval()
        hit_ratio, ncdg = 0, 0
        # turn off gradient
        with torch.no_grad():
            for users, movies, _ in test_loader:
                users, movies = users.to(device), movies.to(device)
                # calculate predictions
                output = model(users, movies)
                hit_ratio, ncdg = evaluate(users, movies, output.view(-1), test_df)

        # checkpoint only when both metrics beat the best seen so far
        if hit_ratio > best_hit_ratio and ncdg > best_ncdg:
            torch.save(model.state_dict(), path)
            best_hit_ratio = hit_ratio
            best_ncdg = ncdg

        print(f'iteration {e}: loss per epoch: {running_loss/len(train_loader)}, hit_ratio: {hit_ratio},  NCDG: {ncdg}')

    return model




# 

In [None]:
# read the ratings file (columns used below: userId, movieId, rating, timestamp)
ratings_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ratings_1m.csv')

# convert ratings to binary: every observed interaction is a positive (label 1)
ratings_df.loc[:,'rating'] = 1
ratings_df.userId = ratings_df.userId.astype(int)
ratings_df.movieId = ratings_df.movieId.astype(int)

# map movies: re-index the raw movieIds to contiguous ids 0..n_movies-1
movieId_mapping = {val: i for i, val in enumerate(ratings_df['movieId'].unique())}
ratings_df['movieId'] = ratings_df['movieId'].map(movieId_mapping)

# sizes used for the embedding tables
# userIds are NOT re-indexed above, so the user table must cover the largest
# raw id, not just the number of distinct users; taking the max of the two
# keeps the value at least as large as the plain `nunique() + 1` count.
num_uniq_users = max(ratings_df['userId'].nunique(), int(ratings_df['userId'].max())) + 1
num_uniq_movies = ratings_df['movieId'].nunique() + 1


# columns = {userId, negative_instances}
# computed from the full dataset (before the split), so a user's held-out
# test movie is never offered as one of their negatives
negative_instances_df = get_negative_instances(ratings_df)

# split into training and test data
train_df, ground_truth_df = train_test_split(ratings_df)





In [None]:
# create GMF, MLP and NCF models
# Only the NCF model is built and trained in this cell; the MLP and GMF runs
# are kept below as commented-out alternatives.
# NOTE(review): `num_uniq_users` / `num_uniq_movies` already include a +1 where
# they are computed and another +1 is added here -- presumably spare embedding
# rows; confirm this is intended.
# mlp_model = MLP(num_users=num_uniq_users + 1, num_items=num_uniq_movies + 1, latent_dim=8)
# mlp_model = train(mlp_model, train_df, ground_truth_df, negative_instances_df, 100, 256, 0.0005, path='./MLP.pt')

# NOTE(review): the commented GMF call below omits `lr`, which train() requires.
# gmf_model = GMF(num_users=num_uniq_users + 1, num_items=num_uniq_movies + 1, latent_dim=8)
# gmf_model = train(gmf_model, train_df, ground_truth_df, negative_instances_df, 100, 512, path='./GMF.pt')

# NCF with latent_dim=8, trained for 100 epochs with batch size 256 and
# lr=0.001; train() writes its checkpoint to 'NCF.pt' in the working directory.
ncf_model = NCF(num_users=num_uniq_users+1, num_items=num_uniq_movies+1, latent_dim=8)
ncf_model = train(ncf_model, train_df, ground_truth_df, negative_instances_df, 100, 256, lr=0.001, path='NCF.pt')

# # save GMF, MLP model
# torch.save(gmf_model.state_dict(), './GMF.pt')
# torch.save(mlp_model.state_dict(), './MLP.pt')
