In [1]:
!pip install -q sentence_transformers

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict

import gc
import os
import copy
import time
import random
from tqdm.auto import tqdm

In [3]:
tqdm.pandas()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
def fix_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Hyperparams

In [5]:
BATCH_SIZE = 2048
EPOCHS = 20
LR = [0.0001, 0.0005, 0.001]
# LR = [0.001]
DROPOUT_RATE = [0.1, 0.3, 0.5]
SEEDS = [42]
TOP_K = 10
EARLY_STOP = 5
N_NEG_TRAIN = 100
N_NEG_TEST = 1000

# Dataset

In [6]:
DEFAULT_USER_COL = 'user_id'
DEFAULT_ITEM_COL = 'movie_id'
DEFAULT_RATING_COL = 'rating'

In [7]:
def _read_users(data_path):
    columns = ['user_id', 'gender', 'age', 'occupation', 'zip-code']
    users = pd.read_table(f'{data_path}/users.dat', names = columns, sep = "::", encoding = "latin1", engine='python')
    return users


def _read_ratings(data_path):
    columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table(f'{data_path}/ratings.dat', names = columns, sep = "::", encoding = "latin1", engine='python')
    return ratings


def _read_movies(data_path):
    movies = pd.read_csv(data_path)
    return movies

# Read datasets
data_path = '/kaggle/input/movielens-1m-dataset'
users = _read_users(data_path)
ratings = _read_ratings(data_path)

movie_path = '/kaggle/input/movielens-1m-generated-descriptions/original_temp0.csv'
movies = _read_movies(movie_path)

In [8]:
movies = movies[['movie_id', 'genre', 'description']]
movies

Unnamed: 0,movie_id,genre,description
0,1,Animation|Children's|Comedy,A group of toys embark on an adventure to retu...
1,2,Adventure|Children's|Fantasy,Two siblings discover a magical board game tha...
2,3,Comedy|Romance,"Two elderly neighbors, Max and John, continue ..."
3,4,Comedy|Drama,Four close-knit friends navigate the complexit...
4,5,Comedy,A father grapples with his daughter's second w...
...,...,...,...
3878,3948,Comedy,A nurse's fiancé faces a series of awkward and...
3879,3949,Drama,Four individuals' lives spiral into addiction ...
3880,3950,Drama,A group of young soldiers undergo intense trai...
3881,3951,Drama,A dysfunctional family living in a two-family ...


In [9]:
df = pd.merge(pd.merge(ratings, users), movies)
df = df[['user_id', 'movie_id', 'rating', 'description']]
df['rating'] = [1.0 if r > 0 else 0.0 for r in df['rating']]
df

Unnamed: 0,user_id,movie_id,rating,description
0,1,1193,1.0,A rebellious inmate challenges the oppressive ...
1,1,661,1.0,An orphaned boy embarks on an extraordinary ad...
2,1,914,1.0,A renowned phonetics professor transforms a Co...
3,1,3408,1.0,A determined legal clerk uncovers a corporate ...
4,1,2355,1.0,A misfit ant teams up with a group of circus b...
...,...,...,...,...
1000204,6040,1091,1.0,Two insurance salesmen try to cover up the dea...
1000205,6040,1094,1.0,An IRA member falls in love with the girlfrien...
1000206,6040,562,1.0,A socially awkward and bullied seventh-grader ...
1000207,6040,1096,1.0,A Polish woman is forced to make an impossible...


In [10]:
min(df['user_id']), max(df['user_id'])

(1, 6040)

In [11]:
user_id_to_index = {user_id:idx for idx, user_id in enumerate(df['user_id'].unique())}

In [12]:
class NegativeSampler:
    """NegativeSampler class for NCF. Samples a subset of negative items from a given population of items."""

    def __init__(
        self,
        user,
        n_samples,
        seed,
        user_positive_item_pool,
        item_pool,
        print_warnings=True,
        training=True,
    ):
        """Constructor

        Args:
            user (str or int): User to be sampled for.
            n_samples (int): Number of required samples.
            user_positive_item_pool (set): Set of items with which user has previously interacted.
            item_pool (set): Set of all items in population.
            print_warnings (bool): If true, prints warnings if sampling without replacement and
                there are not enough items to sample from to satisfy n_neg or n_neg_test.
            training (bool): Set to true if sampling for the training set or false if for the test set.
        """
        self.user = user
        self.n_samples = n_samples
        self.seed = seed
        self.user_positive_item_pool = user_positive_item_pool
        self.item_pool = item_pool

        self.print_warnings = print_warnings
        self.training = training

        self.user_negative_item_pool = self._get_user_negatives_pool()
        self.population_size = len(self.user_negative_item_pool)
        self._check_sample_size()
#         self._sample = self._sample_negatives()

    def sample(self):
        """Method for sampling uniformly from a population of negative items

        Returns: list
        """
        return self._sample_negatives()

    def _get_user_negatives_pool(self):
        # Get list of items user has not interacted with
        return list(set(self.item_pool) - self.user_positive_item_pool)

    def _sample_negatives(self):
        random.seed(self.seed)
        return random.sample(self.user_negative_item_pool, k=self.n_samples)

    def _check_sample_size(self):
        # If sampling without replacement, check sample population is sufficient and reduce n_samples if not.
        n_neg_var = "n_neg" if self.training else "n_neg_test"
        dataset_name = "training" if self.training else "test"

        k = min(self.n_samples, self.population_size)
        if k < self.n_samples and self.print_warnings:
            warning_string = (
                "The population of negative items to sample from is too small for user {}. "
                "Samples needed = {}, negative items = {}. "
                "Reducing samples to {} for this user."
                "If an equal number of negative samples for each user is required in the {} set, sample with replacement or reduce {}. "
                "This warning can be turned off by setting print_warnings=False".format(
                    self.user,
                    self.n_samples,
                    self.population_size,
                    self.population_size,
                    dataset_name,
                    n_neg_var,
                )
            )
            logging.warning(warning_string)
        self.n_samples = k

In [13]:
class CustomDataset(Dataset):
    
    def __init__(
        self,
        target_df,
        user_positive_item_pool,
        n_neg=1000,
        seed=42,
        user_item_dict=None,
        all_items=None,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        train=True,
        print_warnings=False,
    ):
        """Constructor

        Args:
            df (Dataframe): Dataframe
            col_user (str): User column name.
            col_item (str): Item column name.
            col_rating (str): Rating column name.
            user_positive_item_pool (dict)
            user_item_dict (dict)
            all_items (list)
            n_neg (int): Number of negative samples per positive example for data subset.
            seed (int): Seed.
            train(bool): Type of dataset (train/test)
            print_warnings (bool): If true, prints warnings if sampling without replacement and
                there are not enough items to sample from to satisfy n_neg or n_neg_test.
        """
        
        super(CustomDataset, self).__init__()
        self.target_df = target_df
        self.n_neg = n_neg
        self.seed = seed
        
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        
        self.user_positive_item_pool = user_positive_item_pool
        self.user_item_dict = user_item_dict
        self.all_items = all_items
        self.train = train
        self.print_warnings = print_warnings
        self.users, self.items, self.ratings = self._negative_sampling()
        
    
    def __len__(self) -> int:
        '''
        get lenght of data
        :return: len(data)
        '''
        return self.target_df.shape[0]


    def __getitem__(self, index):
        '''
        transform userId[index], item[inedx] to Tensor.
        and return to Datalaoder object.
        :param index: idex for dataset.
        :return: user,item,rating
        '''
        return self.users[index], self.items[index], self.ratings[index]


    def _negative_sampling(self):
    
        users, items, ratings = [], [], []
        
        if self.train:
            for user in tqdm(self.user_item_dict.keys()):
                sampler = NegativeSampler(
                    user,
                    self.n_neg,
                    self.seed,
                    self.user_positive_item_pool.get(user),
                    self.all_items,
                    self.print_warnings,
                    training=self.train,
                )
                negative_examples = sampler.sample()
                self.user_positive_item_pool[user].update(negative_examples)
                
                for item in self.user_item_dict.get(user):
                    users.append(user)
                    items.append(item)
                    ratings.append(1.0)

                for neg_example in negative_examples:
                    users.append(user)
                    items.append(neg_example)
                    ratings.append(0.0)
        else:
            for idx, row in tqdm(self.target_df.iterrows(), total=self.target_df.shape[0]):
                user, item = row[self.col_user], row[self.col_item]
                sampler = NegativeSampler(
                    user,
                    self.n_neg,
                    self.seed,
                    self.user_positive_item_pool.get(user),
                    self.all_items,
                    self.print_warnings,
                    training=self.train,
                )
                negative_examples = sampler.sample()
                self.user_item_dict[user].update(negative_examples)
                
                users.append(user)
                items.append([item] + negative_examples)
                ratings.append([1.0] + [0.0] * len(negative_examples))
    
        return torch.tensor(users), torch.tensor(items), torch.tensor(ratings)

In [14]:
def get_user_item_dict(df):
    user_item_dict = dict()
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        user_id, interacted_item_id = row[DEFAULT_USER_COL], row[DEFAULT_ITEM_COL]
        if user_id not in user_item_dict:
            user_item_dict[user_id] = set()
        user_item_dict[user_id].add(interacted_item_id)
        
    return user_item_dict

In [15]:
def get_max_interaction_len(pos_pool):
    maxx = -1
    for key in pos_pool.keys():
        maxx = max(maxx, len(pos_pool[key]))
    return maxx

In [16]:
def train_val_test_split(df, train_ratio, val_test_ratio, seed):
    train_df, test_df = train_test_split(df, train_size=train_ratio, random_state=seed, shuffle=True)
    val_df, test_df = train_test_split(test_df, test_size=val_test_ratio, random_state=seed, shuffle=True)
    print(f'Split data with seed {seed} done!')
    
    return train_df, val_df, test_df

In [17]:
def create_user_item_dict(train_df, val_df, test_df):
    train_positive_pool = get_user_item_dict(train_df)
    val_positive_pool = get_user_item_dict(val_df)
    test_positive_pool = get_user_item_dict(test_df)
    print(f'Create item pools done!')
    
    return train_positive_pool, val_positive_pool, test_positive_pool

In [18]:
def data_preparing(train_df, val_df, test_df, train_positive_pool, val_positive_pool, test_positive_pool, seed, n_neg_train, n_neg_test):
    train_dataset = CustomDataset(
        target_df=train_df,
        user_positive_item_pool=train_positive_pool,
        n_neg=n_neg_train,
        seed=seed, 
        user_item_dict=train_positive_pool,
        all_items=df[DEFAULT_ITEM_COL].unique(),
        train=True,
    )
    print(f'Create training dataset done!')
    
    merged_pos_pool = {
        key: train_positive_pool.get(key, set()) | val_positive_pool.get(key, set()) for key in set(train_positive_pool) | set(val_positive_pool)
    }
    merged_pos_pool = {
        key: merged_pos_pool.get(key, set()) | test_positive_pool.get(key, set()) for key in set(merged_pos_pool) | set(test_positive_pool)
    }
    print(f'Create new positive pool for test/validation done!')
    
    val_dataset = CustomDataset(
        target_df=val_df,
        user_positive_item_pool=merged_pos_pool,
        n_neg=n_neg_test,
        seed=seed,
        user_item_dict=val_positive_pool,
        all_items=df[DEFAULT_ITEM_COL].unique(),
        train=False,
    )
    print(f'Create validation dataset done!')

    test_dataset = CustomDataset(
        target_df=test_df,
        user_positive_item_pool=merged_pos_pool,
        n_neg=n_neg_test,
        seed=seed,
        user_item_dict=test_positive_pool,
        all_items=df[DEFAULT_ITEM_COL].unique(),
        train=False,
    )
    print(f'Create testing dataset done!')
    
    return train_dataset, val_dataset, test_dataset, val_positive_pool, test_positive_pool

In [19]:
def data_loader(train_dataset, val_dataset, test_dataset):
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset, 0shuffle=False)
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False)
    
    return train_dataloader, val_dataloader, test_dataloader

# Tokenizer

In [20]:
import sentence_transformers
from sentence_transformers import SentenceTransformer

item_embedder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
batch_size = 32
item_dict = dict()
idx = 0

while idx in range(movies.shape[0]):
    if idx + batch_size < movies.shape[0]:
        end_idx = idx + batch_size - 1
    else:
        end_idx = movies.shape[0] - 1

    batch_item_des = movies.loc[idx:end_idx, 'description'].tolist()    
    list_emb = item_embedder.encode(batch_item_des)
    for i in range(len(list_emb)):
        item_dict[(movies.loc[idx+i, 'movie_id'])] = list_emb[i]
    idx = idx + batch_size

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
np.asarray(item_dict[1]).shape

(384,)

# MLP

In [23]:
class MLP(nn.Module):
    def __init__(self,
                 num_users:int,
                 item_emb_dim=384,
                 h_dim=128,
                 dropout_rate=0.3,
                 ):
        super(MLP,self).__init__()

        self.num_users = num_users
        self.user_embedding = nn.Embedding(num_users, h_dim)
        self.linear1 = nn.Linear(h_dim, h_dim // 2)
        
        self.linear2 = nn.Linear(item_emb_dim, h_dim * 2)
        self.linear3 = nn.Linear(h_dim * 2, h_dim)
        self.linear4 = nn.Linear(h_dim, h_dim // 2)
        self.act1 = nn.ReLU()
        self.act2 = nn.ReLU()
        self.act3 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

#         self._init_weight()

#     def _init_weight(self):
#         nn.init.normal_(self.user_embedding.weight,std=1e-2)
#         nn.init.xavier_uniform_(self.linear.weight)

        
    def forward(self, user_ids, item_ids):
        user_embeddings = self.user_embedding(user_ids)
        user_embeddings = self.linear1(user_embeddings)
        user_embeddings = self.act1(user_embeddings)
        
        items = item_ids.detach().cpu().tolist()
        item_embeddings = [item_dict.get(item_id) for item_id in items]
        item_embeddings = torch.stack([torch.tensor(emb) for emb in item_embeddings]).to(device)
        item_embeddings = self.linear2(item_embeddings)
        item_embeddings = self.act2(item_embeddings)
        item_embeddings = self.dropout1(item_embeddings) 
        item_embeddings = self.linear3(item_embeddings)
        item_embeddings = self.act3(item_embeddings)
        item_embeddings = self.dropout2(item_embeddings)
        item_embeddings = self.linear4(item_embeddings)
        
        output = torch.sum(user_embeddings * item_embeddings, dim=1)
        
        return output

In [24]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index+2))
    return 0

In [25]:
def train_epoch(model, optimizer, train_loader, criterion):
    model.train()
    total_loss = 0
    total = len(train_loader)
    
    progress_bar = tqdm(train_loader, desc='Training', leave=False)
    
    for batch in progress_bar:
        user_ids = batch[0].to(device)
        user_ids = user_ids.detach().cpu().tolist()
        user_ids = [user_id_to_index.get(user_id) for user_id in user_ids]
        user_ids = torch.stack([torch.tensor(user_id) for user_id in user_ids]).to(device)
        
        item_ids = batch[1].to(device)
        ratings = batch[2].to(device)

        optimizer.zero_grad()
        preds = model(user_ids, item_ids)
        
        loss = criterion(preds, ratings)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total += len(ratings)
            
    return total_loss / total

def test(model, test_dataloader, test_positive_pool, top_k):
    model.eval()
    total_loss = 0
    total = len(test_dataloader)
    HR = []
    NDCG = []
        
    pred_dict = {}    
    with torch.no_grad():
        for user, items in test_positive_pool.items():
            user_ids = [user_id_to_index.get(user)] * len(items)
            user_ids = torch.tensor(user_ids).to(device)
            item_ids = list(items)
            item_ids = torch.tensor(item_ids).to(device)
            preds = model(user_ids, item_ids)
            preds = preds.detach().cpu().tolist()
            items = list(items)
            pred_dict[user] = {items[idx]: preds[idx] for idx in range(len(items))} 

    
    progress_bar = tqdm(test_dataloader, desc='Validating', leave=False)
    
    with torch.no_grad():
        for user_id, item_ids, ratings in progress_bar:
            
            item_ids = torch.flatten(item_ids)
            gt_item = item_ids[0].item()
            item_ids = item_ids.detach().cpu().tolist()
            
            preds = [pred_dict[user_id.item()].get(item_id) for item_id in item_ids]
            preds = torch.tensor(preds).to(device)
            
            ratings = torch.flatten(ratings)
            ratings = ratings.to(device)
            
            item_ids = torch.tensor(item_ids).to(device)

            _, indices = torch.topk(preds, top_k)
            recommends = torch.take(
                    item_ids, indices).tolist()

            HR.append(hit(gt_item, recommends))
            NDCG.append(ndcg(gt_item, recommends))
            
            loss = criterion(preds, ratings)
            total_loss += loss.item()
            
    return total_loss / total, np.mean(HR)/top_k, np.mean(HR), np.mean(NDCG)

In [26]:
def train(model, optimizer, total_epochs, train_dataloader, val_dataloader, val_positive_pool, criterion, top_k, early_stop=5, val_step=5):
    train_losses, valid_losses = [], []
    val_max_recall = 0.0
    num_decreases = 0
    
#     try:
    for epoch in range(total_epochs):
        train_loss = train_epoch(model, optimizer, train_dataloader, criterion)
        
        print('Epoch {}/{}'.format(epoch, total_epochs - 1))
        print('-' * 10)
        print('Training Loss: {:.2e}'.format(train_loss))
        
        if (epoch + 1) % val_step == 0:
            valid_loss, val_precision, val_recall, val_ndcg = test(model, val_dataloader, val_positive_pool, top_k)

            if val_recall > val_max_recall:
                val_max_recall = val_recall
                num_decreases = 0
                torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
            else:
                if num_decreases > early_stop:
                    print('Early Stop!')
                    break
                else:
                    num_decreases += 1
            print('Validate Loss: {:.2e} Precision@{}: {:.8f} Recall@{}: {:.8f} NDCG@{}: {:.8f}'.format(valid_loss, top_k, val_precision, top_k, val_recall, top_k, val_ndcg))
            train_losses.append(train_loss)
            valid_losses.append(valid_loss)
            
#     except Exception as e:
#         print(f'Error {e}!')

# Main

In [27]:
max_num_users = df[DEFAULT_USER_COL].nunique()
criterion = nn.CrossEntropyLoss()
weight_decay = 0.0005
n_neg_train = N_NEG_TRAIN
n_neg_test = N_NEG_TEST

for seed in SEEDS:
    print(f'Seed {seed}:')
    
    train_df, val_df, test_df = train_val_test_split(df=df, train_ratio=0.8, val_test_ratio=0.5, seed=seed)
    train_positive_pool, val_positive_pool, test_positive_pool = create_user_item_dict(train_df, val_df, test_df)
    train_dataset, val_dataset, test_dataset, val_positive_pool, test_positive_pool = data_preparing(
        train_df, val_df, test_df,  
        train_positive_pool, val_positive_pool, test_positive_pool, 
        seed, n_neg_train, n_neg_test
    )
    train_dataloader, val_dataloader, test_dataloader = data_loader(train_dataset, val_dataset, test_dataset)
    del train_dataset, val_dataset, test_dataset
    gc.collect()
    
    for lr in LR:
        print(f'Learning rate: {lr}')
        model = MLP(num_users=max_num_users, h_dim=128)
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        
        # If multiple GPUs are available, use DataParallel
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs!")
            model = nn.DataParallel(model)

        model = model.to(device)
        
        st = time.time()
        train(
            model=model, 
            optimizer=optimizer,
            total_epochs=EPOCHS,
            train_dataloader=train_dataloader, 
            val_dataloader=val_dataloader, 
            val_positive_pool=val_positive_pool,
            criterion=criterion,
            top_k=TOP_K,
            early_stop=EARLY_STOP,
        )
        print('Training finished, took {:.2f}s'.format(time.time() - st))
        
        del model
        gc.collect()
        model = MLP(num_users=max_num_users, h_dim=128)
        model = nn.DataParallel(model)
        best_model_cp = torch.load('/kaggle/working/best_model.pt')
        model.load_state_dict(best_model_cp)
        model.to(device)
         
        _, test_precision, test_recall, test_ndcg = test(model, test_dataloader, test_positive_pool, TOP_K)
        print('Precision@{}: {:.8f}'.format(TOP_K, test_precision))
        print('Recall@{}: {:.8f}'.format(TOP_K, test_recall))
        print('NDCG@{}: {:.8f}'.format(TOP_K, test_ndcg))
        
        del model
        gc.collect()

Seed 42:
Split data with seed 42 done!


  0%|          | 0/800167 [00:00<?, ?it/s]

  0%|          | 0/100021 [00:00<?, ?it/s]

  0%|          | 0/100021 [00:00<?, ?it/s]

Create item pools done!


  0%|          | 0/6040 [00:00<?, ?it/s]

Create training dataset done!
Create new positive pool for test/validation done!


  0%|          | 0/100021 [00:00<?, ?it/s]

Create validation dataset done!


  0%|          | 0/100021 [00:00<?, ?it/s]

Create testing dataset done!
Learning rate: 0.0001
Using 2 GPUs!


Training:   0%|          | 0/391 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'