In [1]:
# !pip install torcheval

In [1]:
import pandas as pd
from sklearn import model_selection
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [3]:
df = pd.read_csv("data/rating.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [4]:
df.user_id.nunique()

73515

In [5]:
df.rating.value_counts()

rating
 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: count, dtype: int64

In [6]:
class MovieDataset:
    def __init__(self, users, movies, ratings) -> None:
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        users = self.users[idx]
        movies = self.movies[idx]
        ratings = self.ratings[idx]

        return {
            "users":torch.tensor(users, dtype=torch.long).to(device),
            "movies":torch.tensor(movies, dtype=torch.long).to(device),
            "rating":torch.tensor(ratings, dtype=torch.long).to(device),
        }

In [7]:
### Demo Model

class HashEmbedding(nn.Module):
    def __init__(self, num_buckets, embedding_dim):
        """
        num_buckets: Number of buckets (fixed number of embedding vectors).
        embedding_dim: Dimension of each embedding vector.
        """
        super(HashEmbedding, self).__init__()
        self.num_buckets = num_buckets
        self.embedding_dim = embedding_dim
        
        # Embedding layer with a fixed number of buckets
        self.embeddings = nn.Embedding(num_buckets, embedding_dim)
        
    def forward(self, ids):
        """
        ids: Tensor of user or item IDs.
        """
        # Hash the IDs to the range of available buckets
        hashed_ids = ids % self.num_buckets
        
        # Retrieve the corresponding embeddings for the hashed IDs
        return self.embeddings(hashed_ids) 

In [8]:
# Parameters
num_buckets = 2  # Number of embedding buckets (buckets size)
embedding_dim = 16  # Dimension of each embedding vector

# Instantiate the hash embedding model
hash_embedding = HashEmbedding(num_buckets, embedding_dim)

# Sample user or movie IDs
ids = torch.tensor([1001, 2002, 3003])  # Example entity IDs

# Perform embedding lookup using hashing
embeddings = hash_embedding(ids)

print("IDs:", ids)
print("Hashed IDs:", ids % num_buckets)
# print("Embeddings:", embeddings)

IDs: tensor([1001, 2002, 3003])
Hashed IDs: tensor([1, 0, 1])


In [9]:
class RecSysModelWithDropout(nn.Module):
    def __init__(self, users_buckets, movies_buckets, embedding_dim=32):
        super().__init__()

        self.users_buckets = users_buckets
        self.movies_buckets = movies_buckets

        self.user_embed = nn.Embedding(users_buckets, embedding_dim, sparse=True)
        self.movie_embed = nn.Embedding(movies_buckets, embedding_dim, sparse=True)
        
        # self.user_biases = torch.nn.Embedding(users_buckets, 1, sparse=True)
        # self.item_biases = torch.nn.Embedding(movies_buckets, 1, sparse=True)

        self.user_embed.weight.data.uniform_(-0.05, 0.05)
        self.movie_embed.weight.data.uniform_(-0.05, 0.05)

        self.dropout0 = nn.Dropout(0.5)
        self.dropout1 = nn.Dropout(0.5)

        self.m = torch.nn.Sigmoid()

    def forward(self, users_ids, movies_ids):
        hashed_users_ids = users_ids % self.users_buckets
        hashed_movies_ids = movies_ids % self.movies_buckets
        
        user_embed = self.user_embed(hashed_users_ids)
        movie_embed = self.movie_embed(hashed_movies_ids)

        output = (self.dropout0(user_embed) * self.dropout1(movie_embed)).sum(1)# + self.user_biases(hashed_users_ids).sum(1) + self.item_biases(hashed_movies_ids).sum(1)
        return self.m(output)

    
    def expand_embeddings(self, users_buckets, movies_buckets):
        """
        Expand the embedding table to accommodate more embeddings.
        new_num_embeddings: New total number of embeddings after expansion.
        """
        if users_buckets > self.users_buckets:        
            # Create a new embedding layer with the expanded size
            new_user_embed = nn.Embedding(users_buckets, self.embedding_dim)
        
            # Copy the old embeddings into the new expanded layer
            with torch.no_grad():
                new_user_embed.weight[:self.users_buckets] = self.user_embed.weight
                self.user_embed = new_user_embed
                self.users_buckets = users_buckets

        if movies_buckets > self.movies_buckets:
            # Create a new embedding layer with the expanded size
            new_movie_embed = nn.Embedding(movies_buckets, self.embedding_dim)
        
            # Copy the old embeddings into the new expanded layer
            with torch.no_grad():
                new_movie_embed.weight[:self.movies_buckets] = self.movie_embed.weight
                self.movie_embed = new_movie_embed
                self.movies_buckets = movies_buckets

In [10]:
df.user_id.values

array([    1,     1,     1, ..., 73515, 73516, 73516])

In [11]:
df.keys()

Index(['user_id', 'anime_id', 'rating'], dtype='object')

In [12]:
df.rating = (df.rating/2).astype(int)

In [13]:
df.rating = (df.rating / 3.5).astype(int)

In [14]:
df.groupby(by=['rating']).count()

Unnamed: 0_level_0,user_id,anime_id
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3957907,3957907
1,3855830,3855830


In [15]:
df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.1, random_state=42, stratify=df.rating.values
)

In [16]:
df_train.head()

Unnamed: 0,user_id,anime_id,rating
2824162,26466,5955,0
3556926,32936,527,0
489764,4992,1726,0
35460,392,7592,0
5615936,52728,3785,1


In [17]:
train_dataset = MovieDataset(
    users=df_train.user_id.values,
    movies=df_train.anime_id.values,
    ratings=df_train.rating.values
)

val_dataset = MovieDataset(
    users=df_valid.user_id.values,
    movies=df_valid.anime_id.values,
    ratings=df_valid.rating.values
)

In [18]:
next(iter(train_dataset))

{'users': tensor(26466), 'movies': tensor(5955), 'rating': tensor(0)}

In [19]:
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)

validation_loader = DataLoader(
    dataset=val_dataset,
    batch_size=1024*2,
    shuffle=True
)

In [22]:
max(df.anime_id)+100

34619

In [260]:
model = RecSysModelWithDropout(
    users_buckets=max(df.user_id)+100,
    movies_buckets=max(df.anime_id)+100,
).to(device=device)

optimizer = torch.optim.SparseAdam(model.parameters())

In [163]:
loss_func = nn.BCELoss()

In [224]:
def fit(model, optimizer, epochs, train_dataset, val_dataset, loss_fn, metric, device, callback={}, save_model=None):

    '''callback = {
        "ModelCheckpoint": {"filepath":"model_chkpt.pth"},
        }'''
    
    if save_model:
        # Load the saved state dictionary
        model.load_state_dict(torch.load(save_model, map_location=device))

            
    train_acc_metric = metric()#.to(device)  # Define the train accuracy metric.
    val_acc_metric = metric()#.to(device)  # Define the validation accuracy metric.
    model.to(device)  # send model to target device
    
    #### Training Step Function ***************
    def train_step(users, movies, labels):
        labels = labels.view(-1,).to(torch.float32)
        # model.train()  # set model to training mode
        optimizer.zero_grad()  # clear gradients
        predictions = model(users, movies)  # Forward pass. 
        loss_value = loss_fn(predictions, labels)  # calculate loss
        loss_value.backward()  # Backward pass
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)  # gradient clipping
        optimizer.step()  # Update weights
        train_acc_metric.update(labels, predictions.detach())  # update training metric
        return loss_value.detach().cpu().item()  # return training loss
    
    #### Validation Step Function ***************
    def test_step(users, movies, labels):
        labels = labels.view(-1, ).to(torch.float32)
        # model.eval()  # set model to evaluation mode
        with torch.inference_mode():  # turn off gradient
            val_predictions = model(users, movies)  # Forward pass
            val_acc_metric.update(labels, val_predictions)  # update validation metric
        return loss_fn(val_predictions, labels).cpu().item()  # return validation loss

    history = {"train_accuracy":[], "val_accuracy":[], "train_loss":[], "val_loss":[]}  # history of train_acc, train_loss, vall_acc, val_loss.
    
    train_lenght = len(train_dataset)  #Define the length of the train dataset.
    val_length = len(val_dataset)
    
    #### Training Loop ***************
    for e in range(1, epochs+1):
        count = 0
        train_loss = 0  # Define the train loss.
        display = tqdm(train_dataset)  # display progress bar
        for i, batch in enumerate(display):
            batch_train_loss = train_step(batch['users'].to(device), batch['movies'].to(device), batch['rating'].to(device))
            train_loss += batch_train_loss/(i+1)                    
            display.set_description(f"Epoch {e} Train Loss: {train_loss:.6f} Tain Accuracy: {train_acc_metric.compute():.6f}")
            count += 1
            
            ### &&&&&&&&&&&&&&&&
            if count == 1000:
#                 scheduler.step()
                count = 0
        
                ### Define the monitor.    
                monitor = {"train_": train_acc_metric.compute(), "train_loss": train_loss}

                ### define a wandb logger. @@@@@@@@@@@@@@@@@
#                 wandb.log(monitor)

        #### Validation Loop ***************    
        val_loss = 0
        display1 = tqdm(val_dataset)
        for i, batch in enumerate(display1):
            val_loss += test_step(batch['users'].to(device), batch['movies'].to(device), batch['rating'].to(device))/(i+1)
            display1.set_description(f"Val Loss: {val_loss :.6f} Val Accuracy: {val_acc_metric.compute():.6f}")        
        
        
        history["train_accuracy"].append(train_acc_metric.compute())  # Update the history of the training.
        history["val_accuracy"].append(val_acc_metric.compute())  # Update the history of the validation.
        history["train_loss"].append(train_loss)  # Update the history of the training.
        history["val_loss"].append(val_loss)  # Update the history of the validation.

        ### Define the monitor.    
        monitor = {"val_": val_acc_metric.compute(), "train_": train_acc_metric.compute(), "val_loss": val_loss, "train_loss": train_loss}

        ### define a wandb logger. @@@@@@@@@@@@@@@@@
#         wandb.log(monitor)
        
        ####  ModelCheckpoint Callbacks ***************
        if callback.get("ModelCheckpoint"):
            path = callback["ModelCheckpoint"]["filepath"]  # path of the model.
            torch.save(model.state_dict(), path)  # Save the best model.
            print("Model is saved.")

        train_acc_metric.reset()  # reset training metric
        val_acc_metric.reset() ### reset validation metric
        
        print()  # new line
        
    return history  # Return the history.        

In [171]:
# model.load_state_dict(torch.load('model_chkpt.pth', map_location=device))

In [172]:
class Accuracy:
    def __init__(self):
        self.correct = 0
        self.total = 0
        
    def update(self, y_true, y_pred):
        y_pred = y_pred > 0.5
        self.correct += torch.eq(y_pred, y_true).sum().item()
        self.total += y_true.size(0)
    
    def compute(self):
        if self.total == 0:
            return 0
        return self.correct / self.total
    
    def reset(self):
        self.correct = 0
        self.total = 0

In [273]:
history = fit(model=model, optimizer=optimizer, epochs=3, train_dataset=train_loader, 
              val_dataset=validation_loader, loss_fn=loss_func, metric=Accuracy, 
              device=device, callback={"ModelCheckpoint": {"filepath":"model_chkpt.pth"},}, save_model=None)

Epoch 1 Train Loss: 5.800163 Tain Accuracy: 0.767300: 100%|██████████| 109881/109881 [06:32<00:00, 279.89it/s]
Val Loss: 3.246384 Val Accuracy: 0.757738: 100%|██████████| 382/382 [00:10<00:00, 36.94it/s]


Model is saved.



Epoch 2 Train Loss: 5.795678 Tain Accuracy: 0.767321: 100%|██████████| 109881/109881 [06:48<00:00, 269.20it/s]
Val Loss: 3.252605 Val Accuracy: 0.757431: 100%|██████████| 382/382 [00:09<00:00, 38.88it/s]


Model is saved.



Epoch 3 Train Loss: 5.823866 Tain Accuracy: 0.767619: 100%|██████████| 109881/109881 [07:15<00:00, 252.13it/s]
Val Loss: 3.257548 Val Accuracy: 0.758271: 100%|██████████| 382/382 [00:10<00:00, 35.01it/s]


Model is saved.



In [58]:
history

{'train_accuracy': [tensor(0.4297, device='cuda:0')],
 'val_accuracy': [tensor(0.3987, device='cuda:0')],
 'train_loss': [10.184275003148612],
 'val_loss': [7.5748745844218295]}

In [274]:
def evaluate(model, validation_loader, th=0.5):
    y_true = []
    y_pred = []

    # Turn off gradient calculations
    with torch.no_grad():
        display = tqdm(validation_loader)
        for i, batch in enumerate(display):
            user = batch['users']
            movie = batch['movies']
            rating = batch['rating'].float()  # Ensure the target is float
            
            # Get model predictions
            model_output = model(user, movie)
            predictions = (model_output > th).float()  # Convert to binary predictions (0 or 1)
            
            # Store the predictions and actual ratings
            y_true.extend(rating.cpu().numpy())  # True labels
            y_pred.extend(predictions.cpu().numpy())  # Predicted labels

    # Compute precision and recall
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')

for i in [0.50]:
    print(f"Threshold: {i}")
    evaluate(model, validation_loader, th=i)

Threshold: 0.5


100%|██████████| 382/382 [00:09<00:00, 38.89it/s]


Precision: 0.7370
Recall: 0.7924
