In [1]:
import pandas as pd
import torch
import pytorch_lightning as pl
from tqdm import tqdm
import torchmetrics
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [2]:
# The users table is currently not loaded; re-enable this line if user
# side features are needed.
# users = pd.read_csv("data/users.csv", sep=",")

# Ratings table: its `user` column sizes the user-id embedding in the model.
ratings = pd.read_csv("data/ratings.csv", sep=",")

# Movies table: provides the `item` and `year` columns used by the model.
movies = pd.read_csv("data/movies.csv", sep=",")

# Long-format genre table (tab-separated) with `item` and `genre` columns.
genres = pd.read_csv("ml-1m/genres.tsv", sep="\t")


## PyTorch dataset

In [3]:
import pandas as pd
import torch
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence

class MovieDataset(data.Dataset):
    """Dataset of per-user rating sequences read from a CSV file.

    Every row must provide the columns ``user_id``, ``sequence_movie_ids``
    and ``sequence_ratings``. The two sequence columns hold Python list
    literals stored as text (e.g. ``"[1, 2, 3]"``). The last element of each
    sequence is returned as the prediction target and the remaining elements
    as the history.
    """

    def __init__(
        self, ratings_file,test=False
    ):
        """
        Args:
            ratings_file (string): Path to the comma-delimited CSV file.
            test (bool): Stored as ``self.test``; not otherwise read by
                this class.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
        )
        self.test = test

    def __len__(self):
        """Return the number of rows (samples) in the CSV file."""
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        """Return one sample.

        Returns:
            tuple: ``(user_id, movie_history, target_movie_id,
            movie_history_ratings, target_movie_rating)`` where the two
            history entries are ``torch.LongTensor`` objects and the targets
            are the last elements of the parsed sequences.

        Raises:
            ValueError / SyntaxError: If a sequence column does not contain a
                valid Python literal.
            IndexError: If a parsed sequence is empty.
        """
        # Named `row` so the `torch.utils.data` module alias `data` is not shadowed.
        row = self.ratings_frame.iloc[idx]
        user_id = row.user_id

        # SECURITY: the sequences come from a file on disk. `ast.literal_eval`
        # only parses literals, whereas `eval` would execute arbitrary
        # expressions embedded in the CSV.
        movie_ids = ast.literal_eval(row.sequence_movie_ids)
        ratings_seq = ast.literal_eval(row.sequence_ratings)

        # Last element is the prediction target, everything before is history.
        target_movie_id = movie_ids[-1]
        target_movie_rating = ratings_seq[-1]

        movie_history = torch.LongTensor(movie_ids[:-1])
        movie_history_ratings = torch.LongTensor(ratings_seq[:-1])

        return user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating

In [4]:
# Collapse the long-format genre table into one row per item whose `genre`
# value is the "|"-joined list of that item's genres. A single groupby/agg
# replaces growing a DataFrame with pd.concat inside a loop (quadratic) and
# also yields an empty frame instead of an undefined name when there are no
# groups.
genre_df = (
    genres.groupby("item")["genre"]
    .agg("|".join)
    .reset_index()
)

# NOTE: this rebinds `movies`; re-running the cell without reloading `movies`
# merges a second time and produces suffixed genre columns.
movies = movies.merge(genre_df, on="item")

In [5]:
# `genres` is rebound from the long-format DataFrame to the list of distinct
# genre names. The isinstance guard keeps the cell re-runnable after that
# rebinding. Sorting makes the resulting column order deterministic (plain
# set iteration order of strings is not stable across interpreter runs).
if isinstance(genres, pd.DataFrame):
    genres = sorted(set(genres["genre"]))

# One 0/1 indicator column per genre, derived from the "|"-joined `genre`
# column of `movies`.
for genre in genres:
    movies[genre] = movies["genre"].apply(
        lambda values: int(genre in values.split("|"))
    )

# Number of movies per sample fed to the model (history plus target).
sequence_length = 8

In [7]:
# Display the configured sequence length.
sequence_length

8

In [8]:
class BST(pl.LightningModule):
    """Behavior Sequence Transformer that regresses the rating of a target movie.

    The model embeds the movie history together with the target movie, runs
    that sequence through one transformer encoder layer, flattens the result,
    concatenates it with the user embedding and feeds it to an MLP that emits
    one scalar per sample.

    Relies on the notebook-level globals ``ratings``, ``movies``, ``genres``
    and ``sequence_length`` and on the ``MovieDataset`` class.
    """

    def __init__(
        self, args=None,
    ):
        """
        Args:
            args: Optional configuration object. It is stored as ``self.args``
                and captured by ``save_hyperparameters``; nothing else in this
                class reads it.
        """
        # A single base-class initialisation is sufficient.
        super().__init__()

        self.save_hyperparameters()
        self.args = args

        # Vocabulary sizes and embedding widths derived from the data tables.
        user_count = int(ratings["user"].max()) + 1
        user_dim = int(math.sqrt(ratings["user"].max())) + 1
        movie_count = int(movies["item"].max()) + 1
        movie_dim = int(math.sqrt(movies["item"].max())) + 1

        #-------------------
        # Embedding layers
        ## Users
        self.embeddings_user_id = nn.Embedding(user_count, user_dim)
        # User side features (sex, age group, occupation, zip code) are
        # disabled because the `users` table is not loaded in this notebook.

        ## Movies
        self.embeddings_movie_id = nn.Embedding(movie_count, movie_dim)
        # Positional embedding; defined but currently not applied in
        # `encode_input`.
        self.embeddings_position  = nn.Embedding(sequence_length, movie_dim)

        ### Movies features embeddings (defined but not used in `forward`)
        genre_vectors = movies[genres].to_numpy()
        self.embeddings_movie_genre = nn.Embedding(
            genre_vectors.shape[0], genre_vectors.shape[1]
        )
        # The genre embedding is left trainable (True is also the default).
        # Set this to False to freeze it.
        self.embeddings_movie_genre.weight.requires_grad = True

        year_count = len(movies.year.unique())
        self.embeddings_movie_year = nn.Embedding(
            year_count, int(math.sqrt(year_count))
        )

        # Network
        # d_model must equal the movie embedding width (previously hard-coded
        # as 346). With nhead=2 that width has to be even.
        # batch_first=True: `encode_input` builds (batch, seq, feature)
        # tensors, whereas the layer's default layout is (seq, batch, feature),
        # which would make attention run across the samples of a batch.
        self.transfomerlayer = nn.TransformerEncoderLayer(
            movie_dim, 2, dropout=0.2, batch_first=True
        )
        # MLP input = user embedding + flattened transformer output over
        # `sequence_length` positions (previously hard-coded as 3141).
        self.linear = nn.Sequential(
            nn.Linear(
                user_dim + sequence_length * movie_dim,
                1024,
            ),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        )
        self.criterion = torch.nn.MSELoss()
        self.mae = torchmetrics.MeanAbsoluteError()
        self.mse = torchmetrics.MeanSquaredError()

    def encode_input(self,inputs):
        """Embed a batch produced by ``MovieDataset``.

        Args:
            inputs: Tuple ``(user_id, movie_history, target_movie_id,
                movie_history_ratings, target_movie_rating)``.
                Assumes ``movie_history`` is (batch, sequence_length - 1) and
                the other id/rating entries are (batch,) -- TODO confirm
                against the dataloader's collation.

        Returns:
            tuple: ``(transformer_features, user_embedding,
            target_movie_rating.float())``.
        """
        user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating = inputs

        # MOVIES
        movie_history = self.embeddings_movie_id(movie_history)
        target_movie = self.embeddings_movie_id(target_movie_id)

        # Position embeddings and history ratings are currently not mixed in.
        encoded_sequence_movies_with_poistion_and_rating = movie_history

        # Append the target movie as the last element along dim 1.
        target_movie = torch.unsqueeze(target_movie, 1)
        transfomer_features = torch.cat((encoded_sequence_movies_with_poistion_and_rating, target_movie),dim=1)

        # USERS
        user_id = self.embeddings_user_id(user_id)

        return transfomer_features, user_id, target_movie_rating.float()

    def forward(self, batch):
        """Return ``(prediction, target_movie_rating)`` for a batch.

        ``prediction`` is the MLP output with a trailing dimension of size 1.
        """
        transfomer_features, user_id, target_movie_rating = self.encode_input(batch)
        transformer_output = self.transfomerlayer(transfomer_features)
        transformer_output = torch.flatten(transformer_output,start_dim=1)

        # Concat with other features
        features = torch.cat((user_id, transformer_output),dim=1)

        output = self.linear(features)
        return output, target_movie_rating

    def _loss_and_metrics(self, batch):
        """Run the model on ``batch`` and return ``(loss, mae, rmse)``."""
        out, target_movie_rating = self(batch)
        out = out.flatten()
        loss = self.criterion(out, target_movie_rating)

        mae = self.mae(out, target_movie_rating)
        mse = self.mse(out, target_movie_rating)
        rmse = torch.sqrt(mse)
        return loss, mae, rmse

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss and log per-step MAE, RMSE and loss."""
        loss, mae, rmse = self._loss_and_metrics(batch)
        self.log(
            "train/mae", mae, on_step=True, on_epoch=False, prog_bar=False
        )

        self.log(
            "train/rmse", rmse, on_step=True, on_epoch=False, prog_bar=False
        )

        self.log("train/step_loss", loss, on_step=True, on_epoch=False, prog_bar=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the batch loss, MAE and RMSE for epoch-level aggregation."""
        loss, mae, rmse = self._loss_and_metrics(batch)

        return {"val_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()}

    def test_step(self, batch, batch_idx):
        """Return the batch's user ids and predicted ratings.

        The predictions are stored under the key ``"top10"``.
        """
        out, target_movie_rating = self(batch)
        out = out.flatten()

        return {"users": batch[0], "top10":out}

    def validation_epoch_end(self, outputs):
        """Log the unweighted mean of the per-batch validation metrics."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_mae = torch.stack([x["mae"] for x in outputs]).mean()
        avg_rmse = torch.stack([x["rmse"] for x in outputs]).mean()

        self.log("val/loss", avg_loss, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/mae", avg_mae, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/rmse", avg_rmse, on_step=False, on_epoch=True, prog_bar=False)


    def test_epoch_end(self, outputs):
        """Write all test predictions to ``lightning_logs/predict.csv``.

        The CSV has a ``users`` column with the user ids and an ``item``
        column that holds the model outputs.
        """
        users = torch.cat([x["users"] for x in outputs])
        y_hat = torch.cat([x["top10"] for x in outputs])
        users = users.tolist()
        y_hat = y_hat.tolist()

        data = {"users": users, "item": y_hat}
        df = pd.DataFrame.from_dict(data)
        print(len(df))
        # Make sure the target directory exists before writing.
        os.makedirs("lightning_logs", exist_ok=True)
        df.to_csv("lightning_logs/predict.csv", index=False)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters (lr=0.0005)."""
        return torch.optim.AdamW(self.parameters(), lr=0.0005)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with ``--learning_rate``.

        NOTE: ``configure_optimizers`` uses a fixed learning rate and does not
        read this argument.
        """
        # Imported locally: the notebook's import cell does not provide it.
        from argparse import ArgumentParser

        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument("--learning_rate", type=float, default=0.01)
        return parser

    ####################
    # DATA RELATED HOOKS
    ####################

    def setup(self, stage=None):
        """Load the train, validation and test datasets from ``data/``."""
        print("Loading datasets")
        self.train_dataset = MovieDataset("data/train_data.csv")
        self.val_dataset = MovieDataset("data/val_data.csv")
        self.test_dataset = MovieDataset("data/test_data.csv")
        print("Done")

    def _make_loader(self, dataset):
        """Build the DataLoader configuration shared by all three splits."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )

    def train_dataloader(self):
        """Return the training DataLoader.

        NOTE(review): training data is not shuffled -- confirm this is
        intentional.
        """
        return self._make_loader(self.train_dataset)

    def val_dataloader(self):
        """Return the validation DataLoader."""
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        """Return the test DataLoader."""
        return self._make_loader(self.test_dataset)
        
# Instantiate the model with its default arguments.
model = BST()

# Single-epoch run on one GPU.
trainer = pl.Trainer(
    max_epochs=1,
    gpus=1,
)

# Train, then run the test loop (which writes the prediction CSV).
trainer.fit(model)
trainer.test()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Loading datasets
Done


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type                    | Params
-------------------------------------------------------------------
0 | embeddings_user_id     | Embedding               | 51.7 M
1 | embeddings_movie_id    | Embedding               | 41.2 M
2 | embeddings_position    | Embedding               | 2.8 K 
3 | embeddings_movie_genre | Embedding               | 122 K 
4 | embeddings_movie_year  | Embedding               | 1.0 K 
5 | transfomerlayer        | TransformerEncoderLayer | 1.9 M 
6 | linear                 | Sequential              | 3.9 M 
7 | criterion              | MSELoss                 | 0     
8 | mae                    | MeanAbsoluteError       | 0     
9 | mse                    | MeanSquaredError        | 0     
-------------------------------------------------------------------
98.8 M    Trainable params
0         Non-trainable params
98.8 M    Total params
395.137   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(


Loading datasets


Restoring states from the checkpoint path at /opt/ml/zBST/Behavior-Sequence-Transformer-Pytorch/lightning_logs/version_23/checkpoints/epoch=0-step=32978.ckpt


Done


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /opt/ml/zBST/Behavior-Sequence-Transformer-Pytorch/lightning_logs/version_23/checkpoints/epoch=0-step=32978.ckpt


Testing: 0it [00:00, ?it/s]

4966311


[{}]

In [24]:
model.test_dataset[0]
# First test sample, returned as the tuple:
# user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating

(11,
 tensor([4643,  170,  531,  616, 2140, 2722, 2313]),
 2688,
 tensor([1, 1, 1, 1, 1, 1, 1]),
 1)