<a href="https://colab.research.google.com/github/ShawnLiu119/Recommendation_Transformer/blob/main/Recommendation_Engine_using_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Recommendation Engine using Transformer**

Reference material:<br>
https://www.aboutwayfair.com/careers/tech-blog/mars-transformer-networks-for-sequential-recommendation
<br>
https://medium.com/hepsiburada-data-science/personalized-recommendations-with-transformers-11c13cff2be

In [1]:
import time
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from collections import Counter

from zipfile import ZipFile
from urllib.request import urlretrieve

import pandas as pd
import numpy as np



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/kaggle_data'

In [4]:
%cd '/content/drive/MyDrive/kaggle_data'

/content/drive/MyDrive/kaggle_data


In [5]:
# Downloading dataset
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")

# Extract inside a context manager so the archive handle is always closed.
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall(path)

# Loading dataset
# The files are read from the directory they were extracted to (`path`), so
# the loads do not depend on the current working directory.
# "::" is longer than one character, which pandas' default C parser cannot
# handle; requesting the python engine explicitly avoids the ParserWarning
# emitted when pandas has to fall back to it on its own.
users = pd.read_csv(
    os.path.join(path, "ml-1m", "users.dat"),
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    os.path.join(path, "ml-1m", "ratings.dat"),
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)
movies = pd.read_csv(
    os.path.join(path, "ml-1m", "movies.dat"),
    sep="::",
    names=["movie_id", "title", "genres"],
    encoding='latin-1',
    engine="python",
)

  users = pd.read_csv(
  ratings = pd.read_csv(
  movies = pd.read_csv(


In [6]:
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
movies.head() #this could be products table

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings.head()  # this could be the order/transaction table

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
# Turn the numeric ids into prefixed strings ("user_<id>", "movie_<id>") so
# they are never written or interpreted as integer/float values.
users["user_id"] = users["user_id"].map("user_{}".format)
movies["movie_id"] = movies["movie_id"].map("movie_{}".format)
ratings["movie_id"] = ratings["movie_id"].map("movie_{}".format)
ratings["user_id"] = ratings["user_id"].map("user_{}".format)

### Step 1: create vocabulary
<br>
A vocabulary does not have to consist of words: here each movie id and each user id is treated as a token.

In [10]:
# --- Movie vocabulary ---------------------------------------------------
# Every distinct movie id becomes one token. torchtext's `vocab` is built
# from a collections.Counter holding the frequency of each value; because
# the ids are unique, every count is 1.
# https://torchtext.readthedocs.io/en/latest/vocab.html
movie_ids = movies["movie_id"].unique()
movie_counter = Counter(movie_ids)
movie_vocab = vocab(movie_counter, specials=["<unk>"])

# Dictionary mapping tokens to indices, used for indexing the input ids.
movie_vocab_stoi = movie_vocab.get_stoi()

# --- User vocabulary ----------------------------------------------------
# Built the same way from the unique user ids.
user_ids = users["user_id"].unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=["<unk>"])
user_vocab_stoi = user_vocab.get_stoi()

# Movie id -> title lookup dictionary.
movie_title_dict = {
    movie_id: title for movie_id, title in zip(movies["movie_id"], movies["title"])
}

In [11]:
print(movie_vocab_stoi)

{'movie_3951': 3882, 'movie_3949': 3880, 'movie_3947': 3878, 'movie_3946': 3877, 'movie_3944': 3875, 'movie_3943': 3874, 'movie_3941': 3872, 'movie_3940': 3871, 'movie_3938': 3869, 'movie_3936': 3867, 'movie_3932': 3863, 'movie_3931': 3862, 'movie_3924': 3855, 'movie_3920': 3851, 'movie_3916': 3847, 'movie_3914': 3845, 'movie_3912': 3843, 'movie_3911': 3842, 'movie_3909': 3840, 'movie_3908': 3839, 'movie_3904': 3835, 'movie_3903': 3834, 'movie_3899': 3830, 'movie_3897': 3828, 'movie_3896': 3827, 'movie_3895': 3826, 'movie_3891': 3822, 'movie_3890': 3821, 'movie_3886': 3817, 'movie_3884': 3815, 'movie_3881': 3812, 'movie_3879': 3810, 'movie_3878': 3809, 'movie_3877': 3808, 'movie_3876': 3807, 'movie_3871': 3802, 'movie_3867': 3798, 'movie_3863': 3794, 'movie_3862': 3793, 'movie_3861': 3792, 'movie_3856': 3787, 'movie_3855': 3786, 'movie_3853': 3784, 'movie_3851': 3782, 'movie_3849': 3780, 'movie_3847': 3778, 'movie_3844': 3775, 'movie_3842': 3773, 'movie_3831': 3762, 'movie_3830': 3761,

### Step 2: generating sequence

All interactions of users are first sorted by their interaction timestamp and then divided into sub sequences to train our model.

In [12]:
# Sort every rating by unix_timestamp first, then group by user_id, so each
# user's rows are visited in order of increasing timestamp.
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

# One row per user: the user id plus that user's movie ids and timestamps
# collected into lists.
ratings_data = pd.DataFrame(
    {
        "user_id": list(ratings_group.groups),
        "movie_ids": ratings_group["movie_id"].apply(list).tolist(),
        "timestamps": ratings_group["unix_timestamp"].apply(list).tolist(),
    }
)

# Sliding-window settings consumed by create_sequences.
sequence_length = 4  # number of movies per window
min_history = 1  # a window is started only while more than this many items remain
step_size = 2  # offset between the starts of consecutive windows

# Creating sequences from lists with sliding window
def create_sequences(values, window_size, step_size, min_history):
  """Cut ``values`` into (possibly overlapping) windows with a sliding window.

  Windows start at offsets 0, ``step_size``, 2 * ``step_size``, ... and a
  window is started only while more than ``min_history`` items remain from
  its start offset. Windows near the end of ``values`` can therefore be
  shorter than ``window_size``.

  Args:
    values: Sliceable sequence with a length (e.g. a list of movie ids).
    window_size: Maximum number of items in one window.
    step_size: Offset between consecutive window starts; must be positive.
    min_history: A window is started only where more than this many items
      remain; must be non-negative.

  Returns:
    List of slices of ``values``, in order of increasing start offset.

  Raises:
    ValueError: If ``step_size`` is not positive or ``min_history`` is
      negative (either value would otherwise never terminate).
  """
  if step_size <= 0:
    raise ValueError(f"step_size must be positive, got {step_size}")
  if min_history < 0:
    raise ValueError(f"min_history must be non-negative, got {min_history}")

  # A start offset is valid while len(values) - start > min_history. Working
  # with offsets avoids re-slicing the remaining tail on every iteration.
  first_invalid_start = len(values) - min_history
  return [
      values[start : start + window_size]
      for start in range(0, first_invalid_start, step_size)
  ]

# Replace each user's full movie list with its list of sliding windows.
ratings_data["movie_ids"] = ratings_data["movie_ids"].apply(
    create_sequences,
    args=(sequence_length, step_size, min_history),
)

# Remove the timestamps column from ratings_data in place.
ratings_data.drop(columns=["timestamps"], inplace=True)

# Sub-sequences are exploded: explode() turns each element of a list-like
# cell into its own row, because a user may have more than one sequence.
# The exploded column is then renamed to describe what it now holds.
ratings_data_transformed = (
    ratings_data[["user_id", "movie_ids"]]
    .explode("movie_ids", ignore_index=True)
    .rename(columns={"movie_ids": "sequence_movie_ids"})
)


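sequence_length = 4 <br>
Every four tokens (movies) form one sequence.

min_history = 1<br>
A new window is started only while more than one token remains, so `start_index` is never the last token.

step_size = 2<br>
A new sequence starts every 2 tokens.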


In [13]:
ratings_data.head()

Unnamed: 0,user_id,movie_ids
0,user_1,"[[movie_3186, movie_1721, movie_1270, movie_10..."
1,user_10,"[[movie_597, movie_858, movie_743, movie_1210]..."
2,user_100,"[[movie_260, movie_1676, movie_1198, movie_541..."
3,user_1000,"[[movie_971, movie_260, movie_2990, movie_2973..."
4,user_1001,"[[movie_1198, movie_1617, movie_2885, movie_39..."


In [14]:
ratings_group.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
1000138,user_6040,movie_858,4,956703932
1000153,user_6040,movie_2384,4,956703954
999873,user_6040,movie_593,5,956703954
1000007,user_6040,movie_1961,4,956703977
1000192,user_6040,movie_2019,5,956703977
...,...,...,...,...
480399,user_2947,movie_902,5,1037737203
480447,user_2947,movie_480,2,1037737218
480513,user_2947,movie_1080,4,1037737230
480410,user_2947,movie_1617,4,1037737242


In [15]:
ratings_data.movie_ids[0]

[['movie_3186', 'movie_1721', 'movie_1270', 'movie_1022'],
 ['movie_1270', 'movie_1022', 'movie_2340', 'movie_1836'],
 ['movie_2340', 'movie_1836', 'movie_3408', 'movie_1207'],
 ['movie_3408', 'movie_1207', 'movie_2804', 'movie_260'],
 ['movie_2804', 'movie_260', 'movie_720', 'movie_1193'],
 ['movie_720', 'movie_1193', 'movie_919', 'movie_608'],
 ['movie_919', 'movie_608', 'movie_2692', 'movie_1961'],
 ['movie_2692', 'movie_1961', 'movie_2028', 'movie_3105'],
 ['movie_2028', 'movie_3105', 'movie_938', 'movie_1035'],
 ['movie_938', 'movie_1035', 'movie_1962', 'movie_1028'],
 ['movie_1962', 'movie_1028', 'movie_2018', 'movie_150'],
 ['movie_2018', 'movie_150', 'movie_1097', 'movie_914'],
 ['movie_1097', 'movie_914', 'movie_1287', 'movie_2797'],
 ['movie_1287', 'movie_2797', 'movie_1246', 'movie_2762'],
 ['movie_1246', 'movie_2762', 'movie_661', 'movie_2918'],
 ['movie_661', 'movie_2918', 'movie_531', 'movie_3114'],
 ['movie_531', 'movie_3114', 'movie_2791', 'movie_1029'],
 ['movie_2791',

In [16]:
ratings_data_transformed.head()

Unnamed: 0,user_id,sequence_movie_ids
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_1022]"
1,user_1,"[movie_1270, movie_1022, movie_2340, movie_1836]"
2,user_1,"[movie_2340, movie_1836, movie_3408, movie_1207]"
3,user_1,"[movie_3408, movie_1207, movie_2804, movie_260]"
4,user_1,"[movie_2804, movie_260, movie_720, movie_1193]"


In [17]:
ratings_data_transformed.loc[0, 'sequence_movie_ids']

['movie_3186', 'movie_1721', 'movie_1270', 'movie_1022']

### Step 3 Train Test Split

In [18]:
# TODO: this is a purely random split over windows. Consecutive windows of the
# same user overlap (step_size < sequence_length), and the user's time order
# is ignored, so a time-aware split should be considered.

# Seeded generator so that the split is identical on every
# Restart Kernel -> Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Random indexing: True marks a training row (about 85% of the rows).
random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.85

# Split train data
df_train_data = ratings_data_transformed[random_selection]
train_data_raw = df_train_data[["user_id", "sequence_movie_ids"]].values

# Split test data
df_test_data = ratings_data_transformed[~random_selection]
test_data_raw = df_test_data[["user_id", "sequence_movie_ids"]].values

In [19]:
random_selection

array([ True,  True, False, ...,  True, False, False])

In [20]:
print(train_data_raw.shape)
print(test_data_raw.shape)

(423729, 2)
(74894, 2)


In [21]:
train_data_raw[:10]

array([['user_1',
        list(['movie_3186', 'movie_1721', 'movie_1270', 'movie_1022'])],
       ['user_1',
        list(['movie_1270', 'movie_1022', 'movie_2340', 'movie_1836'])],
       ['user_1',
        list(['movie_3408', 'movie_1207', 'movie_2804', 'movie_260'])],
       ['user_1',
        list(['movie_2804', 'movie_260', 'movie_720', 'movie_1193'])],
       ['user_1',
        list(['movie_720', 'movie_1193', 'movie_919', 'movie_608'])],
       ['user_1',
        list(['movie_919', 'movie_608', 'movie_2692', 'movie_1961'])],
       ['user_1',
        list(['movie_2692', 'movie_1961', 'movie_2028', 'movie_3105'])],
       ['user_1',
        list(['movie_938', 'movie_1035', 'movie_1962', 'movie_1028'])],
       ['user_1',
        list(['movie_1962', 'movie_1028', 'movie_2018', 'movie_150'])],
       ['user_1',
        list(['movie_2018', 'movie_150', 'movie_1097', 'movie_914'])]],
      dtype=object)

In [22]:
# Pytorch Dataset for user interactions
class MovieSeqDataset(Dataset):
    """Map-style dataset over ``(user id, movie id sequence)`` rows.

    Every row is converted to index tensors through the token-to-index
    dictionaries supplied at construction time.
    """

    def __init__(self, data, movie_vocab_stoi, user_vocab_stoi):
        """Store the raw rows together with both lookup dictionaries."""
        self.data = data
        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(movie_index_tensor, user_index_tensor)`` for row ``idx``.

        Tokens are looked up directly in the dictionaries, so a token that
        is missing from them raises ``KeyError``.
        """
        user_token, movie_tokens = self.data[idx]
        movie_lookup = self.movie_vocab_stoi
        movie_indices = [movie_lookup[token] for token in movie_tokens]
        user_index = self.user_vocab_stoi[user_token]
        return torch.tensor(movie_indices), torch.tensor(user_index)


# Collate function and padding
def collate_batch(batch, padding_value=None):
    """Collate ``(movie_tensor, user_tensor)`` items into batch tensors.

    Args:
        batch: Sequence of ``(movie_tensor, user_tensor)`` pairs.
        padding_value: Value used to right-pad shorter movie sequences.
            When ``None`` (the default, which is what a ``DataLoader`` uses
            because it calls ``collate_fn(batch)``), the index of ``'<unk>'``
            in the notebook-level ``movie_vocab_stoi`` is used.

    Returns:
        Tuple ``(movies, users)``: ``movies`` is the batch-first padded
        stack of the movie tensors, ``users`` the stacked user tensors.
    """
    if padding_value is None:
        # Resolved at call time so the function can be used without the
        # notebook global whenever an explicit padding value is supplied.
        padding_value = movie_vocab_stoi['<unk>']

    movie_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    padded_movies = pad_sequence(movie_list, padding_value=padding_value, batch_first=True)
    return padded_movies, torch.stack(user_list)


BATCH_SIZE = 256

# One dataset per split; both share the same movie/user lookup dictionaries.
train_dataset = MovieSeqDataset(
    data=train_data_raw,
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
)
val_dataset = MovieSeqDataset(
    data=test_data_raw,
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
)

# DataLoaders: only the training loader shuffles; both pad via collate_batch.
train_iter = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
val_iter = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

### **Step 4 Model definition**

### 2.1 Positional Encoder

We start by defining the positional encoder, which is crucial for sequence-based models like the Transformer. This encoder will capture the positions of movie interactions in our sequences, thus embedding the order information that the Transformer model needs.

In [23]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table of shape ``[max_len, 1, d_model]`` is precomputed once; even
    feature columns hold sines and odd feature columns hold cosines of
    ``position * div_term``. Dropout is applied after the addition.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Column vector of positions 0 .. max_len - 1.
        position = torch.arange(max_len).unsqueeze(1)

        # One scaling factor per (sine, cosine) column pair.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term

        # Fill the table: sines in the even columns, cosines in the odd ones.
        table = torch.zeros(max_len, 1, d_model)
        table[:, 0, 0::2] = torch.sin(angles)
        table[:, 0, 1::2] = torch.cos(angles)

        # Registered as a buffer named 'pe' (module state, not a parameter).
        self.register_buffer('pe', table)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            ``x`` plus the encodings of the first ``seq_len`` positions
            (broadcast over the batch dimension), passed through dropout.
        """
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len])

### 2.2 Transformer Model

Following the definition of our positional encoder, we then establish our transformer model. This model takes both the user id and the movie id sequence as input, and it is responsible for generating the output movie predictions.

In [24]:
class TransformerModel(nn.Module):
    """Next-movie predictor: Transformer encoder over a movie sequence + user id.

    The movie sequence is embedded, position-encoded and passed through a
    stack of ``TransformerEncoderLayer``s; the user embedding is concatenated
    to every position before a linear layer maps to movie-vocabulary logits.

    Args:
        ntoken: Size of the movie vocabulary (also the number of output logits).
        nuser: Size of the user vocabulary.
        d_model: Embedding dimension of movies and users.
        nhead: Number of attention heads.
        d_hid: Dimension of the encoder feed-forward network.
        nlayers: Number of encoder layers.
        dropout: Dropout probability.
    """

    def __init__(self, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Multihead attention mechanism.
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Embedding layers
        self.movie_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)

        # Defining the size of the input to the model.
        self.d_model = d_model

        # Linear layer to map the output to the movie vocabulary.
        # Its input is the encoder output concatenated with the user embedding.
        self.linear = nn.Linear(2*d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Initialize embedding and linear weights uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.movie_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, src_mask: Tensor = None) -> Tensor:
        """Compute next-movie logits for every position of ``src``.

        Args:
            src: Movie indices, shape ``[batch_size, seq_len]`` (batch-first).
            user: User indices, shape ``[batch_size, 1]``.
            src_mask: Optional ``[seq_len, seq_len]`` attention mask. When
                ``None``, a causal mask is used so that a position cannot
                attend to later positions. Pass an all-zero mask to attend
                in both directions.

        Returns:
            Logits of shape ``[batch_size, seq_len, ntoken]``.
        """
        # Embedding movie ids and user id
        movie_embed = self.movie_embedding(src) * math.sqrt(self.d_model)
        user_embed = self.user_embedding(user) * math.sqrt(self.d_model)

        if src_mask is None:
            # The training targets are the inputs shifted by one step, so
            # without a causal mask each position could simply read its own
            # target from the next input position.
            seq_len = src.size(1)
            src_mask = torch.triu(
                torch.full((seq_len, seq_len), float('-inf'), device=src.device),
                diagonal=1,
            )

        # PositionalEncoding and the encoder (built with the default
        # batch_first=False) both expect [seq_len, batch_size, d_model].
        # Feeding the batch-first tensor directly would encode positions by
        # batch index and attend across the batch, so transpose around them.
        hidden = movie_embed.transpose(0, 1)

        # positional encoding
        hidden = self.pos_encoder(hidden)

        # encoder stack, then back to [batch_size, seq_len, d_model]
        hidden = self.transformer_encoder(hidden, src_mask)
        output = hidden.transpose(0, 1)

        # Expand user_embed tensor along the sequence length dimension
        user_embed = user_embed.expand(-1, output.size(1), -1)

        # Concatenate user embeddings with transformer output
        output = torch.cat((output, user_embed), dim=-1)

        return self.linear(output)



In [25]:
# Hyperparameters

ntokens = len(movie_vocab)  # size of vocabulary
nusers = len(user_vocab)
emsize = 128  # embedding dimension
d_hid = 128  # dimension of the feedforward network model
nlayers = 2  # number of ``nn.TransformerEncoderLayer``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(ntokens, nusers, emsize, nhead, d_hid, nlayers, dropout).to(device)

# collate_batch pads short sequences with the '<unk>' index; ignoring that
# index keeps the padded target positions out of the loss, so the model is
# not trained to predict padding.
criterion = nn.CrossEntropyLoss(ignore_index=movie_vocab_stoi['<unk>'])
lr = 1.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 after every scheduler step
# (step_size is a count of steps, hence an int).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)



### STEP5: Train and Evaluate

3.1 Train Function

We train the model by loading data, predicting movies, calculating the loss and updating the parameters. Ultimately, our aim is to iteratively improve the performance of our transformer model.

In [26]:
def train(model: nn.Module, train_iter, epoch) -> None:
    """Run one training pass over ``train_iter``.

    Relies on the notebook-level ``device``, ``criterion``, ``optimizer``
    and ``scheduler`` objects.

    Args:
        model: Model called as ``model(inputs, user_data)``.
        train_iter: Iterable yielding ``(movie_data, user_data)`` batches,
            with ``movie_data`` of shape ``[batch_size, seq_len]``.
        epoch: Epoch number, used only in the progress log.
    """
    # Switch to training mode
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, (movie_data, user_data) in enumerate(train_iter):
        # Load movie sequence and user id
        movie_data, user_data = movie_data.to(device), user_data.to(device)
        user_data = user_data.reshape(-1, 1)

        # Split movie sequence to inputs and targets: the target at each
        # position is the movie that follows it in the sequence.
        inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
        targets_flat = targets.reshape(-1)

        # Predict movies; flatten to [batch_size * seq_len, n_classes] using
        # the model's own output width rather than a notebook global.
        output = model(inputs, user_data)
        output_flat = output.reshape(-1, output.size(-1))

        # Backpropagation process
        loss = criterion(output_flat, targets_flat)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Results: report after every `log_interval` batches. Testing
        # (i + 1) makes every window, including the first one, contain
        # exactly `log_interval` batches, matching the divisor below.
        if (i + 1) % log_interval == 0:
            current_lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {current_lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [27]:
def evaluate(model: nn.Module, eval_data: DataLoader) -> float:
    """Return the mean per-batch loss of ``model`` over ``eval_data``.

    Relies on the notebook-level ``device`` and ``criterion`` objects.

    Args:
        model: Model called as ``model(inputs, user_data)``.
        eval_data: Iterable yielding ``(movie_data, user_data)`` batches,
            with ``movie_data`` of shape ``[batch_size, seq_len]``.

    Returns:
        Sum of the batch losses divided by the number of batches, or
        ``nan`` when ``eval_data`` yields no batch.
    """
    # Switch the model to evaluation mode.
    # This is necessary for layers like dropout.
    model.eval()
    total_loss = 0.
    n_batches = 0

    with torch.no_grad():
        for movie_data, user_data in eval_data:
            # Load movie sequence and user id
            movie_data, user_data = movie_data.to(device), user_data.to(device)
            user_data = user_data.reshape(-1, 1)
            # Split movie sequence to inputs and targets
            inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
            targets_flat = targets.reshape(-1)
            # Predict movies
            output = model(inputs, user_data)
            output_flat = output.reshape(-1, output.size(-1))
            # Calculate loss
            loss = criterion(output_flat, targets_flat)
            total_loss += loss.item()
            n_batches += 1

    # Each loss is already a per-batch mean, so average over the number of
    # batches actually seen (not that number minus one).
    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches