<a href="https://colab.research.google.com/github/SeongBeomLEE/RecsysTutorial/blob/main/Caser/Caser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install python-box

Collecting python-box
  Downloading python_box-5.4.1-py3-none-any.whl (21 kB)
Installing collected packages: python-box
Successfully installed python-box-5.4.1


In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from box import Box

import warnings

warnings.filterwarnings(action='ignore')

data_dir = '/content/drive/MyDrive/RecsysTutorial/Data/MovieLens/'
model_dir = '/content/drive/MyDrive/RecsysTutorial/Model/'

In [2]:
# 기본 설정은 최대한 오피셜 깃허브 코드와 동일하게 함
# l2 norm은 적용하지 않음
config = {
    'k' : 30,
    'epochs' : 50,
    'lr' : 1e-03,
    'batch_size' : 512,
    'device' : torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),

    'L' : 5,
    'T' : 3,
    'd' : 50,
    'nv' : 4,
    'nh' :16,
    'drop_ratio' : 0.5,
    'neg_samples' : 3,
}

config = Box(config)

# 데이터 확인

In [3]:
ratings_df = pd.read_csv(data_dir + 'ratings.csv')
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [4]:
num_users = ratings_df['userId'].nunique()
num_items = ratings_df['movieId'].nunique()

sparsity = 1 - len(ratings_df) / (num_users * num_items)

print(f'전체 User 수: {num_users}')
print(f'전체 Item 수: {num_items}')
print(f'행렬의 희소성: {sparsity:.4f}')

전체 User 수: 671
전체 Item 수: 9066
행렬의 희소성: 0.9836


# DataSet

In [5]:
class MakeDataset():
    def __init__(self, config, df : pd.DataFrame):
        self.config = config
        self.df = df
        self.user_encoder, self.user_decoder, self.item_encoder, self.item_decoder = self.get_encoder_decoder()
        self.num_users = len(self.user_encoder)
        self.num_items = len(self.item_encoder)
        self.all_items = [i for i in range(self.num_items)]

        self.df['userId'] = self.df['userId'].apply(lambda x : self.user_encoder[x])
        self.df['movieId'] = self.df['movieId'].apply(lambda x : self.item_encoder[x])
        self.user_neg_candidate = self.get_user_neg_candidate()

        self.train_sequence_user, self.train_sequence_L, self.train_sequence_T, self.test_sequence_user, self.test_sequence_L, self.test_sequence_T = self.train_test_data_split()

    def train_test_data_split(self):

        user_id_li = self.df['userId'].unique()
        train_sequence_user = []
        test_sequence_user = []

        train_sequence_L = []
        test_sequence_L = []

        train_sequence_T = []
        test_sequence_T = []
        for user_id in user_id_li:
            sequence_user = []
            sequence_L = []
            sequence_T = []

            user_df = self.df[self.df['userId'] == user_id].sort_values('timestamp')
            movieId_li = user_df['movieId'].tolist()
            seq_length = self.config.T + self.config.L
            if len(movieId_li) > seq_length:
                for i in range(0, len(movieId_li) - seq_length + 1):
                    sequence = movieId_li[i : i + seq_length]
                    
                    sequence_user.append(user_id)
                    sequence_L.append(sequence[ : -self.config.T])
                    sequence_T.append(sequence[-self.config.T : ])

            train_sequence_user.extend(sequence_user[:-1])
            test_sequence_user.extend(sequence_user[-1:])

            train_sequence_L.extend(sequence_L[:-1])
            test_sequence_L.extend(sequence_L[-1:])

            train_sequence_T.extend(sequence_T[:-1])
            test_sequence_T.extend(sequence_T[-1:])

        return train_sequence_user, train_sequence_L, train_sequence_T, test_sequence_user, test_sequence_L, test_sequence_T

    def get_neg_samples(self, n : int, user_id_li):
        neg_samples = []
        for u in user_id_li:
            neg_sample = []
            u_neg_candidate = self.user_neg_candidate[u]
            for _ in range(n):
                neg_sample.append(u_neg_candidate[np.random.randint(len(u_neg_candidate))])
            neg_samples.append(neg_sample)

        return neg_samples

    def get_user_neg_candidate(self):
        user_candidate = {}
        for user_id in self.df['userId'].unique():
            movieId_li = self.df[self.df['userId'] == user_id]['movieId'].tolist()
            movieId_li = [movieId for movieId in movieId_li]
            user_candidate[user_id] = list(set(self.all_items) - set(movieId_li))
        
        return user_candidate

    def get_encoder_decoder(self):
        user_encoder, user_decoder = {}, {}
        for idx, user_id in enumerate(self.df['userId'].unique()):
            user_encoder[user_id] = idx
            user_decoder[idx] = user_id

        item_encoder, item_decoder = {}, {}
        for idx, item_id in enumerate(self.df['movieId'].unique()):
            item_encoder[item_id] = idx
            item_decoder[idx] = item_id
        
        return user_encoder, user_decoder, item_encoder, item_decoder
    
    def get_data(self, train : bool = True):
        if train: return self.train_sequence_user, self.train_sequence_L, self.train_sequence_T
        else: return self.test_sequence_user, self.test_sequence_L, self.test_sequence_T

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CustomDataset(Dataset):
    def __init__(self, sequence_user : list, sequence_L : list, sequence_T : list):
        self.sequence_user = sequence_user
        self.sequence_L = sequence_L
        self.sequence_T = sequence_T

    def __len__(self):
        return len(self.sequence_user)

    def __getitem__(self, idx):
        sequence_user = self.sequence_user[idx]
        sequence_L = torch.tensor(self.sequence_L[idx])
        sequence_T = torch.tensor(self.sequence_T[idx])

        return sequence_user, sequence_L, sequence_T

# 모델

In [6]:
class Caser(nn.Module):
    def __init__(self, config, num_users : int, num_items : int):
        super(Caser, self).__init__()
        self.config = config
        self.d = self.config.d # 임베딩 차원
        self.nv = self.config.nv # vertical conv layer의 필터의 수
        self.nh = self.config.nh # horizontal conv layer의 필터의 수
        self.drop_ratio = self.config.drop_ratio # dropout 비율


        self.P = nn.Embedding(num_users, self.d) # user Embedding
        self.Q = nn.Embedding(num_items, self.d) # item Embedding

        # vertical conv layer
        self.conv_v = nn.Conv2d(in_channels = 1, out_channels = self.nv, kernel_size = (self.config.L, 1))

        # horizontal conv layer
        lengths = [i + 1 for i in range(self.config.L)]
        self.conv_h = nn.ModuleList([nn.Conv2d(in_channels = 1, out_channels = self.nh, kernel_size = (i, self.d)) for i in lengths])

        # convolutional sequence embedding
        self.conv_v_d = self.nv * self.d
        self.conv_h_d = self.nh * self.config.L
        self.fc = nn.Sequential(
            nn.Linear(self.conv_v_d + self.conv_h_d, self.d),
            nn.ReLU(),
        )

        # output
        self.Q_prime = nn.Embedding(num_items, self.d * 2)
        self.b = nn.Embedding(num_items, 1)

        # dropout
        self.dropout = nn.Dropout(self.drop_ratio)

    def weight_initialization(self):
        self.P.weight.data.normal_(0, 1.0 / self.P.embedding_dim)
        self.Q.weight.data.normal_(0, 1.0 / self.Q.embedding_dim)
        self.Q_prime.weight.data.normal_(0, 1.0 / self.Q_prime.embedding_dim)
        self.b.weight.data.zero_()

    def forward(self, user, sequence_L, sequence_T, pred = False):
        
        #### Embedding Look-up
        user_emb = self.P(user)
        item_emb = self.Q(sequence_L).unsqueeze(1)

        #### Convolutional Layers
        # vertical conv layer
        out_v = self.conv_v(item_emb).view(-1, self.conv_v_d)

        # horizontal conv layer
        out_h = []
        for conv in self.conv_h:
            out = conv(item_emb).squeeze(3)
            out = F.max_pool1d(out, out.shape[2]).squeeze(2)
            out_h.append(out)
        out_h = torch.cat(out_h, 1)

        # convolutional sequence embedding
        out = torch.cat([out_v, out_h], 1)
        out = self.dropout(out)
        z = self.fc(out)

        ### output
        x = torch.cat([z, user_emb], 1)
        W = self.Q_prime(sequence_T)
        b = self.b(sequence_T)

        if pred:
            W = W.squeeze()
            b = b.squeeze()
            res = (x * W).sum(1) + b
        else:
            res = torch.baddbmm(b, W, x.unsqueeze(2)).squeeze()

        return res

# 학습 설정

In [7]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, targets_prediction, negatives_prediction):
        positive_loss = -torch.mean(
            torch.log(torch.sigmoid(targets_prediction))
        )

        negative_loss = -torch.mean(
            torch.log(1 - torch.sigmoid(negatives_prediction))
        )

        loss = positive_loss + negative_loss
        return loss

In [8]:
def get_recall(target_list, pred_list, k):
    cnt = 0
    for t in target_list:
        if t in pred_list[:k]:
            cnt += 1
    score = cnt / len(target_list)
    return score

def get_precision(target_list, pred_list, k):
    cnt = 0
    for t in target_list:
        if t in pred_list[:k]:
            cnt += 1
    score = cnt / len(pred_list[:k])
    return score

def get_ap(target_list, pred_list, k):
    score = 0
    for i in range(1, k + 1):
        score += get_precision(target_list, pred_list, k = i)
    score /= k
    return score

In [9]:
def train(model, data_loader, criterion, optimizer):
    model.train()
    loss_val = 0

    for i in data_loader:
        sequence_user, sequence_L, sequence_T = i
        neg_sequence_T = torch.tensor(dataset.get_neg_samples(n = config.neg_samples, user_id_li = sequence_user.numpy()))
        b_size = sequence_user.shape[0]

        sequence_user = torch.cat([sequence_user, sequence_user], axis = 0).to(config.device)
        sequence_L = torch.cat([sequence_L, sequence_L], axis = 0).to(config.device)
        sequence_T = torch.cat([sequence_T, neg_sequence_T], axis = 0).to(config.device)

        optimizer.zero_grad()
        output = model(sequence_user, sequence_L, sequence_T)

        targets_prediction, negatives_prediction = torch.split(output, b_size)

        loss = criterion(targets_prediction, negatives_prediction)

        loss.backward()
        optimizer.step()

        loss_val += loss.item()
    
    loss_val = loss_val / len(data_loader)

    return loss_val

def evaluate(model, data_loader):
    model.eval()

    recall = 0
    precision = 0
    ap = 0

    with torch.no_grad():
        for i in data_loader:
            sequence_user, sequence_L, sequence_T = i

            sequence_user = sequence_user.to(config.device)
            sequence_L = sequence_L.to(config.device)
            item_list = torch.tensor(dataset.all_items).to(config.device)

            pred_list = model(sequence_user, sequence_L, item_list, pred = True).argsort(descending = True).cpu().numpy()
            target_list = sequence_T[0].cpu().numpy()

            recall += get_recall(target_list, pred_list, k = config.k)
            precision += get_precision(target_list, pred_list, k = config.k)
            ap += get_ap(target_list, pred_list, k = config.k)

    recall = recall / len(data_loader)
    precision = precision / len(data_loader)
    ap = ap / len(data_loader)

    return recall, precision, ap

# 학습

In [10]:
dataset = MakeDataset(config = config, df = ratings_df)

In [11]:
train_sequence_user, train_sequence_L, train_sequence_T = dataset.get_data(train = True)
train_dataset = CustomDataset(sequence_user = train_sequence_user, sequence_L = train_sequence_L, sequence_T = train_sequence_T)
train_loader = DataLoader(train_dataset, batch_size = config.batch_size, shuffle = True, drop_last = False)

test_sequence_user, test_sequence_L, test_sequence_T = dataset.get_data(train = False)
test_dataset = CustomDataset(sequence_user = test_sequence_user, sequence_L = test_sequence_L, sequence_T = test_sequence_T)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle = False, drop_last = False)

In [12]:
model = Caser(config = config, num_users = dataset.num_users, num_items = dataset.num_items).to(config.device)
optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)
criterion = CustomLoss()

In [13]:
# 초기화를 해주지 않으면 기울기가 망가져서 모델 학습이 안됨
model.weight_initialization()

best_metric = 0

for epoch in range(1, config.epochs + 1):
    train_loss = train(model = model, data_loader = train_loader, criterion = criterion, optimizer = optimizer)
    recall, precision, ap = evaluate(model = model, data_loader = test_loader)

    print(f"[EPOCH: {epoch}], Train Loss: {train_loss:.4f}, Recall@{config.k}: {recall:.4f}, Precision@{config.k}: {precision:.4f}, MAP@{config.k}: {ap:.4f},")

    if best_metric < ap:
        best_metric = ap
        torch.save(model.state_dict(), model_dir + f'Caser.pt')

[EPOCH: 1], Train Loss: 0.9651, Recall@30: 0.0581, Precision@30: 0.0058, MAP@30: 0.0062,
[EPOCH: 2], Train Loss: 0.7882, Recall@30: 0.0631, Precision@30: 0.0063, MAP@30: 0.0071,
[EPOCH: 3], Train Loss: 0.6682, Recall@30: 0.0631, Precision@30: 0.0063, MAP@30: 0.0074,
[EPOCH: 4], Train Loss: 0.5769, Recall@30: 0.0720, Precision@30: 0.0072, MAP@30: 0.0074,
[EPOCH: 5], Train Loss: 0.5096, Recall@30: 0.0765, Precision@30: 0.0077, MAP@30: 0.0074,
[EPOCH: 6], Train Loss: 0.4602, Recall@30: 0.0725, Precision@30: 0.0073, MAP@30: 0.0076,
[EPOCH: 7], Train Loss: 0.4153, Recall@30: 0.0735, Precision@30: 0.0074, MAP@30: 0.0074,
[EPOCH: 8], Train Loss: 0.3818, Recall@30: 0.0914, Precision@30: 0.0091, MAP@30: 0.0082,
[EPOCH: 9], Train Loss: 0.3537, Recall@30: 0.0720, Precision@30: 0.0072, MAP@30: 0.0074,
[EPOCH: 10], Train Loss: 0.3300, Recall@30: 0.0805, Precision@30: 0.0080, MAP@30: 0.0088,
[EPOCH: 11], Train Loss: 0.3073, Recall@30: 0.0775, Precision@30: 0.0077, MAP@30: 0.0075,
[EPOCH: 12], Train 

데이터 셋이 작아서 모델의 성능이 좋은 편은 아님