<a href="https://colab.research.google.com/github/SeongBeomLEE/RecsysTutorial/blob/main/AutoRec/AutoRec_for_implicit_feedback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-box

Collecting python-box
  Downloading python_box-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
[?25hInstalling collected packages: python-box
Successfully installed python-box-6.0.1


In [None]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Print tensors in scientific notation.
torch.set_printoptions(sci_mode = True)

# 1. 학습 설정

In [None]:
# Hyper-parameters and paths for the whole notebook. The plain dict is wrapped in a
# Box further down so that entries can be read as attributes (config.lr, ...).
config = {
    'data_path' : "/content/drive/MyDrive/RecsysTutorial/Data/MovieLens" , # directory that contains ratings.csv

    'model_path' : "/content/drive/MyDrive/RecsysTutorial/model", # directory where the checkpoint is saved
    'model_name' : 'AutoRec.pt',

    'num_factor': 64,

    'valid_samples' : 10, # number of items held out per user for validation
    'seed' : 22,

    'lr' : 0.005,
    'batch_size' : 128,
    'num_epochs' : 200,
    'num_workers' : 2,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

# Seed PyTorch as well: weight initialisation and DataLoader shuffling draw from
# torch's RNG, so without this every run of the notebook produces different results.
torch.manual_seed(config.seed)

In [None]:
# Create the checkpoint directory together with any missing parent directories.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(config.model_path, exist_ok=True)

# 2. 데이터 전처리

In [None]:
class MakeMatrixDataSet():
    """
    Build the user-item interaction data used to train and evaluate the model.

    Reads ``ratings.csv`` from ``config.data_path``, maps the raw ``movieId`` /
    ``userId`` values to contiguous integer indices, and splits every user's
    interactions into a training part and ``config.valid_samples`` held-out items.
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'ratings.csv'))

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('movieId')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userId')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['movieId'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['userId'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> tuple:
        """
        Build the raw-id <-> index mappings for one column of ``self.df``.

        Args:
            col (str): name of the column whose unique values are indexed
        Returns:
            tuple: ``(encoder, decoder)`` where ``encoder`` maps raw id -> index and
            ``decoder`` maps index -> raw id; indices follow first-appearance order
        """
        ids = self.df[col].unique()

        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = {idx: _id for idx, _id in enumerate(ids)}

        return encoder, decoder

    def generate_sequence_data(self) -> tuple:
        """
        Split every user's interacted items into train and validation parts.

        Returns:
            tuple: ``(user_train, user_valid)``, two dicts keyed by user index.
            ``user_valid[user]`` holds ``config.valid_samples`` items sampled without
            replacement; ``user_train[user]`` holds the user's remaining distinct items.

        Raises:
            ValueError: raised by the sampler when a user has fewer than
            ``config.valid_samples`` interactions.
        """
        users = defaultdict(list)
        for user, item in zip(self.df['user_idx'], self.df['item_idx']):
            users[user].append(item)

        user_train = {}
        user_valid = {}
        for user, user_total in users.items():
            # A private generator seeded per user draws exactly the same samples as
            # re-seeding the global NumPy RNG would, but leaves the global RNG state
            # untouched for the rest of the notebook.
            rng = np.random.RandomState(self.config.seed)

            valid = rng.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # hold out valid_samples items per user (to mirror the target task as closely as possible)

        return user_train, user_valid

    def get_train_valid_data(self):
        """Return the ``(user_train, user_valid)`` dicts built at construction time."""
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        Build a dense 0/1 interaction matrix for a batch of users.

        Args:
            user_list: tensor of user indices; it is iterated along dim 0 and each
                element is converted with ``.item()``
            train (bool): when True only the training items are set to 1; when False
                the training and the validation items are both set to 1
        Returns:
            torch.Tensor: matrix of shape ``(user_list.size(0), num_item)``
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat


In [None]:
class AEDataSet(Dataset):
    """
    Dataset over the user indices ``0 .. num_user - 1``.

    Each sample is a one-element ``LongTensor`` holding a single user index.
    """
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = list(range(num_user))

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx):
        return torch.LongTensor([self.users[idx]])

# 3. 모델

In [None]:
class AutoRec(nn.Module):
    """
    Autoencoder that reconstructs an interaction vector of width ``num``.

    The encoder maps ``num -> num_factor -> num_factor // 2`` and the decoder mirrors
    it back to ``num``; a sigmoid sits between the two linear layers of each half.

    Args:
        num: width of the input and of the reconstructed output
        num_factor: width of the hidden layer (the bottleneck is half of it)
    """
    def __init__(self, num, num_factor):
        super().__init__()
        bottleneck = num_factor // 2

        self.encoder = nn.Sequential(
            nn.Linear(num, num_factor),
            nn.Sigmoid(),
            nn.Linear(num_factor, bottleneck),
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck, num_factor),
            nn.Sigmoid(),
            nn.Linear(num_factor, num),
        )

        self.init_weights()

    def forward(self, mat):
        """Encode ``mat`` and return its reconstruction from the decoder."""
        return self.decoder(self.encoder(mat))

    def init_weights(self):
        """
        Re-initialise every linear layer, encoder first and then decoder.

        Weights are drawn from N(0, 2 / (fan_in + fan_out)) and biases from a normal
        distribution with mean 0 and standard deviation 0.001.
        """
        for stack in (self.encoder, self.decoder):
            for layer in stack:
                if not isinstance(layer, nn.Linear):
                    continue
                # nn.Linear stores its weight as (out_features, in_features).
                fan_out, fan_in = layer.weight.size()
                std = np.sqrt(2.0/(fan_in + fan_out))
                layer.weight.data.normal_(0.0, std)
                layer.bias.data.normal_(0.0, 0.001)

# 4. 학습 함수

In [None]:
def train(model, criterion, optimizer, data_loader, make_matrix_data_set):
    """
    Run one training epoch and return the mean batch loss.

    Args:
        model: network whose output is compared against its own input
        criterion: loss called as ``criterion(reconstruction, input)``
        optimizer: optimizer stepped once per batch
        data_loader: yields batches of user indices
        make_matrix_data_set: object whose ``make_matrix`` turns a batch of user
            indices into the interaction matrix fed to the model
    Returns:
        the summed per-batch loss divided by the number of batches
    """
    model.train()
    running_loss = 0
    for batch_users in data_loader:
        target = make_matrix_data_set.make_matrix(batch_users).to(device)
        reconstruction = model(target)

        optimizer.zero_grad()
        batch_loss = criterion(reconstruction, target)
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(data_loader)

def get_ndcg(pred_list, true_list):
    """
    Normalised discounted cumulative gain of a ranked list with binary relevance.

    Args:
        pred_list: recommended items ordered from best (rank 0) to worst
        true_list: relevant (held-out) items
    Returns:
        DCG divided by the ideal DCG, a value in [0, 1]; 0.0 when either list is empty
    """
    relevant = set(true_list)
    if not relevant or len(pred_list) == 0:
        return 0.0

    dcg = sum(
        1.0 / np.log2(rank + 2)
        for rank, pred in enumerate(pred_list)
        if pred in relevant
    )
    # The ideal ranking puts every relevant item that can fit at the top of the list.
    ideal_hits = min(len(relevant), len(pred_list))
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(ideal_hits))

    return dcg / idcg

# hit == recall == precision only while pred_list and true_list have the same length
def get_hit(pred_list, true_list):
    """
    Fraction of the relevant items that appear in the recommendation list.

    Args:
        pred_list: recommended items
        true_list: relevant (held-out) items
    Returns:
        ``len(set(true_list) & set(pred_list)) / len(true_list)``, or 0.0 when
        ``true_list`` is empty
    """
    if len(true_list) == 0:
        # Nothing to recover, so avoid dividing by zero.
        return 0.0
    hit_list = set(true_list) & set(pred_list)
    return len(hit_list) / len(true_list)

def evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set):
    """
    Score the model's top-10 recommendations against each user's held-out items.

    Args:
        model: network that reconstructs a batch of interaction rows
        data_loader: yields batches of user indices
        user_train: not used by this function; kept so existing callers keep working
        user_valid: dict mapping user index -> list of held-out items
        make_matrix_data_set: object whose ``make_matrix`` builds the training
            interaction matrix for a batch of users
    Returns:
        tuple: ``(NDCG, HIT)``, the per-user ``get_ndcg`` / ``get_hit`` values
        averaged over ``len(data_loader.dataset)`` users
    """
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)

            recon_mat = model(mat)
            # Softmax keeps every score inside [0, 1], so the -1 written over the
            # training items below is guaranteed to rank under every unseen item.
            recon_mat = recon_mat.softmax(dim = 1)
            recon_mat[mat == 1] = -1.

            # topk returns indices ordered from the highest score down, so position 0
            # is the best recommendation, which is the order get_ndcg's rank discount
            # expects. (Slicing the tail of an ascending argsort put the best item last.)
            top_k = min(10, recon_mat.size(1))
            rec_list = recon_mat.topk(top_k, dim = 1).indices

            for user, rec in zip(users, rec_list):
                uv = user_valid[user.item()]
                up = rec.cpu().numpy().tolist()
                NDCG += get_ndcg(pred_list = up, true_list = uv)
                HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(data_loader.dataset)
    HIT /= len(data_loader.dataset)

    return NDCG, HIT

# 5. 학습

In [None]:
# Read the ratings, index users / items and split each user's items into train / validation.
make_matrix_data_set = MakeMatrixDataSet(config)

# Per-user item lists: the training items and the held-out validation items.
user_train, user_valid = make_matrix_data_set.get_train_valid_data()

In [None]:
# One sample per user index; the interaction rows are built later, batch by batch.
ae_dataset = AEDataSet(make_matrix_data_set.num_user)

In [None]:
# Shuffled batches of user indices, shared by the training and the evaluation passes.
data_loader = DataLoader(
    dataset = ae_dataset,
    batch_size = config.batch_size,
    shuffle = True,
    num_workers = config.num_workers,
    pin_memory = True,
)

In [None]:
# The autoencoder reads and reconstructs one interaction row (num_item wide) per user.
model = AutoRec(make_matrix_data_set.num_item, config.num_factor)
model = model.to(device)

# Mean squared reconstruction error, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = config.lr)

In [None]:
# Train for config.num_epochs epochs. After every epoch the model is evaluated and the
# weights are saved whenever HIT@10 beats the best value seen so far.
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    # A one-step progress bar per epoch; its description carries that epoch's metrics.
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(model, criterion, optimizer, data_loader, make_matrix_data_set)
        ndcg, hit = evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set)

        if hit > best_hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 0.01890| NDCG@10: 0.25544| HIT@10: 0.06200: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
Epoch:   2| Train loss: 0.01554| NDCG@10: 0.25896| HIT@10: 0.06230: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
Epoch:   3| Train loss: 0.01389| NDCG@10: 0.24409| HIT@10: 0.05961: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
Epoch:   4| Train loss: 0.01454| NDCG@10: 0.22810| HIT@10: 0.05618: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
Epoch:   5| Train loss: 0.01405| NDCG@10: 0.24902| HIT@10: 0.06036: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
Epoch:   6| Train loss: 0.01336| NDCG@10: 0.24866| HIT@10: 0.06200: 100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
Epoch:   7| Train loss: 0.01218| NDCG@10: 0.25641| HIT@10: 0.06304: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]
Epoch:   8| Train loss: 0.01240| NDCG@10: 0.25384| HIT@10: 0.06274: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
Epoch:   9| Train loss: 0.01292| NDCG@10: 0.27865| HIT@10: 0.06513: 100%|██████████| 1/1