<a href="https://colab.research.google.com/github/SeongBeomLEE/RecsysTutorial/blob/main/NCF/NCF_for_implicit_feedback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-box

Collecting python-box
  Downloading python_box-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 6.4 MB/s 
[?25hInstalling collected packages: python-box
Successfully installed python-box-6.0.1


In [None]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import random
from datetime import datetime
from time import time
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')

# 1. 학습 설정

In [None]:
# Paths and hyperparameters shared by every experiment below, wrapped in a Box
# so that entries can be read as attributes (e.g. config.lr).
config = Box(
    {
        # --- locations ---
        'data_path': "/content/drive/MyDrive/RecsysTutorial/Data/MovieLens",  # data directory
        'model_path': "/content/drive/MyDrive/RecsysTutorial/model",  # where checkpoints are saved
        'model_name': 'GMF.pt',

        # --- optimisation ---
        'num_epochs': 15,
        'lr': 0.005,
        'batch_size': 1024,

        # --- model size ---
        "num_factor": 512,
        "num_layers": 3,
        "dropout": 0.2,

        # --- evaluation / reproducibility ---
        'valid_samples': 10,  # number of samples used for validation
        'seed': 22,
    }
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Create the checkpoint directory together with any missing parent directories;
# exist_ok makes this a no-op when the directory is already there.
os.makedirs(config.model_path, exist_ok=True)

# 2. 데이터 전처리

In [None]:
class MakeCFDataSet():
    """
    Build collaborative-filtering train / validation data from ``ratings.csv``.

    Raw ``userId`` / ``movieId`` values are re-indexed to contiguous integers
    starting at 0, and every user's interactions are split into a training list
    and a randomly held-out validation list.
    """
    def __init__(self, config):
        """
        Args:
            config: object exposing ``data_path``, ``valid_samples`` and ``seed``
                as attributes.
        """
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'ratings.csv'))

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('movieId')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userId')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        # Vectorised lookup of the contiguous indices (every raw id is in its encoder).
        self.df['item_idx'] = self.df['movieId'].map(self.item_encoder)
        self.df['user_idx'] = self.df['userId'].map(self.user_encoder)

        self.exist_users = list(range(self.num_user))
        self.exist_items = list(range(self.num_item))
        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> tuple:
        """
        Build the raw-id -> index and index -> raw-id mappings for one column.

        Indices are assigned in order of first appearance in ``self.df[col]``.

        Args:
            col (str): name of the column to encode
        Returns:
            tuple: ``(encoder, decoder)`` dictionaries
        """
        ids = self.df[col].unique()

        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = {idx: _id for idx, _id in enumerate(ids)}

        return encoder, decoder

    def generate_sequence_data(self) -> tuple:
        """
        Split every user's interacted items into train and validation lists.

        ``config.valid_samples`` items per user are drawn without replacement for
        validation; the remaining distinct items form the training list.

        Returns:
            tuple: ``(user_train, user_valid)`` dictionaries keyed by user index
        Raises:
            ValueError: raised by ``np.random.choice`` when a user has fewer than
                ``config.valid_samples`` interactions.
        """
        users = defaultdict(list)
        for user, item in zip(self.df['user_idx'], self.df['item_idx']):
            users[user].append(item)

        user_train = {}
        user_valid = {}
        for user, user_total in users.items():
            # Re-seeding for each user makes that user's split depend only on the
            # seed and on the user's own item list.
            np.random.seed(self.config.seed)

            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples items per user are used for validation

        return user_train, user_valid

    def neg_sampling(self, users, neg_sampling_cnt = 3):
        """
        Draw negative items for each user in ``users``.

        A negative is any item that is not in the user's training list.

        Args:
            users: iterable of user indices (repeats allowed)
            neg_sampling_cnt (int): negatives drawn per entry of ``users``
        Returns:
            tuple: ``(_users, neg_items)`` — two aligned lists in which every
            input user is repeated ``neg_sampling_cnt`` times consecutively.
        """
        # Built once per call; the original rebuilt this set for every user.
        all_items = set(self.exist_items)

        _users, neg_items = [], []
        for user in users:
            candidates = list(all_items - set(self.user_train[user]))
            neg_items += random.sample(candidates, neg_sampling_cnt)
            _users += [user] * neg_sampling_cnt

        return _users, neg_items

    def get_train_valid_data(self):
        """Return the ``(user_train, user_valid)`` dictionaries built in ``__init__``."""
        return self.user_train, self.user_valid

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CFDataset(Dataset):
    """Flat dataset of positive ``(user, item)`` pairs built from a per-user item mapping."""

    def __init__(self, user_train):
        # Two aligned lists: users[i] interacted with items[i].
        self.users = []
        self.items = []
        for user, interacted in user_train.items():
            self.users.extend([user] * len(interacted))
            self.items.extend(interacted)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx]

# 3. 모델

## 3-1. GMF

In [None]:
class GMF(nn.Module):
    """
    Generalized Matrix Factorization.

    Scores a (user, item) pair with a bias-free linear projection of the
    element-wise product of the two embeddings.
    """

    def __init__(self, num_user, num_item, num_factor):
        super().__init__()
        self.user_emb = nn.Embedding(num_embeddings = num_user, embedding_dim = num_factor)
        self.item_emb = nn.Embedding(num_embeddings = num_item, embedding_dim = num_factor)

        # Wrapped in a Sequential so the weight is registered as "predict_layer.0.weight".
        projection = nn.Linear(in_features = num_factor, out_features = 1, bias = False)
        self.predict_layer = nn.Sequential(projection)

        self._init_weight_()

    def _init_weight_(self):
        """Small-normal init for both embeddings, Kaiming-uniform for the projection."""
        for embedding in (self.user_emb, self.item_emb):
            nn.init.normal_(embedding.weight, std=0.01)

        linear_layers = [layer for layer in self.predict_layer if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.kaiming_uniform_(layer.weight, a=1)

    def forward(self, user, item):
        """Return one score per (user, item) pair as a 1-D tensor."""
        interaction = self.user_emb(user) * self.item_emb(item)
        scores = self.predict_layer(interaction)
        return scores.view(-1)

## 3-2. MLP

In [None]:
class MLP(nn.Module):
    """
    Multi-layer perceptron scorer.

    The user and item embeddings are concatenated and passed through
    ``num_layers`` blocks of Dropout -> Linear -> ReLU, each Linear halving the
    width, followed by a bias-free linear projection to a single score.
    """

    def __init__(self, num_user, num_item, num_factor, num_layers, dropout):
        super().__init__()
        self.dropout = dropout
        self.user_emb = nn.Embedding(num_user, num_factor)
        self.item_emb = nn.Embedding(num_item, num_factor)

        # The tower input is the concatenation of both embeddings.
        width = 2 * num_factor
        tower = []
        for _ in range(num_layers):
            half = width // 2
            tower.extend([nn.Dropout(p = self.dropout), nn.Linear(width, half), nn.ReLU()])
            width = half
        self.MLP_layers = nn.Sequential(*tower)

        self.predict_layer = nn.Sequential(nn.Linear(width, 1, bias = False))

        self._init_weight_()

    def _init_weight_(self):
        """Small-normal embeddings, Xavier-uniform tower, Kaiming-uniform projection."""
        for embedding in (self.user_emb, self.item_emb):
            nn.init.normal_(embedding.weight, std=0.01)

        for layer in self.MLP_layers:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)

        for layer in self.predict_layer:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.kaiming_uniform_(layer.weight, a=1)

    def forward(self, user, item):
        """Return one score per (user, item) pair as a 1-D tensor."""
        pair = torch.cat((self.user_emb(user), self.item_emb(item)), -1)
        hidden = self.MLP_layers(pair)
        return self.predict_layer(hidden).view(-1)

## 3-3. NeuMF

In [None]:
class NeuMF(nn.Module):
    """
    Neural Matrix Factorization: fuses a GMF branch and an MLP branch.

    The embeddings of both branches and the MLP tower are taken (shared, not
    copied) from the given models; only the final projection is created here.
    """
    def __init__(self, GMF, MLP, num_factor):
        """
        Args:
            GMF: model exposing ``user_emb`` and ``item_emb`` embeddings.
            MLP: model exposing ``user_emb``, ``item_emb`` and ``MLP_layers``.
            num_factor (int): width of the GMF branch output, i.e. the GMF
                embedding dimension.
        """
        super(NeuMF, self).__init__()
        self.gmf_user_emb = GMF.user_emb
        self.gmf_item_emb = GMF.item_emb

        self.mlp_user_emb = MLP.user_emb
        self.mlp_item_emb = MLP.item_emb

        self.mlp_layer = MLP.MLP_layers

        # Width of the MLP branch output. Without any Linear layer the tower
        # passes the concatenated embeddings through unchanged, so start from
        # that width (previously `out_features` was left undefined in that case)
        # and let the last Linear layer override it.
        out_features = self.mlp_user_emb.embedding_dim + self.mlp_item_emb.embedding_dim
        for layer in self.mlp_layer:
            if isinstance(layer, nn.Linear):
                out_features = layer.out_features

        self.predict_layer = nn.Sequential(
            nn.Linear(num_factor + out_features, 1, bias = False),
        )

        self._init_weight_()

    def _init_weight_(self):
        """Kaiming-uniform init for the fusion projection only; branches keep their weights."""
        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1)

    def forward(self, user, item):
        """Return one score per (user, item) pair as a 1-D tensor."""
        # GMF branch: element-wise product of the embeddings.
        gmf_user_emb = self.gmf_user_emb(user)
        gmf_item_emb = self.gmf_item_emb(item)
        gmf_output = gmf_user_emb * gmf_item_emb

        # MLP branch: concatenated embeddings through the tower.
        mlp_user_emb = self.mlp_user_emb(user)
        mlp_item_emb = self.mlp_item_emb(item)
        mlp_cat_emb = torch.cat((mlp_user_emb, mlp_item_emb), -1)
        mlp_output = self.mlp_layer(mlp_cat_emb)

        cat_output = torch.cat((gmf_output, mlp_output), -1)

        output = self.predict_layer(cat_output)

        return output.view(-1)

## 3-4. BPR Loss

In [None]:
class BPR_Loss(nn.Module):
    """
    Bayesian Personalized Ranking loss: ``-mean(log(sigmoid(pos - neg)))``.
    """
    def __init__(self):
        super(BPR_Loss, self).__init__()

    def forward(self, pos, neg):
        """
        Args:
            pos: scores of the positive items.
            neg: scores of the negative items, aligned element-wise with ``pos``.
        Returns:
            Scalar tensor holding the mean loss.
        """
        # logsigmoid is the numerically stable form of log(sigmoid(x)): the
        # two-step version underflows to log(0) = -inf for very negative x.
        bpr_loss = -torch.mean(F.logsigmoid(pos - neg))
        return bpr_loss

# 4. 학습 함수

In [None]:
def train(model, data_loader, criterion, optimizer, make_cf_data_set):
    """
    Run one training epoch with freshly sampled negatives for every batch.

    Args:
        model: scorer called as ``model(users, items)`` returning one score per pair.
        data_loader: iterable of ``(users, items)`` CPU tensors of positive pairs.
        criterion: pairwise loss called as ``criterion(pos_scores, neg_scores)``.
        optimizer: optimizer over the model's parameters.
        make_cf_data_set: object whose ``neg_sampling(users)`` returns
            ``(neg_users, neg_items)`` with the same number of negatives for
            every input user, grouped consecutively per user.
    Returns:
        float: mean loss over the batches (0.0 when the loader yields no batch).
    """
    model.train()

    # Put every batch on the device the model actually lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_val = 0.0
    num_batches = 0

    for users, items in data_loader:
        neg_users, neg_items = make_cf_data_set.neg_sampling(users.numpy().tolist())
        # Number of negatives per positive, taken from the sampler output
        # instead of being hard-coded.
        negs_per_pos = len(neg_users) // len(users)

        all_users = torch.cat([users, torch.tensor(neg_users)]).to(target_device)
        all_items = torch.cat([items, torch.tensor(neg_items)]).to(target_device)

        optimizer.zero_grad()

        # Score positives and negatives in a single forward pass.
        output = model(all_users, all_items)
        pos_output, neg_output = torch.split(output, [len(users), len(neg_users)])
        # Repeat each positive score so it lines up with its own negatives:
        # [p0, p0, ..., p1, p1, ...].
        pos_output = pos_output.repeat_interleave(negs_per_pos)
        loss = criterion(pos_output, neg_output)

        loss.backward()
        optimizer.step()

        loss_val += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0

    return loss_val / num_batches

def get_ndcg(pred_list, true_list):
    """
    Normalised discounted cumulative gain with binary relevance.

    Args:
        pred_list: predicted items ordered from best (rank 0) to worst.
        true_list: ground-truth relevant items.
    Returns:
        float: DCG of ``pred_list`` divided by the DCG of an ideal ranking of
        the same length, so the value lies in [0, 1]; 0.0 when either list is
        empty.
    """
    relevant = set(true_list)  # O(1) membership tests
    if not relevant or len(pred_list) == 0:
        return 0.0

    dcg = 0.0
    for rank, pred in enumerate(pred_list):
        if pred in relevant:
            dcg += 1 / np.log2(rank + 2)

    # Ideal ranking: every slot that could hold a relevant item does, from the top.
    ideal_hits = min(len(relevant), len(pred_list))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))

    return float(dcg / idcg)

# Identical to recall, the competition metric.
def get_hit(pred_list, true_list):
    """
    Fraction of the ground-truth items that appear among the predictions.

    Args:
        pred_list: predicted items.
        true_list: ground-truth relevant items.
    Returns:
        float: ``|pred ∩ true| / len(true_list)``; 0.0 when ``true_list`` is
        empty (instead of dividing by zero).
    """
    if len(true_list) == 0:
        return 0.0
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, user_train, user_valid, make_cf_data_set, top_k = 10):
    """
    Rank all items for every user and average NDCG@K / HIT@K over the users.

    Args:
        model: scorer called as ``model(users, items)`` returning one score per pair.
        user_train: mapping user -> items seen in training (excluded from ranking).
        user_valid: mapping user -> held-out items used as ground truth.
        make_cf_data_set: object exposing ``exist_users`` and ``exist_items``.
        top_k (int): length of the recommendation list (default 10).
    Returns:
        tuple: ``(NDCG, HIT)`` averaged over ``exist_users``; ``(0.0, 0.0)`` when
        there are no users.
    """
    model.eval()

    NDCG = 0.0 # NDCG@K
    HIT = 0.0 # HIT@K

    all_users = make_cf_data_set.exist_users
    all_items = make_cf_data_set.exist_items
    if len(all_users) == 0:
        return NDCG, HIT

    # Score on the device the model actually lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # The candidate item tensor is identical for every user, so build it once.
    items = torch.tensor(all_items).to(target_device)
    k = min(top_k, len(all_items))

    with torch.no_grad():
        for user in all_users:
            users = torch.full_like(items, user)

            output = model(users, items)
            output = output.softmax(dim = 0)
            # Softmax outputs are >= 0, so -1 pushes training items below every candidate.
            output[user_train[user]] = -1.

            uv = user_valid[user]
            # topk returns indices ordered best-first, which is the order the
            # rank-discounted metric expects (an ascending argsort slice would
            # put the best item at the last rank).
            up = torch.topk(output, k).indices.cpu().numpy().tolist()

            NDCG += get_ndcg(pred_list = up, true_list = uv)
            HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(all_users)
    HIT /= len(all_users)

    return NDCG, HIT

# 5. 학습

In [None]:
# Read the ratings, build the id encoders and split each user's items into
# training and validation lists.
make_cf_data_set = MakeCFDataSet(config)
(user_train, user_valid) = make_cf_data_set.get_train_valid_data()

In [None]:
# Shuffled mini-batches of positive (user, item) pairs; the last partial batch is kept.
cf_dataset = CFDataset(user_train)
data_loader = DataLoader(
    dataset = cf_dataset,
    batch_size = config.batch_size,
    shuffle = True,
    drop_last = False,
)

## 5-1. GMF

In [None]:
# GMF model sized from the encoded dataset, trained with Adam on the BPR objective.
model = GMF(
    make_cf_data_set.num_user,
    make_cf_data_set.num_item,
    config.num_factor,
)
model = model.to(device)

criterion = BPR_Loss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = config.lr)

In [None]:
# Train GMF, evaluating after every epoch and checkpointing on a new best HIT@10.
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    # A single-step tqdm bar is used so the epoch summary is shown as its description.
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(model, data_loader, criterion, optimizer, make_cf_data_set)

        ndcg, hit = evaluate(
            model = model,
            user_train = user_train,
            user_valid = user_valid,
            make_cf_data_set = make_cf_data_set,
        )

        # Keep only the weights of the best epoch seen so far.
        if hit > best_hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 0.48790| NDCG@10: 0.27005| HIT@10: 0.06647: 100%|██████████| 1/1 [01:01<00:00, 61.58s/it]
Epoch:   2| Train loss: 0.19521| NDCG@10: 0.40536| HIT@10: 0.09866: 100%|██████████| 1/1 [00:58<00:00, 58.91s/it]
Epoch:   3| Train loss: 0.08507| NDCG@10: 0.43686| HIT@10: 0.10224: 100%|██████████| 1/1 [00:51<00:00, 51.90s/it]
Epoch:   4| Train loss: 0.04422| NDCG@10: 0.44783| HIT@10: 0.10671: 100%|██████████| 1/1 [00:51<00:00, 51.17s/it]
Epoch:   5| Train loss: 0.02993| NDCG@10: 0.44954| HIT@10: 0.10715: 100%|██████████| 1/1 [00:50<00:00, 50.22s/it]
Epoch:   6| Train loss: 0.02163| NDCG@10: 0.43496| HIT@10: 0.10522: 100%|██████████| 1/1 [00:51<00:00, 51.97s/it]
Epoch:   7| Train loss: 0.01670| NDCG@10: 0.47004| HIT@10: 0.10999: 100%|██████████| 1/1 [00:50<00:00, 50.55s/it]
Epoch:   8| Train loss: 0.01382| NDCG@10: 0.44355| HIT@10: 0.10805: 100%|██████████| 1/1 [00:50<00:00, 50.31s/it]
Epoch:   9| Train loss: 0.01136| NDCG@10: 0.46517| HIT@10: 0.10835: 100%|██████████| 1/1

KeyboardInterrupt: ignored

## 5-2. MLP

In [None]:
# MLP model sized from the encoded dataset, trained with Adam on the BPR objective.
model = MLP(
    make_cf_data_set.num_user,
    make_cf_data_set.num_item,
    config.num_factor,
    config.num_layers,
    config.dropout,
)
model = model.to(device)

criterion = BPR_Loss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = config.lr)

In [None]:
# Train the MLP, evaluating after every epoch and checkpointing on a new best HIT@10.
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    # A single-step tqdm bar is used so the epoch summary is shown as its description.
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(model, data_loader, criterion, optimizer, make_cf_data_set)

        ndcg, hit = evaluate(
            model = model,
            user_train = user_train,
            user_valid = user_valid,
            make_cf_data_set = make_cf_data_set,
        )

        # Keep only the weights of the best epoch seen so far.
        if hit > best_hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, "MLP.pt"))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 0.36789| NDCG@10: 0.24865| HIT@10: 0.06036: 100%|██████████| 1/1 [01:01<00:00, 61.69s/it]
Epoch:   2| Train loss: 0.30033| NDCG@10: 0.24330| HIT@10: 0.05618: 100%|██████████| 1/1 [01:00<00:00, 60.61s/it]
Epoch:   3| Train loss: 0.26206| NDCG@10: 0.25007| HIT@10: 0.05961: 100%|██████████| 1/1 [01:00<00:00, 60.55s/it]
Epoch:   4| Train loss: 0.21405| NDCG@10: 0.31328| HIT@10: 0.07303: 100%|██████████| 1/1 [01:00<00:00, 60.25s/it]
Epoch:   5| Train loss: 0.18037| NDCG@10: 0.34407| HIT@10: 0.07973: 100%|██████████| 1/1 [01:00<00:00, 60.53s/it]
Epoch:   6| Train loss: 0.15760| NDCG@10: 0.38236| HIT@10: 0.09031: 100%|██████████| 1/1 [01:01<00:00, 61.77s/it]
Epoch:   7| Train loss: 0.14214| NDCG@10: 0.40384| HIT@10: 0.09463: 100%|██████████| 1/1 [00:59<00:00, 59.72s/it]
Epoch:   8| Train loss: 0.13183| NDCG@10: 0.37171| HIT@10: 0.09046: 100%|██████████| 1/1 [00:59<00:00, 59.38s/it]
Epoch:   9| Train loss: 0.12116| NDCG@10: 0.36791| HIT@10: 0.08852: 100%|██████████| 1/1

## 5-3. NeuMF

In [None]:
# Rebuild both branches and restore their best checkpoints. map_location lets
# weights saved on a GPU be loaded on whichever device is in use now.
gmf = GMF(
    num_user = make_cf_data_set.num_user, 
    num_item = make_cf_data_set.num_item, 
    num_factor = config.num_factor).to(device)

# The GMF checkpoint is written under config.model_name, so read it back from there.
gmf.load_state_dict(torch.load(os.path.join(config.model_path, config.model_name), map_location = device))

mlp = MLP(
    num_user = make_cf_data_set.num_user, 
    num_item = make_cf_data_set.num_item,
    num_factor = config.num_factor,
    num_layers = config.num_layers,
    dropout = config.dropout,).to(device)

mlp.load_state_dict(torch.load(os.path.join(config.model_path, 'MLP.pt'), map_location = device))

# NeuMF reuses the pretrained embeddings and MLP tower; only its final
# projection starts from a fresh initialisation.
model = NeuMF(
    GMF = gmf, 
    MLP = mlp, 
    num_factor = config.num_factor).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr = config.lr, momentum = 0.9)
criterion = BPR_Loss()

In [None]:
# Train NeuMF, evaluating after every epoch and checkpointing on a new best HIT@10.
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    # A single-step tqdm bar is used so the epoch summary is shown as its description.
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(model, data_loader, criterion, optimizer, make_cf_data_set)

        ndcg, hit = evaluate(
            model = model,
            user_train = user_train,
            user_valid = user_valid,
            make_cf_data_set = make_cf_data_set,
        )

        # Keep only the weights of the best epoch seen so far.
        if hit > best_hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, "NMF.pt"))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 0.15745| NDCG@10: 0.36547| HIT@10: 0.08495: 100%|██████████| 1/1 [01:02<00:00, 62.08s/it]
Epoch:   2| Train loss: 0.09142| NDCG@10: 0.39837| HIT@10: 0.09568: 100%|██████████| 1/1 [01:01<00:00, 61.00s/it]
Epoch:   3| Train loss: 0.08521| NDCG@10: 0.41296| HIT@10: 0.09806: 100%|██████████| 1/1 [01:03<00:00, 63.78s/it]
Epoch:   4| Train loss: 0.08137| NDCG@10: 0.40306| HIT@10: 0.09732: 100%|██████████| 1/1 [01:02<00:00, 62.35s/it]
Epoch:   5| Train loss: 0.08098| NDCG@10: 0.41669| HIT@10: 0.09896: 100%|██████████| 1/1 [01:00<00:00, 60.96s/it]
Epoch:   6| Train loss: 0.07980| NDCG@10: 0.42041| HIT@10: 0.10060: 100%|██████████| 1/1 [01:01<00:00, 61.50s/it]
Epoch:   7| Train loss: 0.07746| NDCG@10: 0.42372| HIT@10: 0.10104: 100%|██████████| 1/1 [01:02<00:00, 62.46s/it]
Epoch:   8| Train loss: 0.07755| NDCG@10: 0.41844| HIT@10: 0.10134: 100%|██████████| 1/1 [01:10<00:00, 70.55s/it]
Epoch:   9| Train loss: 0.07628| NDCG@10: 0.42664| HIT@10: 0.10194: 100%|██████████| 1/1