# Factorization Machine 실습코드

Factorization Machine이 어떻게 이뤄지는지 실제 실습 코드를 바탕으로 구체적으로 이해해보려고 한다.

In [6]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [2]:
ratings = pd.read_csv('./ratings.csv')

간단한 실습을 위해, movie-lens 가장 최신의 100k를 활용해보고자 한다.

## Example. Pytorch를 활용하는 방법

In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


Pytorch 활용을 위해서는 custom dataset을 만드는 것이 필요할 것이다.
10만개 정도의 데이터 셋 중 80%는 train set, 10%는 valid set, 10%는 test set으로 활용한다.

In [9]:
class MovieLens100kDataset(torch.utils.data.Dataset):
    def __init__(self):
        data = ratings.to_numpy()[:,:3]
        self.items = data[:,:2]
        self.targets = data[:,2]
        self.field_dims = np.max(self.items, axis = 0)+1
        self.user_field_idx = np.array((0,), dtype = np.long)
        self.item_field_idx = np.array((1,), dtype = np.long)
    
    def __len__(self):
        return self.targets.shape[0]
    
    def __getitem__(self, index):
        return self.items[index], self.targets[index]

In [11]:
dataset = MovieLens100kDataset()
train_len = int(len(dataset) * 0.8)
valid_len = int(len(dataset) * 0.1)
test_len = len(dataset) - train_len - valid_len
train_set, valid_set, test_set = torch.utils.data.random_split(dataset, (train_len, valid_len, test_len))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.user_field_idx = np.array((0,), dtype = np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.item_field_idx = np.array((1,), dtype = np.long)


In [14]:
from torch.utils.data import DataLoader

In [15]:
batch_size = 32
train_loader = DataLoader(train_set, batch_size = batch_size)
valid_loader = DataLoader(valid_set, batch_size = batch_size)
test_loader = DataLoader(test_set, batch_size = batch_size)

### Model 구현

model 구현은 다음을 바탕으로 진행한다.
https://github.com/rixwew/pytorch-fm

In [21]:
class FeaturesLinear(torch.nn.Module):
    def __init__(self, field_dims, output_dim = 1):
        super().__init__()
        # 입력 차원 임베딩.
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        # 출력시의 편향 학습
        self.bias = torch.nn.Parameter(torch.zeros(output_dim,))
        # offset의, 역할은 무엇인가?
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype = np.long)
        
    def forward(self, x):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim = 1) + self.bias

In [22]:
class FeaturesEmbedding(torch.nn.Module):
    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype = np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)
    
    def forward(self, x):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)

In [23]:
class FactorizationMachine(torch.nn.Module):
    def __init__(self, reduce_sum = True):
        super().__init__()
        self.reduce_sum = reduce_sum
        
    def forward(self, x):
        square_of_sum = torch.sum(x, dim = 1) ** 2
        sum_of_square = torch.sum(x ** 2, dim = 1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim = 1, keepim = True)
        return 0.5 * ix

In [24]:
class FactorizationMachineModel(torch.nn.Module):
    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.linear = FeaturesLinear(field_dims)
        self.fm = FactorizaionMachine(reduce_sum = True)
        
    def forward(self, x):
        x = self.linear(x) + self.fm(self.embedding(x))
        return torch.sigmoid(x.squeeze(1))

In [None]:
def train(model, optimizer, data_loader, criterion, device, log_interval=100):
    model.train()
    total_loss = 0
    tk0 = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
    for i, (fields, target) in enumerate(tk0):
        fields, target = fields.to(device), target.to(device)
        y = model(fields)
        loss = criterion(y, target.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % log_interval == 0:
            tk0.set_postfix(loss=total_loss / log_interval)
            total_loss = 0

In [None]:
def test(model, data_loader, device):
    model.eval()
    targets, predicts = list(), list()
    with torch.no_grad():
        for fields, target in tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0):
            fields, target = fields.to(device), target.to(device)
            y = model(fields)
            targets.extend(target.tolist())
            predicts.extend(y.tolist())
    return roc_auc_score(targets, predicts)


In [None]:
model = get_model(model_name, dataset).to(device)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    early_stopper = EarlyStopper(num_trials=2, save_path=f'{save_dir}/{model_name}.pt')
    for epoch_i in range(epoch):
        train(model, optimizer, train_data_loader, criterion, device)
        auc = test(model, valid_data_loader, device)
        print('epoch:', epoch_i, 'validation: auc:', auc)
        if not early_stopper.is_continuable(model, auc):
            print(f'validation: best auc: {early_stopper.best_accuracy}')
            break
    auc = test(model, test_data_loader, device)
    print(f'test auc: {auc}')

## Example. xlearn을 활용하는 방법

## Example. FastFM을 활용하는 방법

## 더 나아가는 방법
Factorization Machine 이상의 방법론에 대해 생각해 볼 수 있을것이다.
DeepFM 등의 방법론도 존재하고, Factorization Machine을 개선한 모델에 대해서는 차차 알아가고자 한다.