## library

In [1]:
import csv
import os
import pickle
import numpy as np
import pandas as pd
import random
from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.init import normal_
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [2]:
# Single seed shared by every RNG in the notebook (torch is seeded later with the same value).
seed = 42
np.random.seed(seed)
random.seed(seed)

## data

- 데이터 준비

In [3]:
rating_data = "/opt/ml/input/data/train/train_ratings.csv"

# Implicit feedback: every logged (user, item) row counts as a positive (rating = 1.0);
# the 'time' column is not used and is dropped.
raw_rating_df = (
    pd.read_csv(rating_data)
    .drop(columns=['time'])
    .assign(rating=1.0)
)
print("Raw rating df")
print(raw_rating_df)

Raw rating df
           user   item  rating
0            11   4643     1.0
1            11    170     1.0
2            11    531     1.0
3            11    616     1.0
4            11   2140     1.0
...         ...    ...     ...
5154466  138493  44022     1.0
5154467  138493   4958     1.0
5154468  138493  68319     1.0
5154469  138493  40819     1.0
5154470  138493  27311     1.0

[5154471 rows x 3 columns]


In [4]:
# Distinct raw user / item ids, in order of first appearance in the ratings log.
user_ids = raw_rating_df['user'].unique()
movie_ids = raw_rating_df['item'].unique()

# Wide user x item table: cell is the rating where the pair was logged, NaN otherwise.
ratings_matrix = raw_rating_df.pivot(index='user', columns='item', values='rating')

# Disabled one-off builder for the long-format table used for training/inference.
# It enumerates EVERY (user, item) pair and labels it 1 if the pair is in the log, else 0.
#   - user_dict:  user index (position in user_ids)  -> raw user id
#   - movie_dict: item index (position in movie_ids) -> raw item id
# The results are read back from pickle files in the next cell instead of being rebuilt.
# implicit_df = dict()
# implicit_df['user'] = list()
# implicit_df['item'] = list()
# implicit_df['implicit_feedback'] = list()
# user_dict = dict()
# movie_dict = dict()
# for u, user_id in tqdm(enumerate(user_ids)):
#     user_dict[u] = user_id
#     for i, item in enumerate(movie_ids):
#         if i not in movie_dict:
#             movie_dict[i] = item
#         implicit_df['user'].append(u)
#         implicit_df['item'].append(i)
#         if pd.isna(ratings_matrix.loc[user_id, item]):
#             implicit_df['implicit_feedback'].append(0)
#         else:
#             implicit_df['implicit_feedback'].append(1)

# implicit_df = pd.DataFrame(implicit_df)

In [5]:
# --- Save data (one-off; run only after rebuilding the objects in the previous cell) ---
# with open('implicit_df', 'wb') as f:
#     pickle.dump(implicit_df, f)
# with open('user_dict', 'wb') as f:
#     pickle.dump(user_dict, f)
# with open('movie_dict', 'wb') as f:
#     pickle.dump(movie_dict, f)

# --- Load data ---
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files that were written by the save snippet above.
# implicit_df: long-format (user index, item index, implicit_feedback) table.
with open('implicit_df', 'rb') as f:
    implicit_df = pickle.load(f)
# user_dict: user index -> raw user id.
with open('user_dict', 'rb') as f:
    user_dict = pickle.load(f)
# movie_dict: item index -> raw item id.
with open('movie_dict', 'rb') as f:
    movie_dict = pickle.load(f)

In [6]:
# Switch the two index columns to the pandas category dtype (done for efficiency).
for _col in ('user', 'item'):
    implicit_df[_col] = implicit_df[_col].astype("category")

In [7]:
# Pairs that never occur in the ratings log are NaN after the pivot; mark them as 0.
ratings_matrix = ratings_matrix.fillna(0.)
# Preview the first three users.
ratings_matrix.head(3)

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Preview the long-format table: user index, item index, 0/1 implicit feedback label.
implicit_df.head(3)

Unnamed: 0,user,item,implicit_feedback
0,0,0,1
1,0,1,1
2,0,2,1


- 데이터 분리(X, y)

In [8]:
# Features: everything except the label column (here the user index and item index).
train_X = implicit_df.drop(columns=['implicit_feedback'])
# Target: the 0/1 implicit feedback label.
train_y = implicit_df['implicit_feedback']

In [9]:
# Show the first three feature rows followed by the first three labels.
display(train_X.head(3), train_y.head(3))

Unnamed: 0,user,item
0,0,0
1,0,1
2,0,2


0    1
1    1
2    1
Name: implicit_feedback, dtype: int64

- Dataset

In [10]:
# Pair each [user index, item index] row with its label.
dataset = TensorDataset(
    torch.LongTensor(np.array(train_X)),   # integer ids, consumed by nn.Embedding
    torch.FloatTensor(np.array(train_y)),  # float labels, as required by BCELoss
)
# Sanity check: first (features, label) sample.
dataset[0]

(tensor([0, 0]), tensor(1.))

## model

In [12]:
class MLPLayers(nn.Module):
    """
    Stack of fully connected layers.

    Each stage is Dropout -> Linear -> ReLU, in that order; the ReLU is also
    applied after the final Linear layer.

    Args:
        - layers: (List) node counts of the input layer, hidden layers and output layer.
                ex) [5, 4, 3, 2] -> input layer: 5 nodes, output layer: 2 nodes, hidden layers: 4 nodes, 3 nodes
        - dropout: (float) dropout probability
    Shape:
        - Input: (torch.Tensor) input features. Shape: (batch size, # of input nodes)
        - Output: (torch.Tensor) output features. Shape: (batch size, # of output nodes)
    """
    def __init__(self, layers, dropout):
        super().__init__()

        # keep the configuration around as attributes
        self.layers = layers
        self.n_layers = len(self.layers) - 1
        self.dropout = dropout
        self.activation = nn.ReLU()

        # one (Dropout, Linear, ReLU) triple per consecutive pair of layer sizes
        stack = []
        for in_features, out_features in zip(self.layers[:-1], self.layers[1:]):
            stack += [
                nn.Dropout(p=self.dropout),
                nn.Linear(in_features, out_features),
                self.activation,
            ]
        self.mlp_layers = nn.Sequential(*stack)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Linear layers get N(0, 0.01) weights and zero biases; other modules are left alone."""
        if not isinstance(module, nn.Linear):
            return
        normal_(module.weight.data, mean=0, std=0.01)
        if module.bias is not None:
            module.bias.data.fill_(0.0)

    def forward(self, input_feature):
        """Run the features through the whole stack."""
        hidden = self.mlp_layers(input_feature)
        return hidden


class NCF(nn.Module):
    """
    Neural Collaborative Filtering

    The user embedding and the item embedding are concatenated, passed through
    an MLP stack, projected to one value and squashed with a sigmoid.

    Args:
        - n_users: (int) total number of users
        - n_items: (int) total number of items
        - emb_dim: (int) embedding dimension
        - layers: (List) node counts of the Neural CF layers.
                ex) [5, 4, 3, 2] -> hidden layers: 5 nodes, 4 nodes, 3 nodes, 2 nodes
        - dropout: (float) dropout probability
        - pretrained: (str) path of a pickled object exposing a ``weight`` tensor that is
                indexed by raw item id; when given, it initialises the item embedding.
                Reading it relies on the notebook-level ``movie_dict``
                (item index -> raw item id).
    Shape:
        - Input: (torch.Tensor) input features, (user_id, item_id). Shape: (batch size, 2)
        - Output: (torch.Tensor) expected implicit feedback. Shape: (batch size,)
    """
    def __init__(self, n_users, n_items, emb_dim, layers, dropout, pretrained = None):
        super(NCF, self).__init__()

        # initialize Class attributes
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim
        self.layers = layers
        self.n_layers = len(self.layers) + 1
        self.dropout = dropout

        # define layers
        self.user_embedding = nn.Embedding(self.n_users, self.emb_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.emb_dim)
        # The MLP consumes the concatenated [user ; item] embedding, hence 2 * emb_dim inputs.
        self.mlp_layers = MLPLayers([2 * self.emb_dim] + self.layers, self.dropout)
        self.predict_layer = nn.Linear(self.layers[-1], 1)
        self.sigmoid = nn.Sigmoid()

        self.apply(self._init_weights)

        # load item_embedding's weight(pretrained); done after _init_weights so it is not overwritten
        if pretrained is not None:
            # SECURITY: pickle.load can execute arbitrary code; only pass a trusted file.
            with open(pretrained, 'rb') as f:
                pretrained_emb = pickle.load(f)
            # Row i of the item embedding must hold the vector of the item that the
            # dataset encodes as index i, i.e. raw id movie_dict[i]. Do NOT sort the raw
            # ids: movie_dict follows order of first appearance, so a sorted gather would
            # attach pretrained vectors to the wrong items.
            # Assumes movie_dict has the keys 0..n_items-1 -- TODO confirm for other data.
            raw_item_ids = [movie_dict[idx] for idx in range(self.n_items)]
            pretrained_weight = pretrained_emb.weight[raw_item_ids, :]

            # load_state_dict enforces an exact (n_items, emb_dim) shape match.
            item_weight = self.item_embedding.state_dict()
            item_weight['weight'] = pretrained_weight
            self.item_embedding.load_state_dict(item_weight)

    # initialize weights
    def _init_weights(self, module):
        """Embeddings and Linear weights ~ N(0, 0.01); Linear biases are zeroed."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            normal_(module.weight.data, 0, 0.01)
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    def forward(self, input_feature):
        """Score each (user index, item index) row of ``input_feature``; returns values in (0, 1)."""
        # Split the last axis into its two id columns and drop the leftover size-1 axis.
        user, item = torch.split(input_feature, [1, 1], -1)
        user = user.squeeze(-1)
        item = item.squeeze(-1)

        user_e = self.user_embedding(user)
        item_e = self.item_embedding(item)

        input_feature = torch.cat((user_e, item_e), -1)
        mlp_output = self.mlp_layers(input_feature)
        output = self.predict_layer(mlp_output)
        output = self.sigmoid(output)
        return output.squeeze(-1)

## 설정 및 하이퍼파라미터

In [13]:
# ---- Settings and hyperparameters ----
batch_size = 2048
data_shuffle = True
emb_dim = 100
layers = [1024, 256, 64]
dropout = 0
epochs = 5
learning_rate = 0.001
gpu_idx = 0
early_stop = 5

# Embedding table sizes: number of distinct users / items in the ratings log.
n_users = raw_rating_df['user'].nunique()
n_items = raw_rating_df['item'].nunique()

# ---- Reproducibility and device selection ----
torch.manual_seed(seed)
cuda_available = torch.cuda.is_available()
if cuda_available:
    # settings for reproducibility
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
device = torch.device(f"cuda:{gpu_idx}" if cuda_available else "cpu")

# ---- Data pipeline, model, loss and optimiser ----
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=data_shuffle)
pretrained_path = '/opt/ml/input/code/experiment/pretrained_emb100'
model = NCF(n_users, n_items, emb_dim, layers, dropout, pretrained=pretrained_path)
model = model.to(device)

# The model already ends in a sigmoid, so plain BCELoss (not the with-logits variant) is used.
loss_fn = nn.BCELoss().to(device)
err_fn = None
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)

## train

In [None]:
# Training loop with early stopping. Kept commented out: the notebook loads an
# already-trained model from 'mdoel_iter5_1024_256_64_lr001' in a later cell instead.
# NOTE(review): two typos in the disabled code are corrected below --
#   `loss.item()s` -> `loss.item()` (syntax error), and
#   `e_stop_stack += 0` -> `e_stop_stack += 1` (the patience counter never advanced,
#   so the early-stop check could not trigger). Confirm before re-enabling.
# # train(early stop)
# size = len(dataloader.dataset)
# num_batches = len(dataloader)
# save_loss = 99999
# e_stop_stack = 0
# print('training started.')
# for e in range(epochs):
#     if e_stop_stack > early_stop:
#         break
#     train_loss = 0
#     print(f'Epoch {e+1} ...')
#     for batch, (x, y) in enumerate(tqdm(dataloader, 
#             total=len(dataloader),
#             bar_format="{l_bar}{r_bar}")):
#         x, y = x.to(device), y.to(device)
#         model.train()
#         optimizer.zero_grad()
#         output = model(x)
#         loss = loss_fn(output, y.float())
#         loss.backward()
#         optimizer.step()

#         train_loss += loss.item()
#         # if (batch+1) % 1000 == 0:
#         #     loss, current = loss.item(), batch * len(x)
#         #     print(f"Loss: {loss:>7f} | [{current:>5d}/{size:>5d}]")
#     train_loss /= num_batches
#     print(f'  - AVG Losses: {train_loss:>7f}')
#     if train_loss < save_loss:
#         e_stop_stack = 0
#         save_loss = train_loss
#         print(f'  - Better performance. Saving model ...')
#         with open('mdoel_iter5_1024_256_64_lr001', 'wb') as f:
#             pickle.dump(model, f)
#     else:
#         e_stop_stack += 1
#     print()
# print('training done.')

In [None]:
# # Save the model (disabled). The file name is spelled 'mdoel' on purpose here:
# # it has to match the name the load cell opens.
# with open('mdoel_iter5_1024_256_64_lr001', 'wb') as f:
#     pickle.dump(model, f)

In [14]:
# Load the trained model. This replaces the freshly constructed `model` object.
# SECURITY: pickle.load can execute arbitrary code; only load a file written by this notebook.
# NOTE(review): unpickling presumably needs the NCF and MLPLayers classes to be defined
# in this kernel already -- run the model cell first.
with open('mdoel_iter5_1024_256_64_lr001', 'rb') as f:
    model = pickle.load(f)

## inference

In [32]:
# Score matrix template (users x items): -100 marks pairs already present in the
# ratings log so they can never rank among the top predictions; every other cell is 0
# until the model's score is written into it.
# `assign` works on a copy, so raw_rating_df itself is left untouched and this cell
# can be re-run without changing what earlier cells see.
inference_matrix = (
    raw_rating_df
    .assign(rating=-100)
    .pivot_table('rating', 'user', 'item')
    .fillna(0)
)

# Only pairs without observed feedback need a prediction.
inference_data = implicit_df[implicit_df.implicit_feedback != 1]
# Derive the feature columns from the frame being sliced (not from implicit_df).
inference_X = inference_data.drop(columns=['implicit_feedback'])
inference_y = inference_data['implicit_feedback']

# NOTE: `dataset` is re-bound here and no longer refers to the training data.
dataset = TensorDataset(torch.LongTensor(np.array(inference_X)), torch.FloatTensor(np.array(inference_y)))

In [38]:
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
model = model.to(device)
model.eval()

# Resolve "dataset index -> position on the matrix axis" once, so each batch can be
# written with one vectorised NumPy assignment instead of one DataFrame.loc call per pair.
# user_dict / movie_dict map dataset index -> raw id; assumes their keys are 0..n-1.
user_pos = inference_matrix.index.get_indexer([user_dict[u] for u in range(len(user_dict))])
item_pos = inference_matrix.columns.get_indexer([movie_dict[i] for i in range(len(movie_dict))])
if (user_pos < 0).any() or (item_pos < 0).any():
    # get_indexer returns -1 for labels that are missing from the axis.
    raise KeyError("user_dict / movie_dict contain ids that are absent from inference_matrix")

# Scatter predictions into a writable copy of the matrix values.
scores = inference_matrix.to_numpy(copy=True)
with torch.no_grad():
    for x, _ in tqdm(dataloader, total=len(dataloader), bar_format="{l_bar}{r_bar}"):
        output = model(x.to(device)).cpu().numpy()
        pairs = x.numpy()  # column 0: user index, column 1: item index
        scores[user_pos[pairs[:, 0]], item_pos[pairs[:, 1]]] = output

# Re-wrap with the original labels (index / column names are carried by the Index objects).
inference_matrix = pd.DataFrame(scores, index=inference_matrix.index, columns=inference_matrix.columns)


100%|| 101716/101716 [9:06:36<00:00,  3.10it/s] 


In [40]:
# Preview the filled score matrix; cells for pairs present in the ratings log hold -100.
inference_matrix.head(3)

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,-100.0,0.843135,0.024481,0.001992,0.053384,0.029017,0.010009,0.010452,0.010817,0.310516,...,3.33005e-08,2.651161e-09,8.283698e-09,1.582915e-08,1.077139e-07,4.086198e-09,8.319276e-09,5.289579e-08,5.341038e-09,3.084643e-08
14,-100.0,0.193124,0.043458,0.000409,0.036686,0.000452,-100.0,0.008109,3.6e-05,0.07904,...,1.128581e-06,1.011685e-08,5.417369e-10,3.901542e-07,3.445418e-07,3.263885e-08,2.879336e-08,3.218162e-06,1.03619e-08,1.823451e-08
18,0.032893,0.001325,0.000658,0.000872,0.000949,0.003495,0.002556,5.5e-05,2.5e-05,0.001376,...,9.642567e-08,7.717298e-08,7.628975e-08,7.779003e-07,6.396987e-08,1.076574e-07,5.925844e-08,3.206567e-08,2.247216e-08,6.40139e-09


In [41]:
top_k = 10  # number of recommended items per user

# Use an explicit ndarray: what np.argpartition returns for a DataFrame input depends
# on the pandas/NumPy versions in use.
score_values = inference_matrix.to_numpy()
# Column positions of the top_k highest scores in each row. argpartition only guarantees
# membership in the top_k, not any order within it.
top_positions = np.argpartition(score_values, -top_k, axis=1)[:, -top_k:]
result = pd.DataFrame(top_positions, index=inference_matrix.index)

item_columns = inference_matrix.columns
# Each user id repeated top_k times, aligned row by row with the flattened positions.
final_users = np.repeat(result.index.to_numpy(), top_k).tolist()
# Translate column positions back into raw item ids.
final_items = item_columns.to_numpy()[top_positions.ravel()].tolist()

submission_df = pd.DataFrame({'user': final_users, 'item': final_items})
submission_df.to_csv("./ncf_iter5_1024_256_64_lr001.csv", index=False)

In [42]:
# Display the submission frame: one (user, item) row per recommendation.
submission_df

Unnamed: 0,user,item
0,11,7373
1,11,40815
2,11,4886
3,11,4370
4,11,5679
...,...,...
313595,138493,2628
313596,138493,589
313597,138493,1270
313598,138493,2011
