In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data

import numpy as np
import math

import pickle

### データを読み込む

In [3]:
with open('data/user_train_arr.pkl', mode='rb') as f:
    user_arr = pickle.load(f)
with open('data/movie_train_arr.pkl', mode='rb') as f:
    movie_arr = pickle.load(f)
with open('data/score_train_arr.pkl', mode='rb') as f:
    score_arr = pickle.load(f)
with open('data/user_test_arr.pkl', mode='rb') as f:
    user_test_arr = pickle.load(f)
with open('data/movie_test_arr.pkl', mode='rb') as f:
    movie_test_arr = pickle.load(f)
with open('data/score_test_arr.pkl', mode='rb') as f:
    score_test_arr = pickle.load(f)

In [4]:
type(user_arr), type(movie_arr), type(score_arr)

(scipy.sparse.csc.csc_matrix, scipy.sparse.csc.csc_matrix, numpy.ndarray)

In [5]:
user_arr.shape, movie_arr.shape, score_arr.shape

((750156, 6041), (750156, 3953), (750156,))

In [6]:
user_test_arr.shape, movie_test_arr.shape, score_test_arr.shape

((250053, 6041), (250053, 3953), (250053,))

### NCF Architecture

In [7]:
class Model(nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        
        # GMF
        self.item_embedding_gmf = nn.Linear(3953, 1000, bias=False)
        self.user_embedding_gmf = nn.Linear(6041, 1000, bias=False)
        
        self.GMF_Linear = nn.Linear(1000, 1)
        
        # MLP
        self.item_embedding_mlp = nn.Linear(3953, 1000, bias=False)
        self.user_embedding_mlp = nn.Linear(6041, 1000, bias=False)
        self.linear1 = nn.Linear(2000, 1000)
        self.linear2 = nn.Linear(1000, 500)
        self.linear3 = nn.Linear(500, 250)
        self.linear4 = nn.Linear(250, 125)
        self.linear5 = nn.Linear(125, 65)
        self.linear6 = nn.Linear(65, 1)
        
        # NeuMF
        self.linear_MF = nn.Linear(2, 1)
    
    def forward(self, item, user):
        
        # GMF
        item_gmf = self.item_embedding_gmf(item)
        user_gmf = self.user_embedding_gmf(user)
        product = torch.mul(item_gmf, user_gmf)
        y_gmf = self.GMF_Linear(product)
        y_gmf = torch.sigmoid(y_gmf)
        
        # MLP
        item_mlp = self.item_embedding_mlp(item)
        user_mlp = self.user_embedding_mlp(user)
        item_user_concat = torch.cat([item_mlp, user_mlp], dim=1)
        y_mlp = self.linear1(item_user_concat)
        y_mlp = torch.relu(y_mlp)
        y_mlp = self.linear2(y_mlp)
        y_mlp = torch.relu(y_mlp)
        y_mlp = self.linear3(y_mlp)
        y_mlp = torch.relu(y_mlp)
        y_mlp = self.linear4(y_mlp)
        y_mlp = torch.relu(y_mlp)
        y_mlp = self.linear5(y_mlp)
        y_mlp = torch.relu(y_mlp)
        y_mlp = self.linear6(y_mlp)
        
        # NeuMF
        y = torch.cat([y_gmf, y_mlp], dim=1)
        y = self.linear_MF(y)
        y = torch.sigmoid(y)
        
        return y
        

In [8]:
model = Model()

In [9]:
movie_arr[0].toarray().shape

(1, 3953)

In [10]:
item = torch.FloatTensor(movie_arr[0].toarray())
user = torch.FloatTensor(user_arr[0].toarray())

model(item, user)

tensor([[0.7014]], grad_fn=<SigmoidBackward>)

### DataSetを作成する

In [11]:
# Datasetの定義
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, item, user, score):
        
        self.item = item
        self.user = user
        self.score = score
        self.length = item.shape[1]
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, index):
        
        item_vec = self.item[index].toarray().astype("float32")
        user_vec = self.user[index].toarray().astype("float32")
        score = self.score[index].astype("float32")
        
        return item_vec, user_vec, score

In [12]:
trainset = MyDataSet(movie_arr, user_arr, score_arr)

In [13]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100, shuffle=True)

In [14]:
for data in trainloader:
    res = model(torch.squeeze(data[0]), torch.squeeze(data[1]))
    break

### 訓練開始

In [15]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score - self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint_pairwise.pt')
        self.val_loss_min = val_loss

In [16]:
# ハイパーパラメータ
learning_rate = 0.0001
epochs = 50

In [17]:
model = Model()

In [21]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [22]:
early_stopping = EarlyStopping(patience=10, verbose=True)

In [24]:
train_loss = list()
valid_loss = list()
train_losses = list()
valid_losses = list()
for epoch in range(epochs):
    print(epoch)
    
    model.train()
    for data in trainloader:
        
        output = model(torch.squeeze(data[0]), torch.squeeze(data[1]))
        loss = criterion(torch.squeeze(output), data[2])
    
        # Backward and optimize
        optimizer.zero_grad()
        
        train_losses.append(loss.item())
        
        loss.backward()
        optimizer.step()
        
    train_loss = np.average(train_losses)
    print(train_loss)

0
0.5775279581546784
1
0.5772336773574352
2
0.5764692490299542
3
0.5671645103022456
4
0.5424926435947418
5
0.5233024212221304
6
0.5083345114120416
7
0.4965631103143096
8
0.48704628803663785
9
0.47932173766195774
10
0.4725741881538521
11
0.46579667658855517
12
0.4591732217715337
13
0.4533531438559294
14
0.4483123677472273
15
0.4438952469732612
16
0.4399811647394124
17
0.43649066334797276
18
0.43337855444926965
19
0.4305829444900155
20
0.4280332066473507
21
0.4257327745922587
22
0.4236189263670341
23
0.4216916535360118
24
0.41989152306318284
25
0.4182492385403468
26
0.41673052548258394
27
0.4153243705098118
28
0.4140006389854283
29
0.41276793539524076
30
0.4116228936660674
31
0.41055202956777065
32
0.40956136715231517
33
0.40860914964886275
34
0.4077200755476952
35
0.4068718884761135
36
0.40607098242318307
37
0.40530860606384905
38
0.4045831608848694
39
0.40391138575971125
40
0.4032656713229854
41
0.40264267960474603
42
0.40205773456498634
43
0.4014981653371995
44
0.40096090392933953
45


In [25]:
model.eval()

Model(
  (item_embedding_gmf): Linear(in_features=3953, out_features=1000, bias=False)
  (user_embedding_gmf): Linear(in_features=6041, out_features=1000, bias=False)
  (GMF_Linear): Linear(in_features=1000, out_features=1, bias=True)
  (item_embedding_mlp): Linear(in_features=3953, out_features=1000, bias=False)
  (user_embedding_mlp): Linear(in_features=6041, out_features=1000, bias=False)
  (linear1): Linear(in_features=2000, out_features=1000, bias=True)
  (linear2): Linear(in_features=1000, out_features=500, bias=True)
  (linear3): Linear(in_features=500, out_features=250, bias=True)
  (linear4): Linear(in_features=250, out_features=125, bias=True)
  (linear5): Linear(in_features=125, out_features=65, bias=True)
  (linear6): Linear(in_features=65, out_features=1, bias=True)
  (linear_MF): Linear(in_features=2, out_features=1, bias=True)
)

### ２値分類の評価

In [47]:
res_lst = list()

for i in range(0, len(score_test_arr)):
    res = model(torch.FloatTensor(movie_test_arr[i].toarray()), torch.FloatTensor(user_test_arr[i].toarray()))
    res_lst.append(res.item())

In [48]:
len(res_lst)

250053

In [49]:
score_test_arr.shape, movie_test_arr.shape

((250053,), (250053, 3953))

In [52]:
with open('result.pkl', mode='wb') as f:
    pickle.dump(res_lst, f)

In [53]:
for i in range(0, len(res_lst)):
    if res_lst[i] >= 0.5:
        res_lst[i] = 1.0
    else:
        res_lst[i] = 0.0

In [54]:
with open('result_binary.pkl', mode='wb') as f:
    pickle.dump(res_lst, f)

In [55]:
from sklearn.metrics import f1_score

f1_score(score_test_arr, res_lst) 

0.8578131565993109

In [56]:
torch.save(model, 'nfc_ver1.pth')

  "type " + obj.__name__ + ". It won't be checked "


In [57]:
user_test_arr.shape

(250053, 6041)