## Global Settings and Imports

In [1]:
# Enable autoreload so that edits to imported project modules are picked up automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import yaml
from dotmap import DotMap
from os import path
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer
from models.lstur import LSTUR
from models.nrms import NRMS
from models.naml import NAML
from models.naml_simple import NAML_Simple
from models.sentirec import SENTIREC
from models.robust_sentirec import ROBUST_SENTIREC
from data.dataset import BaseDataset
from tqdm import tqdm

## Prepare parameters

In [None]:
# Notebook stand-in for command-line arguments: `config` is the experiment
# YAML to load, `resume` an optional checkpoint path (None = start fresh).
args = argparse.Namespace(
    config="config/model/nrms/exp_manual1.yaml",
    resume=None,
)

# Read the YAML explicitly as UTF-8: the platform default encoding (e.g. a
# legacy Windows code page) can fail on or mis-decode non-ASCII characters.
with open(args.config, "r", encoding="utf-8") as ymlfile:
    # NOTE(review): FullLoader is kept unchanged; if config files can come
    # from untrusted sources, consider yaml.safe_load instead.
    raw_config = yaml.load(ymlfile, Loader=yaml.FullLoader)
# DotMap gives attribute-style access (config.name, config.trainer, ...).
config = DotMap(raw_config)

# Fail early, with a readable message, when the config names a model this
# notebook cannot build.
SUPPORTED_MODELS = ["lstur", "nrms", "naml", "naml_simple", "sentirec", "robust_sentirec"]
assert config.name in SUPPORTED_MODELS, (
    f"Unsupported model name {config.name!r}; expected one of {SUPPORTED_MODELS}"
)

# Seed the random number generators through Lightning for reproducible runs.
pl.seed_everything(1234)

# Logger and checkpoint callback are configured entirely from the YAML sections.
logger = TensorBoardLogger(**config.logger)
checkpoint_callback = ModelCheckpoint(**config.checkpoint)

Seed set to 1234


## Load data

In [5]:
# Directory that holds the preprocessed files for the configured dataset size.
# The trailing "/" is kept so that plain string concatenation with a file name
# still yields a valid path.
preprocess_path = f"{config.preprocess_data_path}/{config.dataset_size}/"

# Pass the directory and file name to path.join as separate components;
# joining a single pre-concatenated string is a no-op.
train_dataset = BaseDataset(
    path.join(preprocess_path, config.train_behavior),
    path.join(preprocess_path, config.train_news),
    config)
# NOTE(review): validation also reads config.train_news - presumably both
# behavior splits share one news file; confirm this is intended.
val_dataset = BaseDataset(
    path.join(preprocess_path, config.val_behavior),
    path.join(preprocess_path, config.train_news),
    config)

# DataLoader options (batch size, workers, ...) come from the YAML config.
train_loader = DataLoader(
    train_dataset,
    **config.train_dataloader)
val_loader = DataLoader(
    val_dataset,
    **config.val_dataloader)

100%|██████████| 235/235 [00:00<00:00, 36131.28it/s]
100%|██████████| 1/1 [00:00<00:00, 333.57it/s]
100%|██████████| 235/235 [00:00<00:00, 27629.69it/s]
100%|██████████| 1/1 [00:00<00:00, 1001.74it/s]


In [6]:
# Load the pre-trained word-embedding weights: each non-blank line of the file
# is parsed into one row of floats.
embedding_weights = []
with open(path.join(preprocess_path, config.embedding_weights), 'r') as file:
    lines = file.readlines()
for line in tqdm(lines):
    # split() with no argument tolerates trailing spaces, tabs and repeated
    # separators, which split(" ") would turn into unparsable tokens.
    values = line.split()
    if not values:
        # Skip blank lines (typically a trailing newline at end of file).
        continue
    embedding_weights.append([float(value) for value in values])
# float32 matches the default dtype of torch parameters.
pretrained_word_embedding = torch.from_numpy(
    np.array(embedding_weights, dtype=np.float32)
)

100%|██████████| 3685/3685 [00:00<00:00, 22712.46it/s]


## Create model

In [7]:
# Echo the selected architecture so it shows up in the cell output.
print(config.name)

# Map each supported config name to its model class; the first matching entry
# is instantiated with the config and the pre-trained embedding matrix.
model_choices = (
    ("lstur", LSTUR),
    ("nrms", NRMS),
    ("naml", NAML),
    ("naml_simple", NAML_Simple),
    ("sentirec", SENTIREC),
    ("robust_sentirec", ROBUST_SENTIREC),
)
for choice_name, model_cls in model_choices:
    if config.name == choice_name:
        model = model_cls(config, pretrained_word_embedding)
        break

nrms


## Train model

In [8]:
# Early stopping is configured entirely from the YAML `early_stop` section.
early_stop_callback = EarlyStopping(
    **config.early_stop
)

if args.resume is not None:
    # load_from_checkpoint is a classmethod; Lightning >= 2.0 refuses to run it
    # on an instance, so call it on the model's class and keep the result.
    model = type(model).load_from_checkpoint(
        args.resume,
        config=config,
        pretrained_word_embedding=pretrained_word_embedding)

# Trainer(resume_from_checkpoint=...) was removed in Lightning 2.0 and raises a
# TypeError there; the checkpoint path belongs to trainer.fit(ckpt_path=...)
# instead. The Trainer itself is therefore built the same way in both cases.
trainer = Trainer(
    **config.trainer,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger=logger,
)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\trainer\setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [9]:
# Train the model, validating on val_loader.
# ckpt_path=None (no resume requested) starts from scratch; a checkpoint path
# makes Lightning resume training from that checkpoint.
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
    ckpt_path=args.resume,
)

c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory C:\Users\USER\Desktop\projects\python\newsrecommend\SentiRecTest\project\logs\lightning_logs\checkpoints\nrms\exp1 exists and is not empty.

   | Name                                   | Type             | Params | Mode 
-------------------------------------------------------------------------------------
0  | news_encoder                           | TimeDistributed  | 1.5 M  | train
1  | user_encoder                           | UserEncoder      | 421 K  | train
2  | val_performance_metrics                | MetricCollection | 0      | train
3  | val_sentiment_diversity_metrics_vader  | MetricCollection | 0      | train
4  | val_sentiment_diversity_metrics_bert   | MetricCollection | 0      | train
5  | test_performance_metrics               | MetricCollection | 0      | train
6  | test_sentiment_diversity_metrics_vader | MetricCollection | 0      | train
7  |

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved. New best score: 0.178
Epoch 0, global step 1: 'val_auc_epoch' reached 0.17778 (best 0.17778), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=0-val_auc_epoch=0.1778.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.111 >= min_delta = 0.0001. New best score: 0.289
Epoch 1, global step 2: 'val_auc_epoch' reached 0.28889 (best 0.28889), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=1-val_auc_epoch=0.2889.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.289 >= min_delta = 0.0001. New best score: 0.578
Epoch 2, global step 3: 'val_auc_epoch' reached 0.57778 (best 0.57778), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=2-val_auc_epoch=0.5778.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.178 >= min_delta = 0.0001. New best score: 0.756
Epoch 3, global step 4: 'val_auc_epoch' reached 0.75556 (best 0.75556), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=3-val_auc_epoch=0.7556.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.044 >= min_delta = 0.0001. New best score: 0.800
Epoch 4, global step 5: 'val_auc_epoch' reached 0.80000 (best 0.80000), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=4-val_auc_epoch=0.8000.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.067 >= min_delta = 0.0001. New best score: 0.867
Epoch 5, global step 6: 'val_auc_epoch' reached 0.86667 (best 0.86667), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=5-val_auc_epoch=0.8667.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.044 >= min_delta = 0.0001. New best score: 0.911
Epoch 6, global step 7: 'val_auc_epoch' reached 0.91111 (best 0.91111), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=6-val_auc_epoch=0.9111.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 8: 'val_auc_epoch' reached 0.91111 (best 0.91111), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=7-val_auc_epoch=0.9111.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 9: 'val_auc_epoch' reached 0.91111 (best 0.91111), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=8-val_auc_epoch=0.9111.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.022 >= min_delta = 0.0001. New best score: 0.933
Epoch 9, global step 10: 'val_auc_epoch' reached 0.93333 (best 0.93333), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=9-val_auc_epoch=0.9333.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 11: 'val_auc_epoch' reached 0.93333 (best 0.93333), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=10-val_auc_epoch=0.9333.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 12: 'val_auc_epoch' reached 0.93333 (best 0.93333), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=11-val_auc_epoch=0.9333.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 13: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 14: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_auc_epoch did not improve in the last 5 records. Best score: 0.933. Signaling Trainer to stop.
Epoch 14, global step 15: 'val_auc_epoch' was not in top 3


In [None]:
# Length of the batch entry that stores the titles of all news in the history
# of one impression. When the history is shorter than the configured size,
# the remaining slots are filled with the padding value (0).
first_sample = train_dataset[0]
len(first_sample['h_title'])

50

In [None]:
# The token length of every news title is fixed as well;
# likewise, all unused positions are filled with the padding value (0).
first_history_titles = train_dataset[0]['h_title']
len(first_history_titles[0])

20