## Global Settings and Imports

In [1]:
# Turn on autoreload so that changes to the source of modules imported into
# this notebook are picked up automatically.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import yaml
from dotmap import DotMap
from os import path
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer
from models.lstur import LSTUR
from models.nrms import NRMS
from models.naml import NAML
from models.naml_simple import NAML_Simple
from models.sentirec import SENTIREC
from models.robust_sentirec import ROBUST_SENTIREC
from data.dataset import BaseDataset
from tqdm import tqdm

## Prepare parameters

In [3]:
# Notebook stand-in for command-line arguments: the experiment config file to
# use and an optional checkpoint path to resume from (None = start fresh).
args = argparse.Namespace(config="config/model/nrms/exp1.yaml", resume=None)

# Parse the YAML experiment file and wrap it in a DotMap for attribute access.
# NOTE(review): FullLoader is not meant for untrusted input; if the config
# only contains plain scalars/lists/maps, yaml.safe_load would do — confirm.
with open(args.config, 'r') as ymlfile:
    config = DotMap(yaml.load(ymlfile, Loader=yaml.FullLoader))

# Fail fast when the config names a model this notebook does not know about.
assert config.name in (
    "lstur", "nrms", "naml", "naml_simple", "sentirec", "robust_sentirec",
)

# Seed the random number generators so runs are repeatable.
pl.seed_everything(1234)

# TensorBoard logger configured from the `logger` section of the config.
logger = TensorBoardLogger(**config.logger)

Seed set to 1234


In [4]:
# Checkpoint-saving callback, configured from the `checkpoint` section of the
# config; printed to confirm that it was constructed.
checkpoint_callback = ModelCheckpoint(**config.checkpoint)
print(checkpoint_callback)

<pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint object at 0x00000220E5EC5130>


## Load data

In [5]:
# Folder holding the preprocessed files for the configured dataset size.
preprocess_path = "{}/{}/".format(config.preprocess_data_path, config.dataset_size)

# Training split: behavior file + news file.
train_dataset = BaseDataset(
    path.join(preprocess_path + config.train_behavior),
    path.join(preprocess_path + config.train_news),
    config,
)

# Validation split.
# NOTE(review): this pairs the validation behaviors with the *training* news
# file (config.train_news) — confirm that this is intended.
val_dataset = BaseDataset(
    path.join(preprocess_path + config.val_behavior),
    path.join(preprocess_path + config.train_news),
    config,
)

# Wrap both datasets in DataLoaders using the per-split settings from the config.
train_loader = DataLoader(train_dataset, **config.train_dataloader)
val_loader = DataLoader(val_dataset, **config.val_dataloader)

100%|██████████| 26740/26740 [00:01<00:00, 24427.78it/s]
100%|██████████| 28994/28994 [00:07<00:00, 4059.78it/s]
100%|██████████| 26740/26740 [00:01<00:00, 24782.23it/s]
100%|██████████| 2204/2204 [00:00<00:00, 2405.62it/s]


In [6]:
# Load the pre-trained word-embedding weights: one vector per line, with the
# components separated by whitespace.
embedding_weights = []
with open(path.join(preprocess_path+config.embedding_weights), 'r') as file:
    lines = file.readlines()
    for line in tqdm(lines):
        # split() with no argument tolerates trailing spaces, repeated
        # separators and tabs, all of which make split(" ") yield tokens
        # that float() rejects.
        values = line.split()
        if not values:
            # Blank line (e.g. an extra newline at the end of the file).
            continue
        embedding_weights.append([float(w) for w in values])

# Convert to a float32 tensor for the models' embedding layers.
pretrained_word_embedding = torch.from_numpy(
    np.array(embedding_weights, dtype=np.float32)
)

100%|██████████| 42562/42562 [00:02<00:00, 18878.45it/s]


## Instantiate model

In [7]:
# Instantiate the architecture named in the config. Every supported model
# class takes the same (config, pretrained_word_embedding) arguments.
print(config.name)
model_classes = {
    "lstur": LSTUR,
    "nrms": NRMS,
    "naml": NAML,
    "naml_simple": NAML_Simple,
    "sentirec": SENTIREC,
    "robust_sentirec": ROBUST_SENTIREC,
}
try:
    model_cls = model_classes[config.name]
except (KeyError, TypeError):
    # Raise explicitly instead of falling through: otherwise `model` would be
    # left undefined, or silently keep a stale value from an earlier run of
    # this cell in the same kernel.
    raise ValueError(
        f"Unsupported model name: {config.name!r}; "
        f"expected one of {sorted(model_classes)}"
    ) from None
model = model_cls(config, pretrained_word_embedding)

nrms


## Train model

In [None]:
# Early-stopping callback, configured from the `early_stop` section of the config.
early_stop_callback = EarlyStopping(**config.early_stop)

if args.resume is not None:
    # Restore the model weights from the checkpoint given in args.resume.
    # load_from_checkpoint is a classmethod: Lightning 2.x raises a TypeError
    # when it is invoked on an instance, so call it on the model's class.
    model = type(model).load_from_checkpoint(
        args.resume,
        config=config,
        pretrained_word_embedding=pretrained_word_embedding)

# The Trainer is built the same way whether or not we resume.
# `Trainer(resume_from_checkpoint=...)` was removed in Lightning 2.0 and makes
# the constructor fail; restoring the full training state (optimizer, epoch
# counter, ...) is requested through the `ckpt_path` argument of
# `Trainer.fit` instead.
trainer = Trainer(
    **config.trainer,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger=logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [9]:
# Run training with validation after each epoch.
# `ckpt_path` is how Lightning restores the full training state when resuming;
# args.resume is None for a fresh run, which is the same as omitting it.
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
    ckpt_path=args.resume,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                                   | Type             | Params | Mode 
-------------------------------------------------------------------------------------
0  | news_encoder                           | TimeDistributed  | 13.2 M | train
1  | user_encoder                           | UserEncoder      | 421 K  | train
2  | val_performance_metrics                | MetricCollection | 0      | train
3  | val_sentiment_diversity_metrics_vader  | MetricCollection | 0      | train
4  | val_sentiment_diversity_metrics_bert   | MetricCollection | 0      | train
5  | test_performance_metrics               | MetricCollection | 0      | train
6  | test_sentiment_diversity_metrics_vader | MetricCollection | 0      | train
7  | test_sentiment_diversity_metrics_bert  | MetricCollection | 0      | train
8  | test_topic_diversity_metrics           | MetricCollection | 0      | train
9  | test_ils_senti_metrics_vader           | MetricCollection | 0     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\USER\anaconda3\envs\newsrec\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved. New best score: 0.628
Epoch 0, global step 453: 'val_auc_epoch' reached 0.62799 (best 0.62799), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=0-val_auc_epoch=0.6280.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.020 >= min_delta = 0.0001. New best score: 0.648
Epoch 1, global step 906: 'val_auc_epoch' reached 0.64768 (best 0.64768), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=1-val_auc_epoch=0.6477.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.009 >= min_delta = 0.0001. New best score: 0.656
Epoch 2, global step 1359: 'val_auc_epoch' reached 0.65634 (best 0.65634), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=2-val_auc_epoch=0.6563.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.001 >= min_delta = 0.0001. New best score: 0.657
Epoch 3, global step 1812: 'val_auc_epoch' reached 0.65727 (best 0.65727), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=3-val_auc_epoch=0.6573.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.003 >= min_delta = 0.0001. New best score: 0.661
Epoch 4, global step 2265: 'val_auc_epoch' reached 0.66060 (best 0.66060), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=4-val_auc_epoch=0.6606.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 2718: 'val_auc_epoch' reached 0.65808 (best 0.66060), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=5-val_auc_epoch=0.6581.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.004 >= min_delta = 0.0001. New best score: 0.665
Epoch 6, global step 3171: 'val_auc_epoch' reached 0.66510 (best 0.66510), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=6-val_auc_epoch=0.6651.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.003 >= min_delta = 0.0001. New best score: 0.668
Epoch 7, global step 3624: 'val_auc_epoch' reached 0.66780 (best 0.66780), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=7-val_auc_epoch=0.6678.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 4077: 'val_auc_epoch' reached 0.66457 (best 0.66780), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=8-val_auc_epoch=0.6646.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_auc_epoch improved by 0.002 >= min_delta = 0.0001. New best score: 0.670
Epoch 9, global step 4530: 'val_auc_epoch' reached 0.66968 (best 0.66968), saving model to 'C:\\Users\\USER\\Desktop\\projects\\python\\newsrecommend\\SentiRecTest\\project\\logs\\lightning_logs\\checkpoints\\nrms\\exp1\\epoch=9-val_auc_epoch=0.6697.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 4983: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 5436: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 5889: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 6342: 'val_auc_epoch' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_auc_epoch did not improve in the last 5 records. Best score: 0.670. Signaling Trainer to stop.
Epoch 14, global step 6795: 'val_auc_epoch' was not in top 3


In [None]:
# Length of the entry that stores the titles of all news items
# in the history of a single impression.
# When the history holds fewer items than the configured size,
# the remaining slots are filled with the padding value (0).
len(train_dataset[0]['h_title'])

50

In [None]:
# The token length of every news title is fixed as well.
# Likewise, all empty positions are filled with the padding value (0).
len(train_dataset[0]['h_title'][0])

20