In [1]:
import os

import torch
from torch.utils.data import DataLoader

from model import Dataset
from model import collate_fn, build_pretrained_model
from model import RefScorer

# 1. Argument 설정

## 1.1 Dataset Path
Sample은 Unreference score model training 시 사용되는 query와 gold_reply만 제공

In [2]:
dataset_path = 'dataset'

train_query_path = os.path.join(dataset_path, 'smart_src.txt')
train_gold_reply_path = os.path.join(dataset_path, 'smart_tgt.txt')
train_gen_reply_path = None

train_data_path_list = [train_query_path, train_gold_reply_path]

## 1.2 OpenNMT Path
OpenNMT Training시 사용 된 vocab.pt 파일과 Pretraining된 모델의 경로

In [3]:
onmt_path = 'sample_onmt'

onmt_vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')
onmt_model_path = os.path.join(onmt_path, 'sample.model.pt')

## 1.3 Training Parameter
device는 "cpu" 또는 "cuda:[device_id]"

In [4]:
#device = torch.device('cpu')
device = torch.device('cuda:0')

batch_size = 4

# 2. 객체 생성

# 2.1 Dataset instance 생성
Dataset 구축을 위해 OpenNMT vocab과 data file path list를 넘겨줌.

In [5]:
# Training dataset 구축
my_dataset = Dataset(
    vocab_path=onmt_vocab_path,
    data_path_list=train_data_path_list,
    max_length=35
)

[*] Loading onmt vocab dictionary...
Number of dictionary : 23786 

[*] Reading lines...
Avg length of data 0 : 6.81
Avg length of data 1 : 6.81

Read 90729 sentence pairs
Trim data to 90701 sentence pairs 

[*] Success to preprocess data! 



# 2.2 DataLoader instance 생성
실제 학습 시 DataLoader 객체를 다음과 같이 iteration 시키면 알아서 Batch data를 iterate 해줌.

```python
for data1, data1_len, data2, data2_len in dataloader_instance:
    # RefScorer._embed_sentence 참조
    pad_batch(data1, data1_len)
    # model 학습
    # 나온 output reordering (packing 때문에)
```

DataLoader에서 던져주는 값들은 **각각의 데이터의 Batch와 Length들의 반복**으로 이루어져 있음. 예를 들어 위의 Example 상황 같은 경우, Batch size가 16이고 Query와 Gold reply, 2개의 데이터로 이루어져 있으므로 다음과 같이 던져주게 됨.

* Query sentence 4개 tuple
* Query sentence length 4개 tuple
* Gold reply sentence 4개 tuple
* Gold reply sentence length 4개 tuple

만약 Gen reply까지 Dataset에 포함을 시키면 총 6개의 instance를 iterate 해줌

In [6]:
my_data_loader = DataLoader(
    dataset=my_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=16
)

In [7]:
# Example
next(iter(my_data_loader))

[([6, 5, 55, 3],
  [54, 78, 3],
  [6, 5, 470, 898, 458, 471, 5, 3207, 12, 777, 8, 67, 3],
  [6, 5, 9, 11, 674, 0, 12170, 2]),
 (4, 3, 13, 8),
 ([54, 78, 3],
  [6, 5, 470, 898, 458, 471, 5, 3207, 12, 777, 8, 67, 3],
  [6, 5, 9, 11, 674, 0, 12170, 2],
  [6, 2]),
 (3, 13, 8, 2)]

# 2.3 OpenNMT 모델 로드
OpenNMT의 pretrained embedding과 rnn의 parameter를 땡겨옴.

Sample은 Embedding dimension 10, RNN size 15의 Tiny model임.

In [8]:
encoder = build_pretrained_model(onmt_model_path, my_dataset.vocab)

Loading pretrained model... 

1
RNNEncoder(
  (embeddings): Embeddings(
    (make_embedding): Sequential(
      (emb_luts): Elementwise(
        (0): Embedding(23786, 10, padding_idx=1)
      )
    )
  )
  (rnn): GRU(10, 15, bidirectional=True)
) 



# 3. Reference Score 계산
Reference score 계산을 위한 dataset 불러옴. (query / gold / gen)

In [9]:
dataset_path = 'dataset'

gold_reply_path = os.path.join(dataset_path, 'gold_sampled_3000.txt')
gen_reply_path = os.path.join(dataset_path, 'gen_sampled_3000.txt')

data_path_list = [gold_reply_path, gen_reply_path]

In [10]:
ref_dataset = Dataset(
    vocab_path=onmt_vocab_path,
    data_path_list=data_path_list,
    max_length=50
)

[*] Loading onmt vocab dictionary...
Number of dictionary : 23786 

[*] Reading lines...
Avg length of data 0 : 6.97
Avg length of data 1 : 3.93

Read 3000 sentence pairs
Trim data to 2999 sentence pairs 

[*] Success to preprocess data! 



DataLoader 생성

In [11]:
ref_data_loader = DataLoader(
    dataset=ref_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=16
)

Reference Scorer instance 생성

In [12]:
ref_scorer = RefScorer(encoder, device)

DataLoader를 돌면서 Cosine Similarity 계산

In [13]:
for gold_indices, gold_lens, gen_indices, gen_lens in ref_data_loader:
    print(ref_scorer.get_ref_score(gold_indices, gold_lens, gen_indices, gen_lens))

tensor([ 0.7032,  0.6155,  0.6117,  0.1440], device='cuda:0')
tensor([ 0.5581,  0.7775,  0.5723,  0.5783], device='cuda:0')
tensor([ 0.5295,  0.1601,  0.6400,  0.4473], device='cuda:0')
tensor([ 0.4414,  0.6365,  0.3485,  0.7338], device='cuda:0')
tensor([ 0.6559,  0.8206,  0.5920,  0.3414], device='cuda:0')
tensor([ 0.6567,  0.5672,  0.4187,  0.5975], device='cuda:0')
tensor([ 0.5643,  0.5367,  0.9256,  0.7548], device='cuda:0')
tensor([ 0.6392,  0.3909,  0.6439,  0.8721], device='cuda:0')
tensor([ 0.3295,  0.4616,  1.0000,  0.7810], device='cuda:0')
tensor([ 0.3670,  0.8975,  0.1833,  0.5063], device='cuda:0')
tensor([ 0.7857,  0.4350,  0.3823,  0.3493], device='cuda:0')
tensor([ 0.4159,  0.4018,  0.4514,  0.8754], device='cuda:0')
tensor([ 0.5300,  0.6804,  0.4690,  0.6971], device='cuda:0')
tensor([ 0.5057,  0.5860,  0.7046,  0.8095], device='cuda:0')
tensor([ 0.3946,  0.2948,  0.9608,  0.4696], device='cuda:0')
tensor([ 0.7205,  0.4207,  0.3429,  0.7285], device='cuda:0')
tensor([

tensor([ 0.9231,  0.7720,  0.2093,  0.6323], device='cuda:0')
tensor([ 0.6234,  0.3006,  0.3483,  0.3418], device='cuda:0')
tensor([ 0.7110,  0.8004,  0.6386,  0.2897], device='cuda:0')
tensor([ 0.5507,  0.3209,  0.8787,  0.8193], device='cuda:0')
tensor([ 0.7517,  0.2845,  0.6940,  0.0914], device='cuda:0')
tensor([ 0.7558,  0.1897,  0.8053,  0.7032], device='cuda:0')
tensor([ 0.7965,  0.4693,  0.4696,  0.8935], device='cuda:0')
tensor([ 0.6523,  0.6426,  0.4798,  0.8660], device='cuda:0')
tensor([ 0.9484,  0.4993,  0.7324,  0.6629], device='cuda:0')
tensor([ 0.2865,  0.8092,  0.5710,  0.8378], device='cuda:0')
tensor([ 0.4139,  0.8282,  0.5645,  0.8633], device='cuda:0')
tensor([ 0.5379,  0.8056,  0.5327,  0.7503], device='cuda:0')
tensor([ 0.3230,  0.8638,  0.7141,  0.4878], device='cuda:0')
tensor([ 0.7028,  0.6388,  0.7113,  0.5722], device='cuda:0')
tensor([ 0.3400,  0.4546,  0.3704,  0.7761], device='cuda:0')
tensor([ 0.9149,  0.4741,  0.2718,  0.5259], device='cuda:0')
tensor([

tensor([ 0.7467,  0.6115,  0.4424,  0.6489], device='cuda:0')
tensor([ 0.7055,  0.3668,  0.7985,  0.7412], device='cuda:0')
tensor([ 0.3110,  0.8310,  0.1631,  0.7761], device='cuda:0')
tensor([ 0.2071,  0.6974,  0.6837,  0.9065], device='cuda:0')
tensor([ 0.6219,  0.6934,  0.8255,  0.6233], device='cuda:0')
tensor([ 0.4132,  0.6910,  0.0794,  0.3300], device='cuda:0')
tensor([ 1.0000,  0.4756,  0.5908,  0.3520], device='cuda:0')
tensor([ 0.5737,  0.7850,  0.5806,  0.3458], device='cuda:0')
tensor([ 0.6849,  0.8330,  0.6555,  0.9102], device='cuda:0')
tensor([ 0.3976,  0.1160,  0.7998,  0.5876], device='cuda:0')
tensor([ 0.0685,  1.0000,  0.6205,  0.5601], device='cuda:0')
tensor([ 0.4313,  0.3930,  0.5449,  0.7443], device='cuda:0')
tensor([ 0.5413,  0.2278,  0.7622,  0.4978], device='cuda:0')
tensor([ 0.4805,  0.4941,  0.5351,  0.7167], device='cuda:0')
tensor([ 0.4494,  0.7008,  0.5281,  0.4977], device='cuda:0')
tensor([ 0.1667,  0.6524,  0.7596,  0.3397], device='cuda:0')
tensor([

tensor([ 0.2924,  0.6073,  0.0890,  0.7598], device='cuda:0')
tensor([ 0.5398,  0.5539,  0.7372,  0.6341], device='cuda:0')
tensor([ 0.7847,  0.4054,  0.3836,  0.4238], device='cuda:0')
tensor([ 0.6463,  0.7174,  0.3270,  0.1851], device='cuda:0')
tensor([ 0.9321,  1.0000,  0.5456,  0.5230], device='cuda:0')
tensor([ 0.7645,  0.3943,  0.6563,  0.3180], device='cuda:0')
tensor([ 0.5364,  0.6852,  0.3350,  0.0851], device='cuda:0')
tensor([ 0.7020,  0.7231,  0.7713,  0.6460], device='cuda:0')
tensor([ 0.2526,  0.7759,  0.8019,  0.5848], device='cuda:0')
tensor([ 0.6415,  0.9069,  0.5474,  0.7879], device='cuda:0')
tensor([ 0.4948,  0.3146,  0.6184,  0.4721], device='cuda:0')
tensor([ 0.3323,  0.3325,  0.5772,  0.5174], device='cuda:0')
tensor([ 0.3302,  0.6849,  0.9089,  0.7061], device='cuda:0')
tensor([ 0.3633,  0.8754,  0.1948,  0.5074], device='cuda:0')
tensor([ 0.7893,  0.6264,  0.7028,  0.5270], device='cuda:0')
tensor([ 0.5376,  0.7370,  0.6869,  0.6738], device='cuda:0')
tensor([

tensor([ 0.5874,  0.5050,  0.4706,  0.4324], device='cuda:0')
tensor([ 0.5977,  0.5835,  0.8539,  0.6911], device='cuda:0')
tensor([ 0.6243,  0.4731,  0.6507,  0.5156], device='cuda:0')
tensor([ 0.9069,  0.6023,  0.3065,  0.6851], device='cuda:0')
tensor([ 0.2630,  0.2584,  0.1707,  0.8755], device='cuda:0')
tensor([ 0.6327,  0.6606,  1.0000,  0.7677], device='cuda:0')
tensor([ 0.7917,  0.3111,  0.4375,  0.5997], device='cuda:0')
tensor([ 0.4509,  0.4047,  0.8127,  0.8669], device='cuda:0')
tensor([ 0.8061,  0.4554,  0.7190,  0.2368], device='cuda:0')
tensor([ 0.4933,  0.3323,  0.4734,  0.3601], device='cuda:0')
tensor([ 0.2321,  0.2352,  0.5416,  0.4685], device='cuda:0')
tensor([ 0.5669,  0.9128,  0.7083,  0.2896], device='cuda:0')
tensor([ 1.0000,  0.1457,  0.3340,  0.6819], device='cuda:0')
tensor([ 0.2591,  0.2481,  0.5999,  0.5991], device='cuda:0')
tensor([ 0.3836,  0.8950,  0.7578,  0.6590], device='cuda:0')
tensor([ 0.6056,  0.8338,  0.7899,  0.8616], device='cuda:0')
tensor([

tensor([ 0.6354,  1.0000,  0.0783,  0.6685], device='cuda:0')
tensor([ 0.7207,  1.0000,  0.5953,  0.4436], device='cuda:0')
tensor([ 0.7481,  0.0290,  0.5551,  0.7503], device='cuda:0')
tensor([ 0.5483,  0.8473,  0.3644,  0.4576], device='cuda:0')
tensor([ 0.5302,  0.4615,  0.4029,  0.1008], device='cuda:0')
tensor([ 0.6150,  0.7199,  0.0828,  0.2047], device='cuda:0')
tensor([ 0.7258,  0.3934,  0.6336,  0.4417], device='cuda:0')
tensor([ 0.5496,  0.9044,  0.5279,  0.6301], device='cuda:0')
tensor([ 0.3025,  0.6024,  0.9006,  0.5941], device='cuda:0')
tensor([ 0.6266,  0.1609,  0.4879,  0.6717], device='cuda:0')
tensor([ 0.3904,  0.7204,  0.7835,  0.6886], device='cuda:0')
tensor([ 0.7013,  0.7950,  0.6461,  0.6751], device='cuda:0')
tensor([ 0.7984,  0.5640,  0.6345, -0.0462], device='cuda:0')
tensor([ 0.4614,  0.6734,  0.5654,  0.4538], device='cuda:0')
tensor([ 0.5674,  0.7920,  0.6435,  0.8475], device='cuda:0')
tensor([ 0.7245,  0.6180,  0.4288,  0.7388], device='cuda:0')
tensor([