In [8]:
from IPython.display import display, Markdown

# Config

In [96]:
# Execute Config.py inside the notebook namespace. Later cells use `config`,
# `torch` and `nn` without importing them in this notebook, so they are
# presumably defined/imported by this script — TODO confirm.
%run Config.py

# Data

## Tokenization

In [256]:
# Tokenize the source-side movie lines as English (-l en) with the OpenNMT-py
# Perl tokenizer: reads src_movie_lines.txt, writes src_movie_lines_tok.txt.
!!perl ../OpenNMT-py/tools/tokenizer.perl  -l en \
< ../data/"cornell movie-dialogs corpus"/src_movie_lines.txt \
> ../data/"cornell movie-dialogs corpus"/src_movie_lines_tok.txt

['Tokenizer Version 1.1', 'Language: en', 'Number of threads: 1']

In [257]:
# Tokenize the target-side movie lines as English (-l en) with the OpenNMT-py
# Perl tokenizer: reads tgt_movie_lines.txt, writes tgt_movie_lines_tok.txt.
!!perl ../OpenNMT-py/tools/tokenizer.perl  -l en \
< ../data/"cornell movie-dialogs corpus"/tgt_movie_lines.txt \
> ../data/"cornell movie-dialogs corpus"/tgt_movie_lines_tok.txt

['Tokenizer Version 1.1', 'Language: en', 'Number of threads: 1']

## Preprocessing

In [2]:
# Run OpenNMT-py preprocessing on the tokenized src/tgt files with default
# options, saving under the ../data/cornell_raw prefix.
# NOTE(review): the recorded output only shows the "please backup existing pt
# files" message, i.e. this run appears to have stopped at existing outputs.
!!python ../OpenNMT-py/preprocess.py --train_src "../data/cornell movie-dialogs corpus/src_movie_lines_tok.txt" --train_tgt "../data/cornell movie-dialogs corpus/tgt_movie_lines_tok.txt" --save_data ../data/cornell_raw

['Please backup existing pt files: ../data/cornell_raw.train*.pt, to avoid overwriting them!']

In [3]:
# Run OpenNMT-py preprocessing with a minimum word frequency of 30 and a
# sequence-length limit of 10 on both sides (per the flag names), saving
# under the ../data/cornell_raw_min_30_10_tok prefix.
# NOTE(review): the recorded output only shows the "please backup existing pt
# files" message, i.e. this run appears to have stopped at existing outputs.
!!python ../OpenNMT-py/preprocess.py --train_src "../data/cornell movie-dialogs corpus/src_movie_lines_tok.txt" --train_tgt "../data/cornell movie-dialogs corpus/tgt_movie_lines_tok.txt" --save_data ../data/cornell_raw_min_30_10_tok --src_words_min_frequency 30 --tgt_words_min_frequency 30 --src_seq_length 10 --tgt_seq_length 10

['Please backup existing pt files: ../data/cornell_raw_min_30_10_tok.train*.pt, to avoid overwriting them!']

In [46]:
# Run OpenNMT-py preprocessing with a minimum word frequency of 100 and a
# sequence-length limit of 10 on both sides (per the flag names), saving
# under the ../data/cornell_raw_min_100_tok prefix. This is the dataset the
# rest of the notebook loads (vocab.pt and train.0.pt with this prefix).
!!python ../OpenNMT-py/preprocess.py --train_src "../data/cornell movie-dialogs corpus/src_movie_lines_tok.txt" --train_tgt "../data/cornell movie-dialogs corpus/tgt_movie_lines_tok.txt" --save_data ../data/cornell_raw_min_100_tok --src_words_min_frequency 100 --tgt_words_min_frequency 100 --src_seq_length 10 --tgt_seq_length 10

['[2019-09-12 02:53:40,588 INFO] Extracting features...',
 '[2019-09-12 02:53:40,590 INFO]  * number of source features: 0.',
 '[2019-09-12 02:53:40,591 INFO]  * number of target features: 0.',
 '[2019-09-12 02:53:40,591 INFO] Building `Fields` object...',
 '[2019-09-12 02:53:40,591 INFO] Building & saving training data...',
 '[2019-09-12 02:53:40,591 INFO] Reading source and target files: ../data/cornell movie-dialogs corpus/src_movie_lines_tok.txt ../data/cornell movie-dialogs corpus/tgt_movie_lines_tok.txt.',
 '[2019-09-12 02:53:40,770 INFO] Building shard 0.',
 '[2019-09-12 02:53:48,692 INFO]  * saving 0th train data shard to ../data/cornell_raw_min_100_tok.train.0.pt.',
 '[2019-09-12 02:53:49,922 INFO]  * tgt vocab size: 379.',
 '[2019-09-12 02:53:49,948 INFO]  * src vocab size: 380.']

In [97]:
# Load the vocabulary/fields file whose prefix matches the --save_data value
# of the min-frequency-100 preprocessing command.
# NOTE(review): torch.load unpickles its input, so only load trusted files.
vocab_fields = torch.load("../data/cornell_raw_min_100_tok.vocab.pt")

In [98]:
# Unwrap the base text field for each side of the corpus.
src_text_field = vocab_fields["src"].base_field
tgt_text_field = vocab_fields["tgt"].base_field

# Vocabulary objects attached to those fields.
src_vocab, tgt_vocab = src_text_field.vocab, tgt_text_field.vocab

# Integer ids of the padding token on each side. `stoi` maps token strings to
# numerical identifiers. Values noted by the original author for the source side:
#   tokens: ['<unk>', '<blank>', 'I', 'you', 'the', 'to', 'a', 'of', 'and', 'You']
#   src_text_field.pad_token: '<blank>'
src_padding = src_vocab.stoi[src_text_field.pad_token]
tgt_padding = tgt_vocab.stoi[tgt_text_field.pad_token]

In [99]:
# Record vocabulary sizes and padding ids on the shared config object.
config.src_vocab_size, config.tgt_vocab_size = len(src_vocab), len(tgt_vocab)
config.src_padding, config.tgt_padding = src_padding, tgt_padding

In [100]:
# Ids of the unknown-word token on both sides ...
config.src_unk, config.tgt_unk = (
    src_vocab.stoi[src_text_field.unk_token],
    tgt_vocab.stoi[tgt_text_field.unk_token],
)
# ... and of the target-side begin/end-of-sequence tokens.
config.tgt_bos, config.tgt_eos = (
    tgt_vocab.stoi[tgt_text_field.init_token],
    tgt_vocab.stoi[tgt_text_field.eos_token],
)

In [101]:
# Keep references to the full vocabulary objects on the config as well, so
# code that only receives `config` can reach them.
config.src_vocab = src_vocab
config.tgt_vocab = tgt_vocab

## Data Loading

In [102]:
import onmt
from itertools import chain

# Training shard whose prefix matches the min-frequency-100 preprocessing run.
train_data_file = "../data/cornell_raw_min_100_tok.train.0.pt"
# Iterate over that single shard on CPU. batch_size=1 is requested, so each
# yielded batch should hold one (src, tgt) example; repeat=False is passed so
# the iterator is presumably finite and can be drained with list() — confirm
# against DatasetLazyIter.
train_iter = onmt.inputters.inputter.DatasetLazyIter(dataset_paths=[train_data_file],
                                                     fields=vocab_fields,
                                                     batch_size=1,
                                                     batch_size_multiple=1,
                                                     batch_size_fn=None,
                                                     device="cpu",
                                                     is_train=True,
                                                     repeat=False,
                                                     pool_factor=8192)

# Materialise every batch produced by the iterator.
data = list(train_iter)

# Keep only the batches in which neither the source nor the target contains
# the <unk> token id.
filtered_data = [
    batch
    for batch in data
    if not (batch.src[0].squeeze() == config.src_unk).any()
    and not (batch.tgt.squeeze() == config.tgt_unk).any()
]

INFO:root:Loading dataset from ../data/cornell_raw_min_100_tok.train.0.pt
INFO:root:number of examples: 72114


In [103]:
# Number of <unk>-free examples kept by the filtering step; shown as the
# "records" count in the stats below.
config.PRELOADING_SIZE = len(filtered_data)

In [104]:
# Render the dataset statistics as Markdown headings, one display() per line.
for md_line in (
    f'#### Stats',
    f'##### {config.PRELOADING_SIZE:,} records',
    f'##### {config.src_vocab_size:,} src vocabulary size',
    f'##### {config.tgt_vocab_size:,} tgt vocabulary size',
):
    display(Markdown(md_line))

#### Stats

##### 8,216 records

##### 380 src vocabulary size

##### 379 tgt vocabulary size

# seq2seq-DQN

In [105]:
# Execute modules/NoisyLinear.py in the notebook namespace (presumably
# defines the NoisyLinear layer used by the DQN module — confirm).
%run modules/NoisyLinear.py

In [106]:
# Execute modules/DQN.py in the notebook namespace; the `DQN` name passed to
# Model(...) below is presumably defined here.
%run modules/DQN.py

In [107]:
# Execute modules/Model.py in the notebook namespace; the `Model` class
# instantiated below is presumably defined here.
%run modules/Model.py

In [108]:
# Build the model wrapper from the shared config and the DQN network class.
# Later cells use its `current_model`, `replay_memory` and `sample_buffer`
# attributes.
model = Model(config, DQN)

In [109]:
# Display the module tree (repr) of the current network.
model.current_model

DQN(
  (encoder_embeddings): Embeddings(
    (make_embedding): Sequential(
      (emb_luts): Elementwise(
        (0): Embedding(380, 100, padding_idx=1)
      )
    )
  )
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(380, 100, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(100, 250, bidirectional=True)
  )
  (decoder_embeddings): Embeddings(
    (make_embedding): Sequential(
      (emb_luts): Elementwise(
        (0): Embedding(379, 100, padding_idx=1)
      )
    )
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(379, 100, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.0)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.0)
      (layers): ModuleList(
        (0): LSTMCell(600, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in)

In [110]:
# Execute modules/MSELoss.py in the notebook namespace; the `MSELoss` wrapper
# used below is presumably defined here.
%run modules/MSELoss.py

In [111]:
# Build the training criterion. Despite the wrapper's name, the element-wise
# criterion passed in is SmoothL1Loss (the plain nn.MSELoss alternative is
# left commented out). The current model's generator is handed to the
# wrapper as the second argument.
loss = MSELoss(
    #nn.MSELoss(reduction="none"),
    nn.SmoothL1Loss(reduction="none"),
    model.current_model.generator
)

In [112]:
# Execute modules/Reward.py in the notebook namespace; the `Reward` class
# instantiated below is presumably defined here.
%run modules/Reward.py

In [113]:
# Use BLEU as the only reward component, with weight 1.
config.rewards, config.rewards_weights = ['BLEU'], [1]

# Reward object configured from the settings above.
reward = Reward(config)

In [114]:
# Plain SGD over the current model's parameters, wrapped in OpenNMT's
# Optimizer helper. The same learning rate is given to both objects.
# max_grad_norm=2 is forwarded to the wrapper (presumably gradient-norm
# clipping — confirm against onmt.utils.optimizers.Optimizer).
lr = 1
torch_optimizer = torch.optim.SGD(model.current_model.parameters(), lr=lr)
optim = onmt.utils.optimizers.Optimizer(torch_optimizer, learning_rate=lr, max_grad_norm=2)

In [115]:
#report_manager = onmt.utils.ReportMgr(report_every=1, start_time=None, tensorboard_writer=None)

In [116]:
# Seed both buffers from the filtered corpus. The emptiness check means the
# preload only happens while the replay memory holds no entries.
if len(model.replay_memory) == 0:
    for example in filtered_data:
        # Drop axis 1 from the source and target id tensors once and reuse
        # the results for both buffers.
        src_ids = example.src[0].squeeze(1)
        tgt_ids = example.tgt.squeeze(1)
        # Replay memory entries get the value 1, sample buffer entries None.
        model.replay_memory.preload(src_ids, tgt_ids, 1)
        model.sample_buffer.preload(src_ids, tgt_ids, None)

In [117]:
# Execute modules/QLearning.py in the notebook namespace; the `QLearning`
# trainer instantiated below is presumably defined here.
%run modules/QLearning.py

In [119]:
# Assemble the trainer from the pieces built above. The same loss object is
# used for both training and validation.
# NOTE(review): the meaning of gpu_verbose_level=100 is defined by QLearning,
# which is not visible here — confirm.
trainer = QLearning(config,
                    model,
                    reward=reward,
                    train_loss=loss,
                    valid_loss=loss,
                    optim=optim,
                    gpu_verbose_level=100)
                    #shard_size = 0

In [120]:
#for i in model.target_model.parameters():
#    print(i.abs().sum())

In [121]:
#for i, t in enumerate(model.replay_memory._storage):
#    if t[1].size(0) > 10:
#        print(i, t[1].size(0), t[1])
    
    #if t[1][-1].item() == 96:
    #    print(i, t[1][-1])
    #    print(t[1].size())
    #print((t[1] == 0).sum())
    #print(t[1].size(0))
    
#for x in model.replay_memory._storage:
#    #print([t for t in x[1]])
#    print(' '.join([tgt_vocab.itos[t.item()] for t in x[1]]))

def _ids_to_text(ids, vocab):
    """Return the tokens for the id tensor `ids` joined by single spaces.

    `ids` is flattened with reshape(-1) rather than squeeze(): squeeze() on a
    single-token sequence yields a 0-d tensor whose tolist() is a bare int,
    which cannot be iterated. reshape(-1) always yields a 1-D tensor.
    """
    return ' '.join(vocab.itos[token] for token in ids.reshape(-1).tolist())


# Print ten filtered examples as "source  ||  target" text for a sanity check.
for x in filtered_data[5000:5010]:
    print(_ids_to_text(x.src[0], src_vocab) + '  ||  ' + _ids_to_text(x.tgt, tgt_vocab))

Yes sir .  ||  <s> Are you sure ? </s>
I can &apos;t tell you .  ||  <s> Where we going ? </s>
So you did fuck up .  ||  <s> Yes . </s>
What do you mean ?  ||  <s> What &apos;s it like where I &apos;m going ? </s>
No .  ||  <s> But why not ? </s>
Do you believe in it ?  ||  <s> What ? </s>
Well ?  ||  <s> Well what ? </s>
Hello .  ||  <s> Hi . </s>
Where is she ?  ||  <s> With her mother . </s>
And now ?  ||  <s> And now I want you . </s>


In [None]:
# Run training with train_steps=100000. valid_steps=200 presumably sets the
# validation interval — confirm against QLearning.train.
result = trainer.train(train_steps=100000, valid_steps=200)

In [76]:
# Inspect one stored replay entry via the private `_storage` list. The
# recorded output is a 3-tuple: two id tensors and a scalar tensor.
model.replay_memory._storage[30000]

(tensor([[ 25],
         [  7],
         [ 98],
         [515],
         [912],
         [ 73],
         [ 71],
         [  2]]), tensor([[419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419],
         [419]]), tensor(0.0765))

In [95]:
#for x in model.replay_memory._storage[190000:190020]:
#    print(' '.join([src_vocab.itos[token] for token in x[0].squeeze().tolist()]) + '  ||  ' + ' '.join([tgt_vocab.itos[token] for token in x[1].squeeze().tolist()]))

In [124]:
# For consecutive windows of 1000 replay entries, print the window bounds and
# the sum of each entry's third field.
chunk = 1000
for start in range(0, len(model.replay_memory), chunk):
    window = model.replay_memory._storage[start:start + chunk]
    print(start, start + chunk, sum(entry[2] for entry in window))

0 1000 1000
1000 2000 1000
2000 3000 1000
3000 4000 1000
4000 5000 1000
5000 6000 1000
6000 7000 1000
7000 8000 1000
8000 9000 tensor(269.1369)
9000 10000 tensor(69.9702)
10000 11000 tensor(68.1051)
11000 12000 tensor(70.0445)
12000 13000 tensor(69.2290)
13000 14000 tensor(67.6353)
14000 15000 tensor(69.1051)
15000 16000 tensor(69.9398)
16000 17000 tensor(67.1464)
17000 18000 tensor(40.8107)


In [125]:
# For consecutive windows of 1000 indices, print the window bounds and the
# sum of the replay memory's `_it_sum` values at those indices.
chunk = 1000
for start in range(0, len(model.replay_memory), chunk):
    stop = start + chunk
    print(start, stop, sum(model.replay_memory._it_sum[idx] for idx in range(start, stop)))

0 1000 2865.5386549279237
1000 2000 2854.8148937806864
2000 3000 2921.720928437504
3000 4000 2842.947141045659
4000 5000 2923.080664388974
5000 6000 2846.373647637572
6000 7000 2857.5506537195292
7000 8000 2906.641666827157
8000 9000 5019.612149560479
9000 10000 6265.274101594786
10000 11000 7026.173323113606
11000 12000 7777.136460404011
12000 13000 8890.74375323773
13000 14000 9831.411941575092
14000 15000 11290.923115782087
15000 16000 12465.047093196603
16000 17000 13669.182292303707
17000 18000 8865.997057539264


In [126]:
# Run inference with the current model on replay entries 5..19 and print the
# raw predicted token ids for inspection.
for x in model.replay_memory._storage[5:20]:
    #print("SRC", x[0])
    #print("TGT", x[1].unsqueeze(0))
    # x[0] (the stored source ids) gets an extra axis and is passed together
    # with its length. NOTE(review): the meaning of the trailing `1` argument
    # is defined by DQN.infer, which is not visible here — confirm.
    text = model.current_model.infer(x[0].unsqueeze(1), torch.ShortTensor([x[0].size(0)]), 1)
    print(text)
    # NOTE(review): this overwrites the prediction with a fixed placeholder;
    # within this cell only the commented-out debugging lines below read it.
    text = [[torch.LongTensor([3])]]
    #print(type(text), type(text[0]), type(text[0][0]))
    #print("Prediction:",  torch.cat((torch.LongTensor([config.tgt_bos]), text[0][0])))
    #print(' '.join([src_vocab.itos[token] for token in x[0].squeeze().tolist()]) + '  ||  ' + ' '.join([tgt_vocab.itos[token] for token in text[0][0].tolist()]))
    #print(reward(x[0], [[torch.cat((torch.LongTensor([config.tgt_bos]), text[0][0]))]], x[1].unsqueeze(0)))

[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293, 293, 293, 293])]]
[[tensor([293, 293, 293, 293, 293, 293, 293, 293