# Conversational agent based on a seq2seq model with attention

## Setup environment if running in Google Colab

In [1]:
# %%bash
# git clone https://github.com/Paulescu/talking-machines.git
# mv talking-machines/* .
# rm -r talking-machines
# pip install -q -r requirements_py3.6.txt

In [2]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'svg'

## Set environment variables

In [3]:
from pathlib import Path
import sys

if 'google.colab' in str(get_ipython()):
    print('Running in Colab')
    
    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    STORAGE_DIR = Path('/content/drive/MyDrive/chatbot-course')
    ROOT_DIR = Path('/content')
    
else:
    print('Running in local')
    STORAGE_DIR = Path('/Users/paulabartabajo/src/online-courses/advanced-nlp-chatbot')
    ROOT_DIR = STORAGE_DIR

DATA_DIR = STORAGE_DIR / 'data'
ARTIFACTS_DIR = STORAGE_DIR / 'artifacts'
VECTORS_CACHE_DIR = STORAGE_DIR / 'vector_cache'

sys.path.insert(0, str(ROOT_DIR))

Running in local


In [4]:
import torch

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print('GPU acceleration is available and will be used :-)')
else:
    DEVICE = torch.device("cpu")
    print('GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will '
          'take too long.')

GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will take too long.


## Download the data

In [5]:
# !wget http://parl.ai/downloads/personachat/personachat.tgz -P $DATA_DIR
# !tar -tvf ${DATA_DIR}/personachat.tgz -C ${DATA_DIR}

In [6]:
# DATA_DIR = DATA_DIR / 'personachat'

------

## Generate train, val, test files

In [7]:
# %autoreload 2
# from src.data import generate_file_with_training_examples
# from src.util import pprint

# # train set
# generate_file_with_training_examples(
#     input_file=DATA_DIR/'train_both_original.txt',
#     output_file=DATA_DIR/'train_both_original.json',   
#     n_past_utterances=1,
#     include_persona=True,
#     include_partner=True,
#     autocorrect=False,
# )

# # validation set
# generate_file_with_training_examples(
#     input_file=DATA_DIR/'valid_both_original.txt',
#     output_file=DATA_DIR/'valid_both_original.json',   
#     n_past_utterances=1,
#     include_persona=True,
#     include_partner=True,
#     autocorrect=False,
# )

# # test set
# generate_file_with_training_examples(
#     input_file=DATA_DIR/'test_both_original.txt',
#     output_file=DATA_DIR/'test_both_original.json',   
#     n_past_utterances=1,
#     include_persona=True,
#     include_partner=True,
#     autocorrect=False,
# )

## Generate the vocabulary

I use all data available in the train set to build the vocab.

I want to keep the vocab fixed while I play with different train sub-datasets

In [8]:
# load sentence processor, to speed up development iterations
from src.util import load_vocab
sentence_processor = load_vocab(ARTIFACTS_DIR/f'sentence_processor_18743.pkl')
vocab = sentence_processor.vocab
vocab.itos

['<unk>',
 '<PAD>',
 '<BOS>',
 '<EOS>',
 '.',
 'i',
 'a',
 'my',
 'to',
 'am',
 'is',
 'the',
 'have',
 ',',
 'like',
 'in',
 'you',
 '?',
 'and',
 'do',
 'love',
 'of',
 'favorite',
 'that',
 '!',
 'for',
 'work',
 'not',
 'are',
 'at',
 'with',
 'it',
 'on',
 'me',
 'what',
 'an',
 'live',
 'but',
 'was',
 'be',
 'enjoy',
 'go',
 'just',
 'as',
 'how',
 'good',
 'so',
 'time',
 'school',
 'music',
 'play',
 'from',
 'm',
 'about',
 'can',
 'when',
 'too',
 'food',
 'up',
 'eat',
 'all',
 'one',
 'own',
 'out',
 'very',
 'want',
 'would',
 'lot',
 'your',
 'two',
 'get',
 'well',
 'day',
 'job',
 'been',
 'really',
 'no',
 'years',
 'college',
 'great',
 'family',
 'band',
 'oh',
 'dogs',
 'read',
 'color',
 'make',
 'mother',
 'we',
 'kids',
 'parents',
 'yes',
 'going',
 'drive',
 'nice',
 'old',
 'they',
 'home',
 'dog',
 'new',
 't',
 'them',
 'cool',
 'will',
 'never',
 'hair',
 'any',
 'mom',
 'friends',
 'doing',
 'married',
 'there',
 'people',
 'much',
 'only',
 'its',
 'year

In [9]:
# %autoreload 2
# from src.data import get_sentence_processor

# sentence_processor = get_sentence_processor(
#     train_file=DATA_DIR/'train_both_original.json',
# #     train_file=DATA_DIR/'aux.json',
#     min_word_freq=1,
#     max_vocab_size=99999,
#     use_persona_info=True,
#     glove_vectors='glove.6B.300d',
#     vectors_cache=VECTORS_CACHE_DIR,
# )
# print(f'{len(sentence_processor.vocab):,} words in the vocabulary.')

# from src.util import save_vocab
# vocab = sentence_processor.vocab
# save_vocab(
#     sentence_processor.vocab,
#     ARTIFACTS_DIR/f'vocab_{len(vocab)}.pkl'
# )
# print('Vocabulary saved.')

# save_vocab(
#     sentence_processor,
#     ARTIFACTS_DIR/f'sentence_processor_{len(vocab)}.pkl'
# )
# print('Saved sentence processor.')

--------

## Quick exploratory data analysis

In [10]:
# %autoreload 2
# from src.util import plot_sentence_lengths

# plot_sentence_lengths(
#     train_file=DATA_DIR/'train_none_original.csv',
#     test_file=DATA_DIR/'test_none_original.csv',
# )

In [11]:
print(vocab.stoi['hdfsi'])
print(vocab.stoi['<PAD>'])
print(vocab.stoi['<BOS>'])
print(vocab.stoi['<EOS>'])
print(vocab.stoi['hi'])
# print(vocab.vectors[0])
# print(vocab.vectors[1])
# print(vocab.vectors[2])
# print(vocab.vectors[3])
# print(vocab.vectors[vocab.stoi['hi']])

0
1
2
3
141


## PyTorch datasets

In [12]:
%autoreload 2
from src.data import get_datasets

train_ds, val_ds, test_ds = get_datasets(
    path=DATA_DIR,
    train='train_both_original.json',
    val='valid_both_original.json',
    test='test_both_original.json',
    sentence_processor=sentence_processor,
    train_size=100,
    use_persona_info=True,
)

print(f'{len(train_ds):,}')
print(f'{len(val_ds):,}')
print(f'{len(test_ds):,}')



100
14,602
14,056


## PyTorch DataLoaders

In [13]:
%autoreload 2
from src.data import get_dataloaders

train_iter, validation_iter, test_iter = get_dataloaders(
    train_ds, val_ds, test_ds,
    n_examples_per_batch=64,
#     n_tokens_per_batch=2400,
    device=DEVICE
)

x = next(iter(train_iter))
print('Example \n-------')
print(x.src[0].shape)
print(x.src[0])
# print(x.src[1])

Example 
-------
torch.Size([64, 17])
tensor([[   2,   16,  439,  ...,    4,    3,    1],
        [   2,    5,    9,  ..., 1871,    4,    3],
        [   2,    5,   26,  ...,    4,    3,    1],
        ...,
        [   2,   33,  117,  ...,    1,    1,    1],
        [   2, 1981,    4,  ...,    1,    1,    1],
        [   2,    5,  469,  ...,    1,    1,    1]])




## Check the data that enters into the model

In [14]:
# n_examples = 5
# for batch in train_iter:
#     try:
#         persona, _ = batch.persona
#     except: 
#         pass
#     src, _ = batch.src
#     tgt, _ = batch.tgt
    
#     for i in range(n_examples):
#         print(f'\nExample {i}: ')
        
#         # persona
#         try:
#             ids = persona[i, :].cpu().detach().numpy()
#             words = [vocab.itos[x] for x in ids]
#             print('persona: ', ' '.join(words))
#         except:
#             pass
        
#         # source text
#         ids = src[i, :].cpu().detach().numpy()
#         words = [vocab.itos[x] for x in ids]
#         print('SRC: ', ' '.join(words))

#         # actual response
#         ids = tgt[i, :].cpu().detach().numpy()
#         words = [vocab.itos[x] for x in ids]
#         print('TGT: ', ' '.join(words))
        
#     break

## Model

In [20]:
%autoreload 2
from src.transformer.model import Transformer, count_parameters
from src.data import PAD_TOKEN

vocab = sentence_processor.vocab

# For the momemnt, no GloVe vectors.

model = Transformer(
    model_dimension=512,
    
    vocab_size=len(vocab),
    
    number_of_heads=8,
    number_of_layers=6,
    dropout_probability=0.2,
    padding_idx=vocab.stoi[PAD_TOKEN],
    log_attention_weights=False,
).to(DEVICE)

print(f'The model has {count_parameters(model):,} parameters')

The model has 72,948,535 parameters


## Train

In [22]:
%autoreload 2
from src.transformer.train import TransformerTrainer

trainer = TransformerTrainer(
    model,
    train_iter,
    validation_iter,
    learning_rate=1e-4,
    vocab=vocab,
    pad_token_id=vocab.stoi[PAD_TOKEN],
    checkpoint_dir=ARTIFACTS_DIR,
    validation_freq=1,
    validation_n_examples=10,
    loss_fn='cross-entropy',
    debug=False,
    use_label_smoothing=False,
)

epochs = 10
trainer.train_test_loop(epochs)

Epoch: 000


[autoreload of src.transformer.train failed: Traceback (most recent call last):
  File "/Users/paulabartabajo/miniconda3/envs/chatbot/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Users/paulabartabajo/miniconda3/envs/chatbot/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/Users/paulabartabajo/miniconda3/envs/chatbot/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Users/paulabartabajo/miniconda3/envs/chatbot/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/Users/paulabartabajo/src/online-courses/advanced-nlp-chatbot/src/tra

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/14602 [00:00<?, ?it/s]


Example 0: 
SRC:  <BOS> i live in colorado . i am a mechanical engineer . my favorite activity is rock climbing . i have a black lab named trooper . i like to go hiking in the spring . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> trooper is my sons name <EOS>
TGT:  <BOS> awesome . how old is he ?
MODEL:  <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>

Example 1: 
SRC:  <BOS> i am an economics major . i am married with two kids . i just bought a house recently . i see the rolling stones in concert every year . i eat a raw diet . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> its called live action roleplaying <EOS>
TGT:  <BOS> my kids might like that game .
MODEL:  <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>

Example 2: 
SRC:  <BOS> i dream of moving to the city . my family has raised horses for a generations . i was raised on a 

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/14602 [00:00<?, ?it/s]


Example 0: 
SRC:  <BOS> i live in colorado . i am a mechanical engineer . my favorite activity is rock climbing . i have a black lab named trooper . i like to go hiking in the spring . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> trooper is my sons name <EOS>
TGT:  <BOS> awesome . how old is he ?
MODEL:  i i i i i <EOS> <EOS> <EOS>

Example 1: 
SRC:  <BOS> i am an economics major . i am married with two kids . i just bought a house recently . i see the rolling stones in concert every year . i eat a raw diet . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> its called live action roleplaying <EOS>
TGT:  <BOS> my kids might like that game .
MODEL:  i i i i i i <EOS> <EOS>

Example 2: 
SRC:  <BOS> i dream of moving to the city . my family has raised horses for a generations . i was raised on a horse farm . i am from a small town . i want

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/14602 [00:00<?, ?it/s]


Example 0: 
SRC:  <BOS> i live in colorado . i am a mechanical engineer . my favorite activity is rock climbing . i have a black lab named trooper . i like to go hiking in the spring . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> trooper is my sons name <EOS>
TGT:  <BOS> awesome . how old is he ?
MODEL:  i i i i i <EOS> <EOS> <EOS>

Example 1: 
SRC:  <BOS> i am an economics major . i am married with two kids . i just bought a house recently . i see the rolling stones in concert every year . i eat a raw diet . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> its called live action roleplaying <EOS>
TGT:  <BOS> my kids might like that game .
MODEL:  i i i i i i <EOS> <EOS>

Example 2: 
SRC:  <BOS> i dream of moving to the city . my family has raised horses for a generations . i was raised on a horse farm . i am from a small town . i want

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/14602 [00:00<?, ?it/s]


Example 0: 
SRC:  <BOS> i live in colorado . i am a mechanical engineer . my favorite activity is rock climbing . i have a black lab named trooper . i like to go hiking in the spring . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> trooper is my sons name <EOS>
TGT:  <BOS> awesome . how old is he ?
MODEL:  i i i i <EOS> <EOS> <EOS> <EOS>

Example 1: 
SRC:  <BOS> i am an economics major . i am married with two kids . i just bought a house recently . i see the rolling stones in concert every year . i eat a raw diet . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <BOS> its called live action roleplaying <EOS>
TGT:  <BOS> my kids might like that game .
MODEL:  i i i i <EOS> <EOS> <EOS> <EOS>

Example 2: 
SRC:  <BOS> i dream of moving to the city . my family has raised horses for a generations . i was raised on a horse farm . i am from a small t

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

## In case you want to resume training from a saved checkpoint

In [127]:
run_id = '116f63f4-6ab8-11eb-a582-acbc32b70c09'
trainer.load(run_id=run_id, epoch=9)

n_epochs = 10
trainer.train_test_loop(n_epochs)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 000, Train loss: 4.6936, Val loss: 4.5945, Train ppl: 109.3, Val ppl: 98.9
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/9.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 001, Train loss: 4.6872, Val loss: 4.5905, Train ppl: 108.6, Val ppl: 98.5
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/10.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 002, Train loss: 4.6767, Val loss: 4.5925, Train ppl: 107.4, Val ppl: 98.7


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 003, Train loss: 4.6707, Val loss: 4.5887, Train ppl: 106.8, Val ppl: 98.4
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/12.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 004, Train loss: 4.6637, Val loss: 4.5880, Train ppl: 106.0, Val ppl: 98.3
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/13.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 005, Train loss: 4.6581, Val loss: 4.5776, Train ppl: 105.4, Val ppl: 97.3
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/14.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 006, Train loss: 4.6563, Val loss: 4.5849, Train ppl: 105.2, Val ppl: 98.0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 007, Train loss: 4.6507, Val loss: 4.5755, Train ppl: 104.7, Val ppl: 97.1
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/16.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 008, Train loss: 4.6474, Val loss: 4.5824, Train ppl: 104.3, Val ppl: 97.8


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 009, Train loss: 4.6414, Val loss: 4.5735, Train ppl: 103.7, Val ppl: 96.9
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/18.ckpt was saved
artifacts/116f63f4-6ab8-11eb-a582-acbc32b70c09/params.json file was saved
