# Setup environment

In [2]:
# %%bash
# git clone https://github.com/Paulescu/talking-machines.git
# mv talking-machines/* .
# rm -r talking-machines
# pip install -q -r requirements_py3.6.txt

In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
from pathlib import Path

if 'google.colab' in str(get_ipython()):
    print('Running in Colab')
    DATA_DIR = Path('/content/data')
    ARTIFACTS_DIR = Path('/content/drive/MyDrive/chatbot-course')
    VECTORS_CACHE_DIR = ARTIFACTS_DIR
#     CURRENT_DIR = Path('/content')
    
    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print('Running in local')
    DATA_DIR = Path('./data')
    ARTIFACTS_DIR = Path('./artifacts')
    VECTORS_CACHE_DIR = Path('./.vector_cache')
#     CURRENT_DIR = Path('./')

Running in local


In [6]:
import torch
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print('GPU acceleration is available and will be used :-)')
else:
    DEVICE = torch.device("cpu")
    print('GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will '
          'take too long.')

GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will take too long.


# Download the data

In [75]:
!sh download_data.sh

--2021-01-27 21:16:43--  https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.251.30
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.251.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209850483 (200M) [application/json]
Saving to: ‘./data/personachat_self_original.json.1’


2021-01-27 21:17:19 (5.60 MB/s) - ‘./data/personachat_self_original.json.1’ saved [209850483/209850483]



In [8]:
%autoreload 2
from utils.data import generate_train_validation_test_files

generate_train_validation_test_files(
    file=DATA_DIR/'personachat_self_original.json',
    autocorrect=False
)

  0%|          | 0/17878 [00:00<?, ?it/s]

0 lines removed
Train set 131,438


  0%|          | 0/1000 [00:00<?, ?it/s]

0 lines removed
Test set 7,801
Saved /Users/paulabartabajo/src/online-courses/advanced-nlp-chatbot/data/train.csv
Saved /Users/paulabartabajo/src/online-courses/advanced-nlp-chatbot/data/val.csv
Saved /Users/paulabartabajo/src/online-courses/advanced-nlp-chatbot/data/test.csv


In [9]:
%autoreload 2
from utils.data import get_datasets_and_vocab

train_dataset, validation_dataset, test_dataset, vocab = get_datasets_and_vocab(
    path_to_files=DATA_DIR,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    train_size=100,
    validation_size=100,
    use_glove_vectors=True,
    vectors_cache=VECTORS_CACHE_DIR,
)

from utils.serialize import save_vocab
save_vocab(vocab, Path(ARTIFACTS_DIR) / f'vocab_{len(vocab)}')
print(f'Vocab size: {len(vocab):,}')

Vocab size: 477


In [43]:
vocab.freqs

Counter({'hi': 66,
         ',': 222,
         'how': 90,
         'are': 146,
         'you': 321,
         'doing': 46,
         '?': 248,
         'i': 720,
         "'": 192,
         'm': 106,
         'getting': 12,
         'ready': 12,
         'to': 216,
         'do': 190,
         'some': 38,
         'cheetah': 7,
         'chasing': 7,
         'stay': 7,
         'in': 80,
         'shape': 7,
         '.': 1283,
         'must': 7,
         'be': 19,
         'very': 22,
         'fast': 7,
         'hunting': 14,
         'is': 112,
         'one': 12,
         'of': 77,
         'my': 171,
         'favorite': 44,
         'hobbies': 20,
         'am': 120,
         '!': 91,
         'for': 82,
         'hobby': 16,
         'like': 90,
         'canning': 10,
         'or': 29,
         'whittling': 6,
         'also': 29,
         'remodel': 6,
         'homes': 7,
         'when': 25,
         'not': 57,
         'out': 9,
         'bow': 6,
         'that': 98,
   

In [10]:
%autoreload 2
from utils.data import get_dataloaders

train_iter, validation_iter, test_iter = get_dataloaders(
    train_dataset, validation_dataset, test_dataset,
    batch_size=2400,
    device=DEVICE
)

x = next(iter(validation_iter))
print('Example \n-------')
print(x.src[0])
print(x.src[1])

Example 
-------
tensor([[ 2, 30, 12,  ..., 79,  4,  3],
        [ 2, 55,  9,  ...,  3,  1,  1],
        [ 2, 30,  9,  ..., 23,  3,  1],
        ...,
        [ 2,  3,  1,  ...,  1,  1,  1],
        [ 2,  3,  1,  ...,  1,  1,  1],
        [ 2,  3,  1,  ...,  1,  1,  1]])
tensor([62, 60, 61, 53, 51, 42, 40, 39, 34, 47, 42, 37, 40, 29, 22, 29, 23, 23,
        21, 21, 21, 21, 19, 18,  2, 16,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2])




In [11]:
%autoreload 2
from model import Seq2seqRNN, count_parameters

hidden_dim = 256
n_layers = 3
n_directions_encoder = 2
model = Seq2seqRNN(len(vocab),
                   vocab.vectors.shape[1],
                   hidden_dim,
                   n_layers,
                   n_directions_encoder,
                   dropout=0.2,
                   pretrained_embeddings=vocab.vectors,
                   freeze_embeddings=False,
#                    attention_type='dot'
                  )

print(f'The model has {count_parameters(model):,} parameters')

The model has 5,476,657 parameters


In [14]:
%autoreload 2
from train import Seq2seqRNNTrainer
from utils.data import PAD_TOKEN

trainer = Seq2seqRNNTrainer(model,
                            train_iter,
                            validation_iter,
                            learning_rate=3e-4,
                            pad_token_id=vocab.stoi[PAD_TOKEN],
                            gradient_clip=5,
                            teacher_forcing=0.5,
                            checkpoint_dir=ARTIFACTS_DIR)
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 000, Train loss: 6.1542, Val loss: 6.1182, Train ppl: 470.7, Val ppl: 454.1
artifacts/f04249a4-69a1-11eb-bc39-acbc32b70c09/0.ckpt was saved
artifacts/f04249a4-69a1-11eb-bc39-acbc32b70c09/params.json file was saved


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 001, Train loss: 6.1150, Val loss: 6.0492, Train ppl: 452.6, Val ppl: 423.8
artifacts/f04249a4-69a1-11eb-bc39-acbc32b70c09/1.ckpt was saved
artifacts/f04249a4-69a1-11eb-bc39-acbc32b70c09/params.json file was saved


In [135]:
trainer.load(run_id=trainer.run_id, epoch=1)

n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 5.7879, Val loss: 4.7615, Train ppl: 326.3, Val ppl: 116.9
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/1.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 5.1757, Val loss: 4.3103, Train ppl: 176.9, Val ppl: 74.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/2.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


In [136]:
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 4.9722, Val loss: 4.1971, Train ppl: 144.3, Val ppl: 66.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/3.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 4.8659, Val loss: 4.1958, Train ppl: 129.8, Val ppl: 66.4
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/4.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved
