# Setup environment

In [2]:
# %%bash
# git clone https://github.com/Paulescu/talking-machines.git
# mv talking-machines/* .
# rm -r talking-machines
# pip install -r requirements_py3.6.txt

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from pathlib import Path

if 'google.colab' in str(get_ipython()):
    print('Running in Colab')
    CHECKPOINT_DIR = Path('/content/drive/MyDrive/chatbot-course')

    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print('Running in local')
    CHECKPOINT_DIR = Path('./checkpoints')

Running in local


In [5]:
import torch
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print('GPU acceleration is available and will be used :-)')
else:
    DEVICE = torch.device("cpu")
    print('GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will '
          'take too long.')

GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will take too long.


# Download the data

In [75]:
!sh download_data.sh

--2021-01-27 21:16:43--  https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.251.30
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.251.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209850483 (200M) [application/json]
Saving to: ‘./data/personachat_self_original.json.1’


2021-01-27 21:17:19 (5.60 MB/s) - ‘./data/personachat_self_original.json.1’ saved [209850483/209850483]



In [7]:
%autoreload 2
from utils.data import generate_train_validation_test_files

generate_train_validation_test_files(autocorrect=False)

  0%|          | 0/17878 [00:00<?, ?it/s]

0 lines removed
Train set 131,438


  0%|          | 0/1000 [00:00<?, ?it/s]

0 lines removed
Test set 7,801


In [8]:
%autoreload 2
from pathlib import Path

from torchtext.data.utils import get_tokenizer
from utils.vocab import get_vocab, WordVocab
from utils.constants import *

vocab = WordVocab(Path(DATA_DIR) / 'train.csv')
vocab.load_glove_vectors()
vocab.save(Path(ARTIFACTS_DIR) / f'vocab_{vocab.size}')
print(f'Vocab size: {vocab.size:,}')

Vocab size: 18,693


In [9]:
vocab.freqs

Counter({'hi': 59614,
         ',': 426959,
         'how': 120286,
         'are': 159481,
         'you': 382513,
         'doing': 50777,
         '?': 360196,
         'i': 775562,
         "'": 221273,
         'm': 108792,
         'getting': 6956,
         'ready': 3826,
         'to': 229458,
         'do': 200340,
         'some': 21041,
         'cheetah': 44,
         'chasing': 197,
         'stay': 4548,
         'in': 97019,
         'shape': 648,
         '.': 1517386,
         'must': 5451,
         'be': 36621,
         'very': 19795,
         'fast': 2938,
         'hunting': 1341,
         'is': 160168,
         'one': 18898,
         'of': 85136,
         'my': 206584,
         'favorite': 29605,
         'hobbies': 9566,
         'am': 110176,
         '!': 173744,
         'for': 83374,
         'hobby': 3030,
         'like': 115944,
         'canning': 93,
         'or': 12589,
         'whittling': 102,
         'also': 13602,
         'remodel': 80,
         '

In [17]:
%autoreload 2
from utils.data import get_datasets

train_ds, val_ds, test_ds = get_datasets(vocab, train_size=10000, val_size=1000)

In [18]:
%autoreload 2
from utils.data import get_dataloaders

train_iter, val_iter, test_iter = get_dataloaders(
    train_ds, val_ds, test_ds,
    batch_size=2400,
    device=DEVICE
)

x = next(iter(train_iter))
print('Example \n-------')
print(x.src[0])
print(x.src[1])

Example 
-------
tensor([[  2,  35,   6,  ..., 138,   8,   3],
        [  2,  35,   6,  ...,  90,   4,   3],
        [  2,  35,  55,  ...,   3,   1,   1],
        ...,
        [  2,  86,  55,  ...,   1,   1,   1],
        [  2,  35,  55,  ...,   1,   1,   1],
        [  2,  46,   6,  ...,   1,   1,   1]])
tensor([132, 132, 130, 130, 128, 128, 124, 124, 122, 122, 123, 123, 122, 121,
        121, 119, 119, 117])


In [19]:
%autoreload 2
from model import Seq2seqRNN, count_parameters

hidden_dim = 256
n_layers = 3
n_directions_encoder = 2
model = Seq2seqRNN(vocab.size,
                   vocab.vectors_dim,
                   hidden_dim,
                   n_layers,
                   n_directions_encoder,
                   dropout=0.2,
                   pretrained_embeddings=vocab.vectors,
                   freeze_embeddings=False)

print(f'The model has {count_parameters(model):,} parameters')

The model has 11,979,769 parameters


In [20]:
%autoreload 2
from train import Seq2seqRNNTrainer

trainer = Seq2seqRNNTrainer(model,
                            train_iter,
                            val_iter,
                            learning_rate=3e-4,
                            pad_token_id=vocab.pad_token_id,
                            gradient_clip=99999,
                            teacher_forcing=0.5,
                            checkpoint_dir=CHECKPOINT_DIR)
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [135]:
trainer.load(run_id=trainer.run_id, epoch=1)

n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 5.7879, Val loss: 4.7615, Train ppl: 326.3, Val ppl: 116.9
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/1.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 5.1757, Val loss: 4.3103, Train ppl: 176.9, Val ppl: 74.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/2.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


In [136]:
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 4.9722, Val loss: 4.1971, Train ppl: 144.3, Val ppl: 66.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/3.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 4.8659, Val loss: 4.1958, Train ppl: 129.8, Val ppl: 66.4
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/4.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved
