# Setup environment

In [None]:
# %%bash
# git clone https://github.com/Paulescu/talking-machines.git
# mv talking-machines/* .
# rm -r talking-machines
# pip install -r requirements_py3.6.txt

In [112]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
from pathlib import Path

if 'google.colab' in str(get_ipython()):
    print('Runnin in Colab')
    CHECKPOINT_DIR = Path('/content/drive/MyDrive/chatbot-course')
else:
    print('Running in local')
    CHECKPOINT_DIR = Path('./checkpoints')

Running in local


In [121]:
import torch
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print('GPU acceleration is available and will be used :-)')
else:
    DEVICE = torch.device("cpu")
    print('GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will '
          'take too long.')

GPU is not available. If you are using Google Colab, change the runtime to GPU, otherwise training will take too long.


# Download the data

In [75]:
!sh download_data.sh

--2021-01-27 21:16:43--  https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.251.30
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.251.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209850483 (200M) [application/json]
Saving to: ‘./data/personachat_self_original.json.1’


2021-01-27 21:17:19 (5.60 MB/s) - ‘./data/personachat_self_original.json.1’ saved [209850483/209850483]



In [122]:
%autoreload 2
from data_util import generate_train_validation_test_files

train_csv, val_csv, test_csv = generate_train_validation_test_files(autocorrect=True)

  0%|          | 0/17878 [00:00<?, ?it/s]

0 lines removed
Train set 122,499


  0%|          | 0/1000 [00:00<?, ?it/s]

0 lines removed
Test set 7,801


In [131]:
%autoreload 2
from data_util import TrainingDataWrapper, save_vocab

# Dataset objects
dw = TrainingDataWrapper()
train_ds, val_ds, test_ds = dw.get_datasets(
    train_csv,
    val_csv,
    test_csv,

    # speed up development iterations
    train_size=150, # 132000,
    val_size=7, # 7801,
    
    use_glove=True
)
print(f'Train set size: {len(train_ds):,}')
print(f'Validation set size: {len(val_ds):,}')
print('Vocab size: ', dw.vocab_size)

# Save vocab to disk
save_vocab(dw.vocab, CHECKPOINT_DIR / f'vocab_{dw.vocab_size}')

Train set size: 150
Validation set size: 7
Vocab size:  694


In [132]:
# DataLoader objects
train_iter, val_iter, test_iter = dw.get_dataloaders(
    train_ds, val_ds, test_ds,
    batch_size=2400,
    device=DEVICE
)

x = next(iter(train_iter))
print('Example \n-------')
print(x.src[0])
print(x.src[1])

Example 
-------
tensor([[ 2, 50, 10,  ...,  0, 85,  3],
        [ 2, 38, 10,  ...,  1,  1,  1],
        [ 2, 50, 10,  ...,  1,  1,  1],
        [ 2, 84, 10,  ...,  1,  1,  1],
        [ 2,  4, 18,  ...,  1,  1,  1],
        [ 2, 84, 22,  ...,  1,  1,  1]])
tensor([210, 194, 179, 180, 177, 171])


In [133]:
%autoreload 2
from model import Seq2seqRNN, count_parameters

hidden_dim = 256
n_layers = 3
n_directions_encoder = 2
model = Seq2seqRNN(dw.vocab_size,
                   dw.embedding_dim,
                   hidden_dim,
                   n_layers,
                   n_directions_encoder,
                   dropout=0.2,
                   pretrained_embeddings=dw.embeddings,
                   freeze_embeddings=False)

print(f'The model has {count_parameters(model):,} parameters')

The model has 5,554,126 parameters


In [134]:
%autoreload 2
from train import Seq2seqRNNTrainer

trainer = Seq2seqRNNTrainer(model,
                            train_iter,
                            val_iter,
                            learning_rate=3e-4,
                            pad_token_id=dw.pad_token_id,
                            gradient_clip=99999,
                            teacher_forcing=0.5,
                            checkpoint_dir=CHECKPOINT_DIR)
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 6.5203, Val loss: 6.4679, Train ppl: 678.8, Val ppl: 644.1
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/0.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 6.4153, Val loss: 6.0018, Train ppl: 611.1, Val ppl: 404.1
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/1.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


In [135]:
trainer.load(run_id=trainer.run_id, epoch=1)

n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 5.7879, Val loss: 4.7615, Train ppl: 326.3, Val ppl: 116.9
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/1.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 5.1757, Val loss: 4.3103, Train ppl: 176.9, Val ppl: 74.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/2.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


In [136]:
n_epochs = 2
trainer.train_test_loop(n_epochs)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 000, Train loss: 4.9722, Val loss: 4.1971, Train ppl: 144.3, Val ppl: 66.5
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/3.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 001, Train loss: 4.8659, Val loss: 4.1958, Train ppl: 129.8, Val ppl: 66.4
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/4.ckpt was saved
checkpoints/0193b9c0-6544-11eb-b0e5-acbc32b70c09/params.json file was saved
