# Notebook regarding measurements with RNNs

## Saving the dataset

In [8]:
import datautils
import torch
import torchtext
from torch.utils.data import DataLoader
import pickle
import torch.nn as nn
import RNNutils
import time
import pandas as pd

In [9]:
# Raw training data (TSV) handed to the datautils split helpers.
DATASET = "./Dataset/power-gb-train.tsv"
# Directory the processed dataset objects (*.pt) are saved to and loaded from.
DATA_DIR = "./Dataset/"
# Directory for the result CSVs; the hold-out training cell also writes its
# model checkpoints here.
RES_DIR = "./Results/"
# Directory the pickled vocabularies are saved to and loaded from.
EMBED_DIR = "./Embeddings/"
# NOTE(review): not referenced by any cell visible in this notebook
# (checkpoints currently go to RES_DIR) -- confirm whether that is intended.
CHECK_DIR = "./Checkpoints/"
# Device returned by the project helper; passed to the RNN constructor.
DEVICE = datautils.get_device()

Load the dataset (with k-fold)

In [3]:
# Split the raw data into folds; the last two return values are unused here.
X_train, y_train, X_val, y_val, _, _ = datautils.split_kfold_dataset(DATASET)

# spaCy-backed tokenizer shared by all folds
tokenizer = torchtext.data.utils.get_tokenizer(
    "spacy", language="en_core_web_sm"
)

# One vocabulary per fold, built from that fold's training split only.
en_vocab_list = []
for idx, training_texts in enumerate(X_train):
    print("Building vocabulary...", idx)
    vocab = datautils.build_vocab(training_texts, tokenizer)
    unk_index = vocab["<unk>"]
    # out-of-vocabulary lookups resolve to the <unk> entry
    vocab.set_default_index(unk_index)
    en_vocab_list.append(vocab)

# Run datautils.data_process on both splits of every fold, always with the
# vocabulary built on that fold's training split. The fold lists are updated
# in place, slot by slot.
for idx, (raw_train, raw_val) in enumerate(zip(X_train, X_val)):
    vocab = en_vocab_list[idx]

    print(f"Processing fold {idx+1}...")
    processed_train = datautils.data_process(raw_train, vocab, tokenizer)
    X_train[idx] = processed_train

    print(f"Processing fold {idx+1}... validation set")
    processed_val = datautils.data_process(raw_val, vocab, tokenizer)
    X_val[idx] = processed_val

Building vocabulary... 0
Building vocabulary... 1
Building vocabulary... 2
Building vocabulary... 3
Building vocabulary... 4
Processing fold 1...
Processing fold 1... validation set
Processing fold 2...
Processing fold 2... validation set
Processing fold 3...
Processing fold 3... validation set
Processing fold 4...
Processing fold 4... validation set
Processing fold 5...
Processing fold 5... validation set


In [5]:
# Wrap every processed fold in TextDataset objects and persist both splits.
fold_splits = zip(X_train, y_train, X_val, y_val)
for idx, (train_x, train_y, val_x, val_y) in enumerate(fold_splits):
    print(f"Processing fold {idx+1}...")

    # both splits share the vocabulary built on this fold's training data
    vocab = en_vocab_list[idx]
    train_dataset = datautils.TextDataset(train_x, train_y, vocab)
    val_dataset = datautils.TextDataset(val_x, val_y, vocab)

    # one file per split and fold, suffixed with the zero-based fold index
    torch.save(train_dataset, f"{DATA_DIR}train_dataset_{idx}.pt")
    torch.save(val_dataset, f"{DATA_DIR}val_dataset_{idx}.pt")

# Persist the list of per-fold vocabularies with pickle.
vocab_list_path = f"{EMBED_DIR}vocab_list.pkl"
with open(vocab_list_path, "wb") as vocab_file:
    pickle.dump(en_vocab_list, vocab_file)

Processing fold 1...
Processing fold 2...
Processing fold 3...
Processing fold 4...
Processing fold 5...


Load the dataset (with hold-out)

In [3]:
# Hold-out split; the last two return values are unused here.
raw_train, y_train, raw_val, y_val, _, _ = datautils.split_holdout_dataset(DATASET)

# spaCy-backed tokenizer
tokenizer = torchtext.data.utils.get_tokenizer(
    "spacy", language="en_core_web_sm"
)

# Vocabulary is built from the training split only; out-of-vocabulary
# lookups resolve to the <unk> entry.
curr_vocab = datautils.build_vocab(raw_train, tokenizer)
unk_index = curr_vocab["<unk>"]
curr_vocab.set_default_index(unk_index)

# Process both splits with the same vocabulary and tokenizer.
processed_train = datautils.data_process(raw_train, curr_vocab, tokenizer)
processed_val = datautils.data_process(raw_val, curr_vocab, tokenizer)

# Wrap the processed splits as dataset objects.
X_train = datautils.TextDataset(processed_train, y_train, curr_vocab)
X_val = datautils.TextDataset(processed_val, y_val, curr_vocab)

# Persist the datasets ...
torch.save(X_train, f"{DATA_DIR}train_dataset.pt")
torch.save(X_val, f"{DATA_DIR}val_dataset.pt")

# ... and the vocabulary (pickle).
vocab_path = f"{EMBED_DIR}vocab.pkl"
with open(vocab_path, "wb") as vocab_file:
    pickle.dump(curr_vocab, vocab_file)

### Training hold-out validation

In [10]:
import pandas as pd
import itertools

In [11]:
# Restore the hold-out datasets written by the preparation cell.
# NOTE(review): these files hold whole pickled dataset objects. On
# PyTorch >= 2.6 torch.load defaults to weights_only=True, which rejects such
# objects -- confirm the installed version before relying on this call.
train_dataset = torch.load(f"{DATA_DIR}train_dataset.pt")
val_dataset = torch.load(f"{DATA_DIR}val_dataset.pt")

# Restore the matching vocabulary (pickle; only load files you produced).
vocab_path = f"{EMBED_DIR}vocab.pkl"
with open(vocab_path, "rb") as vocab_file:
    curr_vocab = pickle.load(vocab_file)

In [12]:
# --- Training configuration for the hold-out run ---
N_EPOCHS = 1000  # upper bound on epochs; early stopping can end training sooner
CLIP = 1  # passed to RNNutils.train_rnn (presumably the gradient-clipping threshold -- confirm)
BATCH_SIZE = 2048 * 4
EMBEDDING_DIM = 25
HIDDEN_DIM = 25
OUTPUT_DIM = 1
INPUT_DIM = len(curr_vocab)  # number of entries in the loaded vocabulary

# Epochs without a validation-loss improvement before training stops.
PATIENCE = 5
# Misspelled legacy name, kept as an alias because the training cell reads it.
PATIECE = PATIENCE

CHECKPOINT_STEPS = 10  # a checkpoint is written every CHECKPOINT_STEPS epochs

# Hyper-parameter grid iterated by the training cell.
learning_rate_list = [0.01]
weight_decay_list = [0]

In [13]:
# Model sized from the vocabulary and the configuration constants.
model = RNNutils.RNN(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, output_dim=OUTPUT_DIM, device=DEVICE
)

# Training batches are reshuffled every epoch.
train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=train_dataset.generate_batch,
)

# Validation batches keep a fixed order: shuffling brings nothing for
# evaluation and makes the batch composition differ between epochs.
val_iterator = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=val_dataset.generate_batch,
)

criterion = nn.BCELoss()

# Use the first grid entry for both hyper-parameters (previously the weight
# decay list was ignored and Adam's default was used).
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate_list[0],
    weight_decay=weight_decay_list[0],
)

In [14]:
# Grid search over (learning rate, weight decay) with early stopping on the
# validation loss. One metrics row is recorded per epoch; the rows are written
# to CSV at every checkpoint and once more when the search has finished.

result_columns = [
    "optim",
    "lr",
    "weight_decay",
    "epoch",
    "train_loss",
    "val_loss",
    "precision",
    "recall",
    "f1_score",
]
# Rows are collected in a plain list and turned into a DataFrame only when it
# is needed, instead of re-concatenating a growing frame every epoch.
records = []

for lr, weight_decay in itertools.product(
    learning_rate_list, weight_decay_list
):

    # Every configuration starts from a fresh model and an optimizer that
    # actually uses this configuration's lr / weight_decay.
    model = RNNutils.RNN(
        INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, output_dim=OUTPUT_DIM, device=DEVICE
    )
    optimizer = torch.optim.Adam(
        model.parameters(), lr=lr, weight_decay=weight_decay
    )

    # Early-stopping state is per configuration; otherwise a run that stopped
    # early would leave the next one with an exhausted patience counter.
    min_loss = float("inf")
    patience = PATIECE

    for epoch in range(N_EPOCHS):

        # wall-clock time; process_time() would only count CPU time
        start_time = time.perf_counter()

        print("Training...")
        train_loss = RNNutils.train_rnn(
            model, train_iterator, optimizer, criterion, CLIP
        )
        print("Evaluating...")
        valid_loss, precision, recall, f1_score = RNNutils.evaluate(
            model, val_iterator, criterion
        )

        end_time = time.perf_counter()

        records.append(
            {
                "optim": "Adam",
                "lr": lr,
                "weight_decay": weight_decay,
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "val_loss": valid_loss,
                "precision": precision,
                "recall": recall,
                "f1_score": f1_score,
            }
        )

        # Save a model checkpoint and the metrics so far every
        # CHECKPOINT_STEPS epochs.
        # NOTE(review): checkpoints are written to RES_DIR although CHECK_DIR
        # is defined, and the file name does not encode lr / weight_decay, so
        # a grid with several configurations overwrites earlier checkpoints.
        if (epoch + 1) % CHECKPOINT_STEPS == 0:
            torch.save(
                {
                    "epoch": epoch,  # zero-based; the file name uses epoch + 1
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": valid_loss,
                    "patience": patience,
                },
                f"{RES_DIR}rnn_checkpoint_{epoch + 1}.pt",
            )
            pd.DataFrame(records, columns=result_columns).to_csv(
                f"{RES_DIR}rnn_results-1-temp.csv", index=False
            )

        print(f"Epoch: {epoch+1} | Time: {end_time-start_time:.2f}s")
        print(f"\tTrain Loss: {train_loss:.3f}")
        print(f"\t Val. Loss: {valid_loss:.3f}")
        print(
            f"\t Val. Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1_score:.3f}"
        )

        # Early stopping: reset the counter on a new best validation loss,
        # otherwise count down and stop once it is used up.
        if valid_loss < min_loss:
            min_loss = valid_loss
            patience = PATIECE
        else:
            patience -= 1

        if patience <= 0:
            break

results = pd.DataFrame(records, columns=result_columns)
results.to_csv(f"{RES_DIR}rnn_results-1.csv", index=False)

Training...


In [5]:
# Reload the saved per-epoch metrics, ordered from lowest to highest F1 score.
results = pd.read_csv(f"{RES_DIR}rnn_results-1.csv").sort_values("f1_score")
results

Unnamed: 0,optim,lr,weight_decay,epoch,train_loss,val_loss,precision,recall,f1_score
1,Adam,0.001,0,2,0.679495,0.677676,0.532281,0.516542,0.470575
0,Adam,0.001,0,1,0.703805,0.680974,0.519346,0.511607,0.475854
2,Adam,0.001,0,3,0.669179,0.678746,0.539636,0.531443,0.518594
4,Adam,0.001,0,5,0.633891,0.701004,0.553453,0.544928,0.536978
6,Adam,0.001,0,7,0.58737,0.738696,0.545186,0.543852,0.543568
5,Adam,0.001,0,6,0.612563,0.72399,0.550993,0.548768,0.548079
3,Adam,0.001,0,4,0.655943,0.685922,0.550219,0.549095,0.54905


### Training k-fold (ignore for now)

In [3]:
# Reload the per-fold datasets written by the k-fold preparation step
# (5-fold cross validation). Each pair is read train first, then validation.
fold_pairs = [
    (
        torch.load(f"{DATA_DIR}train_dataset_{idx}.pt"),
        torch.load(f"{DATA_DIR}val_dataset_{idx}.pt"),
    )
    for idx in range(5)
]
train_dataset_list = [train_part for train_part, _ in fold_pairs]
val_dataset_list = [val_part for _, val_part in fold_pairs]

In [5]:
# Sanity check of the batching: for every fold, build a DataLoader over the
# training split and print the first mini-batch only.
fold_datasets = zip(train_dataset_list, val_dataset_list)
for idx, (train_dataset, val_dataset) in enumerate(fold_datasets):
    print(f"Processing fold {idx+1}...")

    loader_options = {
        "batch_size": 128,
        "shuffle": True,
        "collate_fn": train_dataset.generate_batch,
    }
    train_loader = DataLoader(train_dataset, **loader_options)

    # each batch unpacks into three items: inputs, targets and lengths
    for i, (X_batch, y_batch, x_len) in enumerate(train_loader):
        print(f"Batch {i+1}...")
        for shown in (X_batch.shape, y_batch.shape, x_len):
            print(shown)
        break  # one batch per fold is enough for this check

Processing fold 1...
Batch 1...
torch.Size([128, 2609])
torch.Size([128, 1])
[150, 665, 437, 113, 443, 1631, 352, 118, 602, 126, 108, 645, 1010, 151, 1335, 221, 1347, 2553, 1290, 1055, 980, 276, 836, 182, 156, 298, 151, 166, 1581, 379, 860, 94, 161, 308, 175, 116, 585, 865, 246, 587, 121, 2609, 215, 252, 283, 1001, 666, 743, 181, 95, 236, 659, 100, 992, 371, 183, 794, 775, 953, 670, 390, 647, 112, 134, 349, 668, 123, 341, 128, 793, 1730, 106, 99, 1076, 163, 785, 1295, 467, 428, 989, 233, 868, 738, 186, 452, 108, 700, 636, 550, 624, 792, 1019, 172, 223, 452, 645, 196, 118, 218, 2443, 164, 833, 924, 358, 984, 333, 2386, 239, 751, 141, 588, 1413, 649, 329, 1016, 462, 107, 137, 93, 795, 286, 589, 140, 105, 208, 108, 99, 1622]
Processing fold 2...
Batch 1...
torch.Size([128, 2365])
torch.Size([128, 1])
[709, 462, 2070, 492, 1006, 245, 186, 120, 243, 141, 123, 349, 1690, 178, 153, 511, 618, 219, 1191, 106, 917, 2365, 303, 208, 163, 500, 185, 112, 847, 652, 321, 249, 188, 827, 203, 279, 614, 