# Notebook regarding measurements with RNNs

## Saving the dataset

In [1]:
import datautils
import torch
import torchtext
from torch.utils.data import DataLoader
import pickle
import torch.nn as nn
import RNNutils
import time
import pandas as pd

In [2]:
# --- Input data -------------------------------------------------------------
# Raw TSV file handed to datautils.split_holdout_dataset, and the folder the
# processed dataset objects are written to / read from.
DATA_DIR = "./Dataset/"
DATASET = "./Dataset/power-gb-train.tsv"

# --- Output folders ---------------------------------------------------------
# EMBED_DIR holds the pickled vocabulary; RES_DIR receives the result CSVs and
# the model checkpoints written by the training cell.
EMBED_DIR = "./Embeddings/"
RES_DIR = "./Results/"
# NOTE(review): CHECK_DIR is not referenced by any cell visible in this
# notebook (checkpoints currently go to RES_DIR) — confirm whether it is used
# elsewhere.
CHECK_DIR = "./Checkpoints/"

# --- Compute device ---------------------------------------------------------
# Chosen by the project helper; passed to the model constructor further down.
DEVICE = datautils.get_device()

Load the dataset (with a hold-out split), build the vocabulary, and save the processed datasets and the vocabulary to disk.

In [5]:
# Split the raw TSV into train / validation parts; the last two values returned
# by the helper are not needed here and are discarded.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(DATASET)

# spaCy-based tokenizer using the small English pipeline.
tokenizer = torchtext.data.utils.get_tokenizer("spacy", language="en_core_web_sm")

# Vocabulary search: rebuild the vocabulary with an increasing minimum token
# frequency until it has at most 20000 entries. The size of every candidate is
# printed. Note that `min_freq` is incremented right after each build, so once
# the loop ends it is one larger than the threshold used for the kept vocabulary.
min_freq = 1
while True:
    curr_vocab = datautils.build_vocab(X_train, tokenizer, min_freq=min_freq)
    # Tokens missing from the vocabulary fall back to the "<unk>" index.
    curr_vocab.set_default_index(curr_vocab["<unk>"])
    min_freq += 1
    print(len(curr_vocab))
    if len(curr_vocab) <= 20000:
        break

# Run both splits through the project's processing helper (train first, then
# validation), using the training vocabulary for both.
X_train, X_val = (
    datautils.data_process(split, curr_vocab, tokenizer)
    for split in (X_train, X_val)
)

# Wrap the processed inputs and their labels into dataset objects.
X_train, X_val = (
    datautils.TextDataset(inputs, labels, curr_vocab)
    for inputs, labels in ((X_train, y_train), (X_val, y_val))
)

# Persist the dataset objects so later cells can reload them without redoing
# the tokenization.
torch.save(X_train, f"{DATA_DIR}train_dataset.pt")
torch.save(X_val, f"{DATA_DIR}val_dataset.pt")

# Persist the vocabulary next to the embeddings (pickle format).
with open(f"{EMBED_DIR}vocab.pkl", "wb") as vocab_file:
    pickle.dump(curr_vocab, vocab_file)

72276
46491
37987
33080
29729
27245
25353
23816
22555
21529
20552
19712


### Training hold-out validation

In [6]:
import pandas as pd
import itertools

In [7]:
# Reload the artifacts written by the dataset-preparation cell.
# Both pickle.load and torch.load unpickle arbitrary Python objects, so only
# point them at files produced by this notebook (never at untrusted files).

# vocabulary (pickle)
with open(f"{EMBED_DIR}vocab.pkl", "rb") as vocab_file:
    curr_vocab = pickle.load(vocab_file)

# dataset objects (torch.save format)
val_dataset = torch.load(f"{DATA_DIR}val_dataset.pt")
train_dataset = torch.load(f"{DATA_DIR}train_dataset.pt")

In [8]:
# --- Training schedule ------------------------------------------------------
N_EPOCHS = 1000  # upper bound on epochs per run; early stopping may end sooner
BATCH_SIZE = 2048 * 4
CLIP = 1  # forwarded to RNNutils.train_rnn (presumably a gradient-clipping threshold — TODO confirm)
CHECKPOINT_STEPS = 10  # a checkpoint and a temporary results CSV are written every this many epochs

# Early stopping: number of consecutive epochs without a new best validation
# loss that are tolerated before training stops.
PATIENCE = 10
# Backward-compatible alias: the constant was originally misspelled and other
# cells still refer to it under this name.
PATIECE = PATIENCE

# --- Model dimensions -------------------------------------------------------
INPUT_DIM = len(curr_vocab)  # one input index per vocabulary entry
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# --- Experiment bookkeeping -------------------------------------------------
# experiment number, used in the names of the result CSV files
RES_NUM = 2

# --- Hyper-parameter grid ---------------------------------------------------
learning_rate_list = [0.01]
weight_decay_list = [0]

In [13]:
# Model under test, built by the project helper module.
model = RNNutils.BiLSTM(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, output_dim=OUTPUT_DIM, device=DEVICE
)

# Training batches are reshuffled on every pass over the data.
train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=train_dataset.generate_batch,
)

# Validation batches keep a fixed order: shuffling brings no benefit for
# evaluation and only changes the batch composition from one epoch to the
# next, which can add noise to the validation loss used for early stopping.
val_iterator = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=val_dataset.generate_batch,
)

# Binary cross-entropy on probabilities.
# NOTE(review): nn.BCELoss expects inputs in [0, 1], so the model is assumed to
# end with a sigmoid — confirm in RNNutils.BiLSTM.
criterion = nn.BCELoss()

# Adam with the first learning rate of the grid (no weight decay configured here).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_list[0])

In [14]:
# Hold-out training with early stopping over a grid of (learning rate, weight
# decay) combinations. One row of metrics is recorded per epoch and the table
# is written to RES_DIR as CSV.

# Column order of the results table / CSV files.
result_columns = [
    "optim",
    "lr",
    "weight_decay",
    "epoch",
    "train_loss",
    "val_loss",
    "precision",
    "recall",
    "f1_score",
]

# One dict per finished epoch. The DataFrame is built from this list only when
# it is written out, instead of re-concatenating a DataFrame on every epoch.
epoch_records = []

for lr, weight_decay in itertools.product(
    learning_rate_list, weight_decay_list
):

    # Every combination starts from scratch: a freshly initialised model, an
    # optimizer that actually uses this combination's learning rate and weight
    # decay, and a reset early-stopping state. Without this, later combinations
    # would keep training the previous model with the first learning rate and
    # an already used-up patience budget.
    model = RNNutils.BiLSTM(
        INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, output_dim=OUTPUT_DIM, device=DEVICE
    )
    optimizer = torch.optim.Adam(
        model.parameters(), lr=lr, weight_decay=weight_decay
    )
    min_loss = float("inf")
    patience = PATIECE

    for epoch in range(N_EPOCHS):

        # process_time() measures CPU time of this process, not wall-clock time.
        start_time = time.process_time()

        print("Training...")
        train_loss = RNNutils.train_rnn(
            model,
            train_iterator,
            optimizer,
            criterion,
            CLIP,  # device=DEVICE
        )
        print("Evaluating...")
        valid_loss, precision, recall, f1_score = RNNutils.evaluate(
            model,
            val_iterator,
            criterion,  # device=DEVICE
        )

        end_time = time.process_time()

        epoch_records.append(
            {
                "optim": "Adam",
                "lr": lr,
                # record the value actually used, not a hard-coded 0
                "weight_decay": weight_decay,
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "val_loss": valid_loss,
                "precision": precision,
                "recall": recall,
                "f1_score": f1_score,
            }
        )

        # Save a model checkpoint and a temporary copy of the results every
        # CHECKPOINT_STEPS epochs.
        # NOTE(review): the checkpoint file name only contains the epoch, so
        # with more than one hyper-parameter combination later runs overwrite
        # earlier checkpoints; the stored "epoch" is 0-based while the file
        # name is 1-based.
        if (epoch + 1) % CHECKPOINT_STEPS == 0:
            torch.save(
                {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": valid_loss,
                    "patience": patience,
                },
                f"{RES_DIR}rnn_checkpoint_{epoch + 1}.pt",
            )
            pd.DataFrame(epoch_records, columns=result_columns).to_csv(
                f"{RES_DIR}rnn_results-{RES_NUM}-temp.csv", index=False
            )

        print(f"Epoch: {epoch+1} | Time: {end_time-start_time:.2f}s")
        print(f"\tTrain Loss: {train_loss:.3f}")
        print(f"\t Val. Loss: {valid_loss:.3f}")
        print(
            f"\t Val. Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1_score:.3f}"
        )

        # Early stopping: a new best validation loss refills the patience
        # budget, any other epoch consumes one unit of it.
        if valid_loss < min_loss:
            min_loss = valid_loss
            patience = PATIECE
        else:
            patience -= 1

        if patience == 0:
            break

# Final results table for all combinations, kept in `results` for later cells.
results = pd.DataFrame(epoch_records, columns=result_columns)
results.to_csv(f"{RES_DIR}rnn_results-{RES_NUM}.csv", index=False)

Training...


In [5]:
# Reload the saved per-epoch metrics and display them ordered by ascending
# F1 score (best epoch last).
results = pd.read_csv(f"{RES_DIR}rnn_results-{RES_NUM}.csv").sort_values(
    "f1_score"
)
results

Unnamed: 0,optim,lr,weight_decay,epoch,train_loss,val_loss,precision,recall,f1_score
1,Adam,0.001,0,2,0.679495,0.677676,0.532281,0.516542,0.470575
0,Adam,0.001,0,1,0.703805,0.680974,0.519346,0.511607,0.475854
2,Adam,0.001,0,3,0.669179,0.678746,0.539636,0.531443,0.518594
4,Adam,0.001,0,5,0.633891,0.701004,0.553453,0.544928,0.536978
6,Adam,0.001,0,7,0.58737,0.738696,0.545186,0.543852,0.543568
5,Adam,0.001,0,6,0.612563,0.72399,0.550993,0.548768,0.548079
3,Adam,0.001,0,4,0.655943,0.685922,0.550219,0.549095,0.54905
