# Notebook regarding measurements with RNNs

## Saving the dataset

In [1]:
import datautils
import torch
import torchtext
from torch.utils.data import DataLoader
import pickle
import torch.nn as nn
import RNNutils
import time
import pandas as pd

In [2]:
DATASET = "./Dataset/power-gb-train.tsv"
DATA_DIR = "./Dataset/"
RES_DIR = "./Results/"
EMBED_DIR = "./Embeddings/"
CHECK_DIR = "./Checkpoints/"
DEVICE = datautils.get_device()

In [3]:
from gensim.downloader import load

EMBEDDING="word2vec-ruscorpora-300"
#download the embedding
pretrained_embeddings=load(EMBEDDING)
vector_embeddings=None
with open(f"{EMBED_DIR}vocab.pkl", "rb") as f:
    curr_vocab = pickle.load(f)
    # Create a matrix to store the vectors
    vector_embeddings = torch.zeros(len(curr_vocab), pretrained_embeddings.vector_size)

    # Set the vectors for our vocabulary words
    for i, word in enumerate(curr_vocab.get_itos()):
        if word in pretrained_embeddings:
            vector_embeddings[i] = torch.tensor(pretrained_embeddings[word]) 



load the dataset (with hold-out)

In [27]:
from collections import Counter


X_train, y_train, X_val, y_val, X_test, y_test = datautils.split_holdout_dataset(DATASET)

# get tokenizer
tokenizer = torchtext.data.utils.get_tokenizer(
    "spacy", language="en_core_web_sm"
)

# build vocabulary
min_freq = 1
curr_vocab = None
while curr_vocab is None or len(curr_vocab) > 10000:
    curr_vocab = datautils.build_vocab(X_train, tokenizer, min_freq=min_freq)
    curr_vocab.set_default_index(curr_vocab["<unk>"])
    min_freq += 1
    print(len(curr_vocab),f"{min_freq} min_freq")

# process datasets
X_train = datautils.data_process(X_train, curr_vocab, tokenizer)
X_val = datautils.data_process(X_val, curr_vocab, tokenizer)
X_test = datautils.data_process(X_test, curr_vocab, tokenizer)

# create dataset objects
X_train = datautils.TextDataset(X_train, y_train, curr_vocab)
X_val = datautils.TextDataset(X_val, y_val, curr_vocab)
X_test = datautils.TextDataset(X_test, y_test, curr_vocab)

# save datasets
torch.save(X_train, f"{DATA_DIR}train_dataset.pt")
torch.save(X_val, f"{DATA_DIR}val_dataset.pt")
torch.save(X_test, f"{DATA_DIR}test_dataset.pt")

# save vocabulary with pickle
with open(f"{EMBED_DIR}vocab.pkl", "wb") as f:
    pickle.dump(curr_vocab, f)

72276 2 min_freq
46491 3 min_freq
37987 4 min_freq
33080 5 min_freq
29729 6 min_freq
27245 7 min_freq
25353 8 min_freq
23816 9 min_freq
22555 10 min_freq
21529 11 min_freq
20552 12 min_freq
19712 13 min_freq
18970 14 min_freq
18299 15 min_freq
17669 16 min_freq
17070 17 min_freq
16575 18 min_freq
16108 19 min_freq
15623 20 min_freq
15203 21 min_freq
14864 22 min_freq
14495 23 min_freq
14173 24 min_freq
13883 25 min_freq
13564 26 min_freq
13267 27 min_freq
12996 28 min_freq
12756 29 min_freq
12553 30 min_freq
12356 31 min_freq
12150 32 min_freq
11959 33 min_freq
11747 34 min_freq
11550 35 min_freq
11374 36 min_freq
11174 37 min_freq
10984 38 min_freq
10830 39 min_freq
10679 40 min_freq
10514 41 min_freq
10381 42 min_freq
10228 43 min_freq
10092 44 min_freq
9944 45 min_freq


### Training hold-out validation

In [4]:
import pandas as pd
import itertools 

In [5]:
# load the datasets
train_dataset = torch.load(f"{DATA_DIR}train_dataset.pt")
val_dataset = torch.load(f"{DATA_DIR}val_dataset.pt")

# load the vocabulary
with open(f"{EMBED_DIR}vocab.pkl", "rb") as f:
    curr_vocab = pickle.load(f)

In [13]:
N_EPOCHS = 1000
CLIP = 1
BATCH_SIZE = 8
EMBEDDING_DIM = 128
HIDDEN_DIM = 1024
OUTPUT_DIM = 1
INPUT_DIM = len(curr_vocab)
PATIECE = 10
CHECKPOINT_STEPS = 10
DROP_OUT = 0.2
LAYERS = 3
MEAN_POOL = True
# experiment number
RES_NUM = 12

learning_rate_list = [0.001]
weight_decay_list = [0]

In [17]:
model = RNNutils.BiLSTM(
    INPUT_DIM,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROP_OUT,
    n_layers=LAYERS,
    mean_pooling=MEAN_POOL,
    pretrained_embedding=None,
    device=DEVICE, 
).to(DEVICE)

train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=train_dataset.generate_batch,
)
val_iterator = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=val_dataset.generate_batch,
)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_list[0])

In [18]:
model

BiLSTM(
  (embedding): Embedding(9944, 128)
  (lstm): LSTM(128, 1024, batch_first=True, bidirectional=True)
  (fc): Sequential(
    (0): Linear(in_features=2048, out_features=1024, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): Sigmoid()
    (3): Linear(in_features=1024, out_features=1024, bias=True)
    (4): Dropout(p=0.2, inplace=False)
    (5): Sigmoid()
    (6): Linear(in_features=1024, out_features=1, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (activation): Sigmoid()
)

In [26]:
min_loss = float("inf")
patience = PATIECE

# initailize dataframe for results
results = pd.DataFrame(
    columns=[
        "optim",
        "lr",
        "weight_decay",
        "epoch",
        "train_loss",
        "val_loss",
        "precision",
        "recall",
        "f1_score",
    ]
)

for lr, weight_decay in itertools.product(
    learning_rate_list, weight_decay_list
):

    for epoch in range(N_EPOCHS):

        start_time = time.process_time()

        print("Training...")
        train_loss = RNNutils.train_rnn(
            model,
            train_iterator,
            optimizer,
            criterion,
            CLIP,  # device=DEVICE
            device=DEVICE,
        )
        print("Evaluating...")
        valid_loss, precision, recall, f1_score = RNNutils.evaluate(
            model,
            val_iterator,
            criterion,  # device=DEVICE
            device=DEVICE,
        )

        end_time = time.process_time()

        results = pd.concat(
            [
                results,
                pd.DataFrame(
                    {
                        "optim": "Adam",
                        "lr": lr,
                        "weight_decay": 0,
                        "epoch": [epoch + 1],
                        "train_loss": [train_loss],
                        "val_loss": [valid_loss],
                        "precision": [precision],
                        "recall": [recall],
                        "f1_score": [f1_score],
                    }
                ),
            ]
        )

        # save model checkpoint every 10 epochs
        if (epoch + 1) % CHECKPOINT_STEPS == 0:
            torch.save(
                {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": valid_loss,
                    "patience": patience,
                },
                f"{RES_DIR}rnn_checkpoint_{epoch + 1}.pt",
            )
            results.to_csv(
                f"{RES_DIR}rnn_results-{RES_NUM}-temp.csv", index=False
            )

        print(f"Epoch: {epoch+1} | Time: {end_time-start_time:.2f}s")
        print(f"\tTrain Loss: {train_loss:.3f}")
        print(f"\t Val. Loss: {valid_loss:.3f}")
        print(
            f"\t Val. Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1_score:.3f}"
        )

        # early stopping
        if valid_loss < min_loss:
            min_loss = valid_loss
            patience = PATIECE
        else:
            patience -= 1

        if patience == 0:
            break

results.to_csv(f"{RES_DIR}bilstm-results{RES_NUM}.csv", index=False)

Training...
Evaluating...


  results = pd.concat(


Epoch: 1 | Time: 124.27s
	Train Loss: 0.699
	 Val. Loss: 0.707
	 Val. Precision: 0.564, Recall: 0.553, F1 Score: 0.512
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 2 | Time: 114.05s
	Train Loss: 0.701
	 Val. Loss: 0.679
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...
Epoch: 3 | Time: 129.25s
	Train Loss: 0.697
	 Val. Loss: 0.683
	 Val. Precision: 0.564, Recall: 0.553, F1 Score: 0.512
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 4 | Time: 116.27s
	Train Loss: 0.701
	 Val. Loss: 0.679
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 5 | Time: 133.80s
	Train Loss: 0.697
	 Val. Loss: 0.682
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...
Epoch: 6 | Time: 124.27s
	Train Loss: 0.697
	 Val. Loss: 0.710
	 Val. Precision: 0.564, Recall: 0.553, F1 Score: 0.512
Training...
Evaluating...
Epoch: 7 | Time: 125.58s
	Train Loss: 0.698
	 Val. Loss: 0.686
	 Val. Precision: 0.564, Recall: 0.553, F1 Score: 0.512
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 8 | Time: 122.09s
	Train Loss: 0.698
	 Val. Loss: 0.688
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 9 | Time: 139.52s
	Train Loss: 0.697
	 Val. Loss: 0.680
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...
Epoch: 10 | Time: 177.75s
	Train Loss: 0.698
	 Val. Loss: 0.683
	 Val. Precision: 0.564, Recall: 0.553, F1 Score: 0.512
Training...
Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 11 | Time: 203.36s
	Train Loss: 0.697
	 Val. Loss: 0.724
	 Val. Precision: 0.284, Recall: 0.500, F1 Score: 0.362
Training...
Evaluating...
Epoch: 12 | Time: 191.53s
	Train Loss: 0.703
	 Val. Loss: 0.764
	 Val. Precision: 0.216, Recall: 0.500, F1 Score: 0.302


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
results = pd.read_csv(f"{RES_DIR}rnn_results-{RES_NUM}.csv")
results.sort_values("f1_score", inplace=True)
results

In [31]:

with torch.no_grad():
    test_dataset = torch.load(f"{DATA_DIR}test_dataset.pt")
    test_iterator = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=train_dataset.generate_batch,
        shuffle=True,   
    )   
    test_loss, precision, recall, f1_score = RNNutils.evaluate(
        model,
        test_iterator,
        criterion,  # device=DEVICE
        device=DEVICE,
    )
    print(test_loss, precision, recall, f1_score)

KeyboardInterrupt: 