# 3. vježba: analiza klasifikacije sentimenta

### Zadatak 1. Učitavanje podataka (25% bodova)

In [1]:
from util import get_word_frequency_sorted

frequencies = get_word_frequency_sorted("./data/sst_train_raw.csv")

print("Total number of words:", len(frequencies))
for word, freq in list(frequencies.items())[:5]:
    print(f"{word}\t-> {freq}")

Total number of words: 14804
the	-> 5954
a	-> 4361
and	-> 3831
of	-> 3631
to	-> 2438


In [2]:
special_symbols = {'<PAD>': 0, '<UNK>': 1}
label_frequency = {"positive": 2, "negative": 1}

In [3]:
from vocab import Vocab

data_vocab = Vocab(frequencies, max_size=-1, min_freq=0, special_symbols=special_symbols)
stoi = data_vocab.stoi

for word in ['<PAD>', '<UNK>', 'the', 'a', 'and', 'my', 'twists', 'lets', 'sports', 'amateurishly']:
    print(f"{word} -> {stoi[word]}")

<PAD> -> 0
<UNK> -> 1
the -> 2
a -> 3
and -> 4
my -> 188
twists -> 930
lets -> 956
sports -> 1275
amateurishly -> 6818


In [4]:
label_vocab = Vocab({}, max_size=-1, min_freq=0, special_symbols=label_frequency)
stoi = label_vocab.stoi

for word, index in stoi.items():
    print(f"{word} -> {index}")

positive -> 0
negative -> 1


In [5]:
from dataset import NlpDataset

train_dataset = NlpDataset('./data/sst_train_raw.csv', data_vocab, label_vocab)
instance_text, instance_label = train_dataset.instances[3]
print("Text:", instance_text)
print("Label:", instance_label)

numericalized_text, numericalized_label = train_dataset[3]
print(f"Numericalized text: {numericalized_text}")
print(f"Numericalized label: {numericalized_label}")

Text: ['yet', 'the', 'act', 'is', 'still', 'charming', 'here']
Label: positive
Numericalized text: tensor([189,   2, 674,   7, 129, 348, 143], dtype=torch.int32)
Numericalized label: 0


In [6]:
from torch.utils.data import DataLoader
from util import pad_collate_fn

batch_size = 2
shuffle = False

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate_fn)
texts, labels, lengths = next(iter(train_dataloader))

print(f"Texts: {texts}")
print(f"Labels: {labels}")
print(f"Lengths: {lengths}")

Texts: tensor([[   2,  554,    7, 2872,    6,   22,    2, 2873, 1236,    8,   96, 4800,
            4,   10,   72,    8,  242,    6,   75,    3, 3576,   56, 3577,   34,
         2022, 2874, 7123, 3578, 7124,   42,  779, 7125,    0,    0],
        [   2, 2875, 2023, 4801,    5,    2, 3579,    5,    2, 2876, 4802,    7,
           40,  829,   10,    3, 4803,    5,  627,   62,   27, 2877, 2024, 4804,
          962,  715,    8, 7126,  555,    5, 7127, 4805,    8, 7128]],
       dtype=torch.int32)
Labels: tensor([0, 0], dtype=torch.int32)
Lengths: tensor([32, 34])


In [7]:
%reset

## Zadatak 2. Implementacija baseline modela (25% bodova)

In [8]:
SAVE_DIR = './models/basic'

In [9]:
# Hyperparams

SEED = 7052020
VOCAB_MAX_SIZE = -1
VOCAB_MIN_FREQ = 1
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-4
SHUFFLE = True

In [17]:
from loader import vocab_data_loader

train_dataloader, valid_dataloader, test_dataloader, embeddings = vocab_data_loader(
    seed=SEED,
    vocab_max_size=VOCAB_MAX_SIZE,
    vocab_min_freq=VOCAB_MIN_FREQ,
    train_bs=TRAIN_BATCH_SIZE,
    valid_bs=VALID_BATCH_SIZE,
    test_bs=TEST_BATCH_SIZE,
    shuffle=SHUFFLE
)

In [15]:
import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

from baseline import Baseline
from engine import train, evaluate
from util import get_metrics

from torch.utils.tensorboard import SummaryWriter


model = Baseline(embeddings)

criterion = BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=LR)

baseline_writer = SummaryWriter('runs/baseline')

for epoch in range(EPOCHS):
    loss, y_pred, y_true = train(model, train_dataloader, optimizer, criterion, -1)
    torch.save(model, SAVE_DIR + f"/baseline/epoch-{epoch}.pickle")
    accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

    baseline_writer.add_scalar("Loss/train", loss, epoch)
    baseline_writer.add_scalar("Accuracy/train", accuracy, epoch)
    baseline_writer.add_scalar("Precision/train", precision, epoch)
    baseline_writer.add_scalar("Recall/train", recall, epoch)
    baseline_writer.add_scalar("F1/train", f1, epoch)

    print(f"Train epoch {epoch}:")
    print(f"\tAccuracy: {accuracy}, Loss: {loss}")

    loss, y_pred, y_true = evaluate(model, valid_dataloader, criterion)
    accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

    baseline_writer.add_scalar("Loss/validate", loss, epoch)
    baseline_writer.add_scalar("Accuracy/validate", accuracy, epoch)
    baseline_writer.add_scalar("Precision/validate", precision, epoch)
    baseline_writer.add_scalar("Recall/validate", recall, epoch)
    baseline_writer.add_scalar("F1/validate", f1, epoch)

    print(f"Validate epoch {epoch}:")
    print(f"\tAccuracy: {accuracy}, Loss: {loss}")

baseline_writer.flush()
baseline_writer.close()

Train epoch 1:
	Accuracy: 0.6005780346820809, Loss: 0.6669273212810473
Validate epoch 1:
	Accuracy: 0.7084019769357496, Loss: 0.6123208236276058
Train epoch 2:
	Accuracy: 0.7320809248554914, Loss: 0.5555407534961756
Validate epoch 2:
	Accuracy: 0.7484898407468424, Loss: 0.5330309935829096
Train epoch 3:
	Accuracy: 0.7673410404624278, Loss: 0.5031714297701858
Validate epoch 3:
	Accuracy: 0.7902251510159253, Loss: 0.48953733580154285
Train epoch 4:
	Accuracy: 0.7802023121387284, Loss: 0.4815769983250971
Validate epoch 4:
	Accuracy: 0.7660626029654036, Loss: 0.49232064997940733
Train epoch 5:
	Accuracy: 0.7833815028901734, Loss: 0.4717148376811791
Validate epoch 5:
	Accuracy: 0.7913234486545854, Loss: 0.4711927844766985
Train epoch 6:
	Accuracy: 0.7933526011560693, Loss: 0.4612705295937771
Validate epoch 6:
	Accuracy: 0.7764964305326744, Loss: 0.479785015185674
Train epoch 7:
	Accuracy: 0.7946531791907514, Loss: 0.4570081312467769
Validate epoch 7:
	Accuracy: 0.7891268533772653, Loss: 0.4

In [16]:
BEST_EPOCH = 19

model = torch.load(SAVE_DIR + f"/baseline/epoch-{BEST_EPOCH}.pickle")

loss, y_pred, y_true = evaluate(model, test_dataloader, criterion)
accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

print("Test results:")
print(f"\tAccuracy: {accuracy}, Loss: {loss}")

Test results:
	Accuracy: 0.7786697247706422, Loss: 0.44540453223245485


In [18]:
%reset

## Zadatak 3. Implementacija povratne neuronske mreže (25% bodova)

In [28]:
SAVE_DIR = './models/basic'

In [29]:
# Hyperparams

SEED = 7052020
VOCAB_MAX_SIZE = -1
VOCAB_MIN_FREQ = 1
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-4
GRADIENT_CLIP = 0.25
SHUFFLE = True

In [30]:
from loader import vocab_data_loader

train_dataloader, valid_dataloader, test_dataloader, embeddings = vocab_data_loader(
    seed=SEED,
    vocab_max_size=VOCAB_MAX_SIZE,
    vocab_min_freq=VOCAB_MIN_FREQ,
    train_bs=TRAIN_BATCH_SIZE,
    valid_bs=VALID_BATCH_SIZE,
    test_bs=TEST_BATCH_SIZE,
    shuffle=SHUFFLE
)

In [31]:
from rnn import Rnn

MODE = 'gru'
model = Rnn(embeddings, mode=MODE)
print(model)

Rnn(
  (embeddings): Embedding(7123, 300, padding_idx=0)
  (rnn): GRU(300, 300)
  (fc1): Linear(in_features=300, out_features=150, bias=True)
  (fc2): Linear(in_features=150, out_features=1, bias=True)
)


In [7]:
import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

from engine import train, evaluate
from util import get_metrics

from torch.utils.tensorboard import SummaryWriter

criterion = BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=LR)

rnn_writer = SummaryWriter(f'runs/{MODE}')

for epoch in range(EPOCHS):
    loss, y_pred, y_true = train(model, train_dataloader, optimizer, criterion, GRADIENT_CLIP)
    torch.save(model, SAVE_DIR + f"/{MODE}/epoch-{epoch}.pickle")
    accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

    rnn_writer.add_scalar("Loss/train", loss, epoch)
    rnn_writer.add_scalar("Accuracy/train", accuracy, epoch)
    rnn_writer.add_scalar("Precision/train", precision, epoch)
    rnn_writer.add_scalar("Recall/train", recall, epoch)
    rnn_writer.add_scalar("F1/train", f1, epoch)

    print(f"Train epoch {epoch}:")
    print(f"\tAccuracy: {accuracy}, Loss: {loss}")

    loss, y_pred, y_true = evaluate(model, valid_dataloader, criterion)
    accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

    rnn_writer.add_scalar("Loss/validate", loss, epoch)
    rnn_writer.add_scalar("Accuracy/validate", accuracy, epoch)
    rnn_writer.add_scalar("Precision/validate", precision, epoch)
    rnn_writer.add_scalar("Recall/validate", recall, epoch)
    rnn_writer.add_scalar("F1/validate", f1, epoch)

    print(f"Validate epoch {epoch}:")
    print(f"\tAccuracy: {accuracy}, Loss: {loss}")

rnn_writer.flush()
rnn_writer.close()

Train epoch 0:
	Accuracy: 0.522543352601156, Loss: 0.6921697370923323
Validate epoch 0:
	Accuracy: 0.5063152114222954, Loss: 0.692521149652046
Train epoch 1:
	Accuracy: 0.7213872832369942, Loss: 0.5529462322377401
Validate epoch 1:
	Accuracy: 0.7759472817133443, Loss: 0.49551953296912343
Train epoch 2:
	Accuracy: 0.7903179190751445, Loss: 0.47280673084059205
Validate epoch 2:
	Accuracy: 0.7726523887973641, Loss: 0.5305561208934114
Train epoch 3:
	Accuracy: 0.8005780346820809, Loss: 0.4480341517587187
Validate epoch 3:
	Accuracy: 0.8034047226798462, Loss: 0.4495425242603871


KeyboardInterrupt: 

In [36]:
BEST_EPOCH = 6

model = torch.load(SAVE_DIR + f"/lstm/epoch-{BEST_EPOCH}.pickle")

loss, y_pred, y_true = evaluate(model, test_dataloader, criterion)
accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

print("Test results:")
print(f"\tAccuracy: {accuracy}, Loss: {loss}")

Test results:
	Accuracy: 0.7901376146788991, Loss: 0.4403384306601116


In [39]:
%reset

Nothing done.


## Zadatak 4. Usporedba modela i pretraga hiperparametara (25% bodova)

### Bidirectional

In [43]:
import os

SAVE_DIR = './models/vs=300_bs=10_ls=2_drop=0.45_bidir'
LOG_SAVE_DIR = './runs/vs=300_bs=10_ls=2_drop=0.45_bidir'
os.mkdir(SAVE_DIR)
os.mkdir(LOG_SAVE_DIR)

for mode in ['rnn', 'lstm', 'gru']:
    os.mkdir(SAVE_DIR + '/' + mode)

In [None]:
# Hyperparams

SEED = 7052020
VOCAB_MAX_SIZE = -1
VOCAB_MIN_FREQ = 1
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 10
LR = 1e-4
GRADIENT_CLIP = 0.25
DROPOUT = 0.45
LAYERS = 2
SHUFFLE = True

In [None]:
from loader import vocab_data_loader
from rnn import Rnn

import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

from engine import train, evaluate
from util import get_metrics

from torch.utils.tensorboard import SummaryWriter

for MODE in ['rnn', 'lstm', 'gru']:
    train_dataloader, valid_dataloader, test_dataloader, embeddings = vocab_data_loader(
        seed=SEED,
        vocab_max_size=VOCAB_MAX_SIZE,
        vocab_min_freq=VOCAB_MIN_FREQ,
        train_bs=TRAIN_BATCH_SIZE,
        valid_bs=VALID_BATCH_SIZE,
        test_bs=TEST_BATCH_SIZE,
        shuffle=SHUFFLE
    )

    model = Rnn(embeddings, mode=MODE, bidirectional=True, layers=LAYERS, dropout=DROPOUT)
    print(model)

    criterion = BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=LR)

    writer = SummaryWriter(f'{LOG_SAVE_DIR}/{MODE}')

    for epoch in range(EPOCHS):
        loss, y_pred, y_true = train(model, train_dataloader, optimizer, criterion, GRADIENT_CLIP)
        torch.save(model, SAVE_DIR + f"/{MODE}/epoch-{epoch}.pickle")
        accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Accuracy/train", accuracy, epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("Recall/train", recall, epoch)
        writer.add_scalar("F1/train", f1, epoch)

        print(f"Train epoch {epoch}:")
        print(f"\tAccuracy: {accuracy}, Loss: {loss}")

        loss, y_pred, y_true = evaluate(model, valid_dataloader, criterion)
        accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

        writer.add_scalar("Loss/validate", loss, epoch)
        writer.add_scalar("Accuracy/validate", accuracy, epoch)
        writer.add_scalar("Precision/validate", precision, epoch)
        writer.add_scalar("Recall/validate", recall, epoch)
        writer.add_scalar("F1/validate", f1, epoch)

        print(f"Validate epoch {epoch}:")
        print(f"\tAccuracy: {accuracy}, Loss: {loss}")

    writer.flush()
    writer.close()

### Hyperparameters Optimization

In [None]:
# Hyperparams

fixed_params = {
    'seed': 7052020,
    'epochs': 10,
    'min_frequency': 1,
    'lr': 1e-4,
    'grad_clip': 0.25
}

VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
SHUFFLE = True

variable_params = {
    'vs': [200, 14804],     # vocabulary size
    'bs': [1, 160],         # batch size
    'ls': [1, 4],           # layer size
    'drop': [0, 0.69],      # dropout
}

In [None]:
from loader import vocab_data_loader
from rnn import Rnn

import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

from engine import train, evaluate
from util import get_metrics

from torch.utils.tensorboard import SummaryWriter

import os
from itertools import product

MODE = 'gru'
for vs, bs, ls, drop in product(variable_params['vs'], variable_params['bs'], variable_params['ls'], variable_params['drop']):
    print(vs, bs, ls, drop)
    train_dataloader, valid_dataloader, test_dataloader, embeddings = vocab_data_loader(
        seed=fixed_params['seed'],
        vocab_max_size=vs,
        vocab_min_freq=fixed_params['min_frequency'],
        train_bs=bs,
        valid_bs=VALID_BATCH_SIZE,
        test_bs=TEST_BATCH_SIZE,
        shuffle=SHUFFLE
    )

    model = Rnn(embeddings, mode=MODE, bidirectional=True, layers=ls, dropout=drop, hidden_size=150)
    print(model)

    criterion = BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=fixed_params['lr'])

    filename = f"gru_vs={vs}_bs={bs}_ls={ls}_drop={drop}_bidir".replace(".", "")

    SAVE_DIR = f'./models/{filename}'
    os.mkdir(SAVE_DIR)

    writer = SummaryWriter(f'/runs')

    for epoch in range(fixed_params['epochs']):
        loss, y_pred, y_true = train(model, train_dataloader, optimizer, criterion, fixed_params['grad_clip'])
        torch.save(model, SAVE_DIR + f"/epoch-{epoch}.pickle")
        accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Accuracy/train", accuracy, epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("Recall/train", recall, epoch)
        writer.add_scalar("F1/train", f1, epoch)

        print(f"Train epoch {epoch}:")
        print(f"\tAccuracy: {accuracy}, Loss: {loss}")

        loss, y_pred, y_true = evaluate(model, valid_dataloader, criterion)
        accuracy, precision, recall, f1 = get_metrics(y_pred, y_true)

        writer.add_scalar("Loss/validate", loss, epoch)
        writer.add_scalar("Accuracy/validate", accuracy, epoch)
        writer.add_scalar("Precision/validate", precision, epoch)
        writer.add_scalar("Recall/validate", recall, epoch)
        writer.add_scalar("F1/validate", f1, epoch)

        print(f"Validate epoch {epoch}:")
        print(f"\tAccuracy: {accuracy}, Loss: {loss}")

    writer.flush()
    writer.close()
