In [1]:
import sys
# Extend the import search path with the lab folders on Google Drive
# (presumably where the local `dataloader` and `colors` modules live; verify).
# The membership check keeps the cell idempotent: re-running it no longer
# appends duplicate entries to sys.path.
for lab_path in ('/content/drive/MyDrive/Lab3', '/content/drive/MyDrive/Lab3/data'):
    if lab_path not in sys.path:
        sys.path.append(lab_path)

In [2]:
import random
import importlib
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dataloader import *
from colors import *

In [3]:
class Recurrent(nn.Module):
    """Recurrent sequence classifier: a stacked RNN/GRU/LSTM plus a two-layer head.

    The recurrent stack consumes 300-dimensional inputs in time-major layout
    (``batch_first=False``). The head maps the final hidden state of the top
    recurrent layer to a single logit per sample.

    Args:
        type (str): 'GRU' or 'LSTM' selects the matching cell; any other value
            falls back to a vanilla ``nn.RNN``.
        hidden_size (int): Number of hidden units per layer and direction.
        num_layers (int): Number of stacked recurrent layers.
        dropout (float): Dropout probability between recurrent layers.
        bidirectional (bool): Whether the recurrent stack runs in both directions.
    """

    def __init__(self, type='GRU', hidden_size=150, num_layers=2, dropout=0.5, bidirectional=False):
        super().__init__()

        self.type = type
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional

        # All three cells share one constructor signature, so pick the class
        # and build it once. Unknown type names keep falling back to nn.RNN.
        cell_cls = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(self.type, nn.RNN)
        self.rnn = cell_cls(input_size=300,
                            hidden_size=self.hidden_size,
                            batch_first=False,
                            num_layers=self.num_layers,
                            # Inter-layer dropout has no effect with a single
                            # layer; passing 0 there avoids PyTorch's warning.
                            dropout=self.dropout if self.num_layers > 1 else 0.0,
                            bidirectional=self.bidirectional)

        # A bidirectional stack yields one final state per direction; the head
        # consumes both, so its input is twice as wide in that case.
        num_directions = 2 if self.bidirectional else 1
        self.fc1 = nn.Linear(num_directions * self.hidden_size, self.hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_size, 1)  # Outputs a single value (for binary classification)

    def forward(self, x):
        """
        Forward pass for the recurrent neural network.

        Args:
            x (torch.Tensor): Input tensor of shape (seq_len, batch_size, input_size),
                              where seq_len is the length of the sequence,
                              batch_size is the number of samples in a batch,
                              and input_size is the size of word embeddings (300).

        Returns:
            torch.Tensor: The output tensor of shape (batch_size, 1),
                          representing raw logits for each sample in the batch.
        """
        if self.type == 'LSTM':
            # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed.
            _, (h_n, _) = self.rnn(x)
        else:
            # nn.GRU and nn.RNN return (output, h_n).
            _, h_n = self.rnn(x)

        # h_n stacks the final states as (num_layers * num_directions, batch, hidden).
        if self.bidirectional:
            # The last two entries are the top layer's forward and backward
            # states. Taking only h_n[-1] would discard the forward direction.
            top_state = torch.cat((h_n[-2], h_n[-1]), dim=-1)  # (batch, 2 * hidden)
        else:
            top_state = h_n[-1]  # (batch, hidden)

        hidden = self.relu(self.fc1(top_state))  # (batch, hidden)
        return self.fc2(hidden)  # (batch, 1) raw logits



In [4]:
def train(model, dataloader, epochs, optimizer, loss_fcn, embeddings, validation_dataloader, device):
    """Train `model` for `epochs` epochs, validating after each one.

    Args:
        model: Module mapping an embedded, time-major batch to per-sample logits.
        dataloader: Iterable of (words, labels, lengths) training batches;
            `lengths` is not used here.
        epochs (int): Number of passes over `dataloader`.
        optimizer: Optimizer that updates `model`'s parameters.
        loss_fcn: Callable invoked as `loss_fcn(logits, labels)`.
        embeddings: Callable mapping integer word indices to embedding vectors.
        validation_dataloader: Batches passed to `evaluate` after every epoch.
        device: Device the batches are moved to.

    Returns:
        tuple: (accuracy, precision, recall, f1) from the last validation run.
        When `epochs` is 0 or negative, no training happens and the metrics of
        the model as-is are returned (previously this raised UnboundLocalError).
    """
    print('Training Started')

    for epoch in range(epochs):
        # evaluate() switches the model to eval mode, so re-enable training
        # behaviour (e.g. dropout) at the start of every epoch.
        model.train()

        for words, labels, _lengths in dataloader:
            optimizer.zero_grad()

            # One hop to the target device as int64 indices; the previous
            # `.type(torch.LongTensor)` round-tripped through the CPU.
            words = words.to(device=device, dtype=torch.long)
            labels = labels.to(device)

            # Swap the first two axes to get the time-major layout the model
            # expects (assumes batches arrive as (batch, seq_len) - TODO confirm).
            embedded = embeddings(words).transpose(1, 0)

            # Flatten (batch, 1) -> (batch,). A bare squeeze() would collapse a
            # single-sample batch to a 0-d tensor and mismatch the labels.
            logits = model(embedded).reshape(-1)

            loss = loss_fcn(logits, labels)
            loss.backward()
            # Clip the global gradient norm to 0.25 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            optimizer.step()

        accuracy, _, precision, recall, f1 = evaluate(model, validation_dataloader, loss_fcn, embeddings, device)
        print(Colors.RED + f'Epoch {epoch}: Valid accuracy: {accuracy.item()}' + Colors.RESET)

    if epochs <= 0:
        # No epoch ran, so there are no metrics yet; report the untrained model's.
        accuracy, _, precision, recall, f1 = evaluate(model, validation_dataloader, loss_fcn, embeddings, device)

    print('Finished Training')
    return accuracy, precision, recall, f1


def evaluate(model, dataloader, loss_fcn, embeddings, device):
    """Evaluate a binary classifier and summarise it with a 2x2 confusion matrix.

    Args:
        model: Module mapping an embedded, time-major batch to per-sample logits.
        dataloader: Iterable of (words, labels, lengths) batches; `lengths` is
            not used here. Labels are expected to be 0/1 valued.
        loss_fcn: Unused; kept so existing callers keep working.
        embeddings: Callable mapping integer word indices to embedding vectors.
        device: Device the batches and the confusion matrix are placed on.

    Returns:
        tuple: (accuracy, confusion_matrix, precision, recall, f1), all tensors.
        `confusion_matrix[t, p]` counts samples with true label `t` and
        predicted label `p`. Precision, recall and F1 are computed for class
        index 0. A zero denominator yields NaN rather than an exception.
    """
    model.eval()
    confusion_matrix = torch.zeros(2, 2).to(device)

    with torch.no_grad():
        for words, labels, _lengths in dataloader:
            words = words.to(device=device, dtype=torch.long)
            labels = labels.to(device)

            # Swap the first two axes to get the time-major layout the model
            # expects (assumes batches arrive as (batch, seq_len) - TODO confirm).
            embedded = embeddings(words).transpose(1, 0)

            # Flatten (batch, 1) -> (batch,). A bare squeeze() would collapse a
            # single-sample batch to a 0-d tensor, which cannot be iterated.
            logits = model(embedded).reshape(-1)
            predictions = torch.round(torch.sigmoid(logits))

            # Encode each (true, predicted) pair as one index in [0, 4) and
            # count them in a single call instead of a per-sample Python loop.
            pair_index = labels.reshape(-1).long() * 2 + predictions.long()
            batch_counts = torch.bincount(pair_index, minlength=4).reshape(2, 2)
            confusion_matrix += batch_counts.to(confusion_matrix.dtype)

    accuracy = confusion_matrix.diag().sum() / confusion_matrix.sum()
    # Rows are true labels and columns are predictions, so precision divides
    # by the column total (everything predicted as class 0) and recall by the
    # row total (everything that truly is class 0). These were swapped before.
    precision = confusion_matrix[0, 0] / confusion_matrix[:, 0].sum()
    recall = confusion_matrix[0, 0] / confusion_matrix[0, :].sum()
    f1 = 2 * precision * recall / (precision + recall)

    return accuracy, confusion_matrix, precision, recall, f1


In [5]:
if __name__ == '__main__':
    # Experiment driver: trains the same LSTM configuration under several
    # random seeds, logs validation metrics per seed, evaluates the last model
    # on the test split and writes the per-seed results to a text file.
    import os  # local import: only this script section needs it

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using device: {device}')

    data_dir = '/content/drive/MyDrive/Lab3/data'
    results_path = '/content/drive/MyDrive/Lab3/results/good_cell_seeds_LSTM.txt'
    # Create the output directory up front so a missing folder cannot discard
    # the results after all training runs have already finished.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    seed = 7052020
    # Seed Python's `random` too: it draws the follow-up seeds below, so
    # leaving it unseeded made every run use a different seed sequence.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    batch_size = 10
    shuffle = True

    train_dataset = NLPDataset(f'{data_dir}/sst_train_raw.csv', train=True)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate_fn)

    # Vocabularies are built from the training split and reused for test/validation.
    text_vocab = Vocab(train_dataset.text_frequencies, max_size=-1, min_freq=0)
    label_vocab = Vocab(train_dataset.label_frequencies, max_size=-1, min_freq=0)

    test_dataset = NLPDataset(f'{data_dir}/sst_test_raw.csv', train=False, text_vocab=text_vocab, label_vocab=label_vocab)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate_fn)

    # The whole validation split is evaluated as a single batch.
    validation_dataset = NLPDataset(f'{data_dir}/sst_valid_raw.csv', train=False, text_vocab=text_vocab, label_vocab=label_vocab)
    validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=len(validation_dataset), collate_fn=pad_collate_fn)

    word_rep = load_embeddings(f'{data_dir}/sst_glove_6b_300d.txt')
    word_embeddings = gen_embeddings(text_vocab.stoi, word_rep).to(device)

    loss_fcn = nn.BCEWithLogitsLoss()

    # Search space for the (currently disabled) Task 4 random search below.
    cell_type = ['LSTM', 'GRU', 'RNN']
    hidden_size = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
    num_layers = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    dropout = [0.0, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85]
    bidirectional = [True, False]
    results = []

    # Fixed configuration that is re-trained under each seed.
    hidden = 100
    layers = 3
    drop = 0.35
    bi_dir = True

    # Test different seeds
    for _ in range(5):
        print(Colors.BLUE + f'Cell type: LSTM, Hidden_size: {hidden}, Num layers: {layers}, Dropout: {drop}, Bidirectional: {bi_dir}' + Colors.RESET)

        model = Recurrent(type='LSTM', hidden_size=hidden, num_layers=layers, dropout=drop, bidirectional=bi_dir).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
        accuracy, precision, recall, f1 = train(model, train_dataloader, 5, optimizer, loss_fcn, word_embeddings, validation_dataloader, device)
        accuracy = round(accuracy.item() * 100, 2)
        # `seed` is the seed that was active while this model was built and trained.
        results.append((seed, 'LSTM', accuracy, precision.item(), recall.item(), f1.item()))

        # Draw and install the seed for the next run.
        seed = random.randint(0, 1000000)
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Note: `model` here is the one trained in the last iteration above.
    test_accuracy, test_CM, test_precision, test_recall, test_f1 = evaluate(model, test_dataloader, loss_fcn, word_embeddings, device)
    print(f'Test Accuracy: {test_accuracy.item()}')

    # Save seed results
    with open(results_path, 'w') as f:
        for item in results:
            f.write(f'Seed: {item[0]}, Cell: {item[1]}, Accuracy: {item[2]}, Precision: {item[3]}, Recall: {item[4]}, F1: {item[5]}\n')

    # ------------------------------------------------------------------
    # For the Task 4 (disabled random hyper-parameter search)
    # train() returns (accuracy, precision, recall, f1); the unpacking below
    # was corrected to match, it previously bound precision to `f1`.
    # ------------------------------------------------------------------

    # Test different parameters
    # for type in cell_type:
    #     for _ in range(15):
    #         hidden = random.choice(hidden_size)
    #         layers = random.choice(num_layers)
    #         drop = random.choice(dropout)
    #         bi_dir = random.choice(bidirectional)

    #         print(Colors.BLUE + f'Cell type: {type}, Hidden_size: {hidden}, Num layers: {layers}, Dropout: {drop}, Bidirectional: {bi_dir}' + Colors.RESET)

    #         model = Recurrent(type=type, hidden_size=hidden, num_layers=layers, dropout=drop, bidirectional=bi_dir).to(device)
    #         optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    #         accuracy, _, _, f1 = train(model, train_dataloader, 5, optimizer, loss_fcn, word_embeddings, validation_dataloader, device)
    #         accuracy = round(accuracy.item() * 100, 2)
    #         results.append((type, hidden, layers, drop, bi_dir, accuracy, f1.item()))

    # # Save parameter results
    # with open('/content/drive/MyDrive/Lab3/results/rnn_params.txt', 'w') as f:
    #     for item in results:
    #         f.write(f'Cell type: {item[0]}, Hidden size: {item[1]}, Num layers: {item[2]}, Dropout: {item[3]}, Bidirectional: {item[4]}, Accuracy: {item[5]}, F1: {item[6]}\n')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
word: rigidly index: 12136
word rigidly is in word_rep at index 12136
word: paradigm index: 12137
word paradigm is in word_rep at index 12137
word: permitting index: 12138
word permitting is in word_rep at index 12138
word: well-worn index: 12139
word well-worn is in word_rep at index 12139
word: bmw index: 12140
word bmw is in word_rep at index 12140
word: haranguing index: 12141
word haranguing is in word_rep at index 12141
word: squashed index: 12142
word squashed is in word_rep at index 12142
word: preferably index: 12143
word preferably is in word_rep at index 12143
word: semi index: 12144
word semi is in word_rep at index 12144
word: re-hash index: 12145
word: jump index: 12146
word jump is in word_rep at index 12146
word: nausea index: 12147
word nausea is in word_rep at index 12147
word: dull-witted index: 12148
word: disquietingly index: 12149
word disquietingly is in word_rep at index 12149
word: jams index: 121