# 1.1

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict

# Special tokens
START_TOKEN = "<s>"
END_TOKEN = "</s>"
UNKNOWN_TOKEN = "UUUNKKK"

def load_data(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                word, pos = line.split('\t')
                sentence.append((word, pos))
        if sentence:
            sentences.append(sentence)
    return sentences

class POSTaggingDataset(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, context_window):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window

    def __len__(self):
        return sum(len(sentence) for sentence in self.sentences)

    def __getitem__(self, idx):
        for sentence in self.sentences:
            if idx < len(sentence):
                break
            idx -= len(sentence)

        context_indices = []
        for i in range(-self.context_window, self.context_window + 1):
            position = idx + i
            if position < 0:
                word = START_TOKEN
            elif position >= len(sentence):
                word = END_TOKEN
            else:
                word, _ = sentence[position]
            context_indices.append(self.word_to_ix.get(word, self.word_to_ix[UNKNOWN_TOKEN]))

        _, target_tag = sentence[idx]
        target_index = self.tag_to_ix[target_tag]

        return torch.tensor(context_indices, dtype=torch.long), torch.tensor(target_index, dtype=torch.long)

class FFNNTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window):
        super(FFNNTagger, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window

        # Initialize embeddings within a specific range
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        nn.init.uniform_(self.embeddings.weight, -0.01, 0.01)

        # Initialize hidden layer weights and biases within a specific range
        self.hidden_layer = nn.Linear(self.context_window_size * embedding_dim, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Initialize output layer weights and biases within a specific range
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1)
        hidden_out = torch.tanh(self.hidden_layer(embeds))
        tag_scores = self.output_layer(hidden_out)
        return tag_scores

def train_model(train_data_path, dev_data_path, test_data_path, context_window):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data('twpos-train.tsv')
    dev_sentences = load_data('twpos-dev.tsv')
    test_sentences = load_data('twpos-devtest.tsv')

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset(train_sentences, word_to_ix, tag_to_ix, context_window)
    dev_dataset = POSTaggingDataset(dev_sentences, word_to_ix, tag_to_ix, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTagger(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop with early stopping based on DEV accuracy
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()

        total_loss = 0.0

        for context_indices, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices)

            loss = loss_function(log_probs.view(1,-1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch to check for early stopping criteria.
        dev_accuracy = evaluate_model(model, dev_dataset)

        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set.
    test_dataset = POSTaggingDataset(test_sentences, word_to_ix, tag_to_ix, context_window)

    test_accuracy = evaluate_model(model, test_dataset)

    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model(model, dataset):
    model.eval()

    correct_predictions = 0

    with torch.no_grad():
        for context_indices, target_index in DataLoader(dataset):
            log_probs = model(context_indices)

            predicted_index = torch.argmax(log_probs).item()

            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100

    return accuracy_percentage

print("w = 0")
train_model('twpos-train.tsv',
            'twpos-dev.tsv',
            'twpos-devtest.tsv',
            context_window=0)  # For w=0
print()
print("w = 1")
train_model('twpos-train.tsv',
            'twpos-dev.tsv',
            'twpos-devtest.tsv',
            context_window=1)  # For w=1


w = 0
Epoch 1/10, Loss: 46328.2555
DEV Accuracy: 13.69%
Epoch 2/10, Loss: 37205.6970
DEV Accuracy: 59.30%
Epoch 3/10, Loss: 19245.3192
DEV Accuracy: 71.48%
Epoch 4/10, Loss: 12726.4430
DEV Accuracy: 74.09%
Epoch 5/10, Loss: 9848.0026
DEV Accuracy: 76.04%
Epoch 6/10, Loss: 8445.5727
DEV Accuracy: 74.57%
Epoch 7/10, Loss: 7412.6244
DEV Accuracy: 75.75%
Epoch 8/10, Loss: 6555.7232
DEV Accuracy: 76.13%
Epoch 9/10, Loss: 5929.0838
DEV Accuracy: 76.33%
Epoch 10/10, Loss: 5537.2412
DEV Accuracy: 76.79%
Final DEVTEST Accuracy: 77.65%

w = 1
Epoch 1/10, Loss: 45953.5500
DEV Accuracy: 22.86%
Epoch 2/10, Loss: 24672.6986
DEV Accuracy: 71.06%
Epoch 3/10, Loss: 12628.4079
DEV Accuracy: 77.99%
Epoch 4/10, Loss: 7964.3573
DEV Accuracy: 79.82%
Epoch 5/10, Loss: 5970.2712
DEV Accuracy: 79.92%
Epoch 6/10, Loss: 4998.0739
DEV Accuracy: 80.15%
Epoch 7/10, Loss: 3940.5087
DEV Accuracy: 79.13%
Epoch 8/10, Loss: 3083.0431
DEV Accuracy: 80.61%
Epoch 9/10, Loss: 2514.0898
DEV Accuracy: 80.19%
Epoch 10/10, Loss

# 1.2 Feature Engineering

In [3]:
class POSTaggingDataset2(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, context_window):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window

    def __len__(self):
        return sum(len(sentence) for sentence in self.sentences)

    def __getitem__(self, idx):
        for sentence in self.sentences:
            if idx < len(sentence):
                break
            idx -= len(sentence)

        context_indices = []
        for i in range(-self.context_window, self.context_window + 1):
            position = idx + i
            if position < 0:
                word = START_TOKEN
            elif position >= len(sentence):
                word = END_TOKEN
            else:
                word, _ = sentence[position]
            context_indices.append(self.word_to_ix.get(word, self.word_to_ix[UNKNOWN_TOKEN]))

        # Extract additional features for the center word
        center_word, target_tag = sentence[idx]
        target_index = self.tag_to_ix[target_tag]

        # Feature 1: Capitalization (binary)
        is_capitalized = 1 if center_word[0].isupper() else 0

        # Feature 2: Suffixes (binary for common suffixes)
        has_suffix_ing = 1 if center_word.endswith("ing") else 0
        has_suffix_ed = 1 if center_word.endswith("ed") else 0
        has_suffix_ly = 1 if center_word.endswith("ly") else 0

        # Feature 3: Special characters (binary)
        contains_special_char = 1 if any(char in "!@#?&" for char in center_word) else 0


        features = [is_capitalized, has_suffix_ing, has_suffix_ed, has_suffix_ly, contains_special_char]
        # print(f"Features for word '{center_word}': {features}")  # Add this print statement

        features_tensor = torch.tensor(features, dtype=torch.float32)

        return torch.tensor(context_indices, dtype=torch.long), features_tensor, torch.tensor(target_index, dtype=torch.long)


class FFNNTagger2(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window, feature_dim):
        super(FFNNTagger2, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window
        self.feature_dim = feature_dim

        # Embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        nn.init.uniform_(self.embeddings.weight, -0.01, 0.01)

        input_size = self.context_window_size * embedding_dim + feature_dim
        self.hidden_layer = nn.Linear(input_size, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def forward(self, inputs, features):

        embeds = self.embeddings(inputs).view(-1)

        with torch.no_grad():
            features = features.squeeze(0)

        # Concatenate the features with the embeddings
        combined_input = torch.cat((embeds, features), dim=0)

        # Pass through the hidden layer
        hidden_out = torch.tanh(self.hidden_layer(combined_input))

        # Output layer
        tag_scores = self.output_layer(hidden_out)

        return tag_scores


def train_model_features(train_data_path, dev_data_path, test_data_path, context_window):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    LEARNING_RATE = 0.02
    EPOCHS = 10
    FEATURE_DIM = 5

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset2(train_sentences, word_to_ix, tag_to_ix, context_window)
    dev_dataset = POSTaggingDataset2(dev_sentences, word_to_ix, tag_to_ix, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTagger2(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window, FEATURE_DIM)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop with early stopping based on DEV accuracy
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for batch_idx, (context_indices, features, target_index) in enumerate(train_loader):
            # print(f"Processing batch {batch_idx}")  # Print to verify batches are being processed
            # print(f"Context indices: {context_indices}")  # Print the context indices
            # print(f"Features: {features}")  # Print the features extracted
            # print(f"Target index: {target_index}")  # Print the target index

            model.zero_grad()

            log_probs = model(context_indices, features)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_features(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset2(test_sentences, word_to_ix, tag_to_ix, context_window)
    test_accuracy = evaluate_model_features(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")


def evaluate_model_features(model, dataset):
    model.eval()

    correct_predictions = 0

    with torch.no_grad():
        for context_indices, features, target_index in DataLoader(dataset):
            log_probs = model(context_indices, features)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

print("w = 0")
train_model_features('twpos-train.tsv',
            'twpos-dev.tsv',
            'twpos-devtest.tsv',
            context_window=0)
print()
print("w = 1")
train_model_features('twpos-train.tsv',
            'twpos-dev.tsv',
            'twpos-devtest.tsv',
            context_window=1)

w = 0
Epoch 1/10, Loss: 36689.1398
DEV Accuracy: 49.57%
Epoch 2/10, Loss: 20098.2078
DEV Accuracy: 69.07%
Epoch 3/10, Loss: 13695.4710
DEV Accuracy: 75.46%
Epoch 4/10, Loss: 10728.6314
DEV Accuracy: 77.93%
Epoch 5/10, Loss: 8200.0050
DEV Accuracy: 79.34%
Epoch 6/10, Loss: 6750.3968
DEV Accuracy: 79.07%
Epoch 7/10, Loss: 5968.8906
DEV Accuracy: 79.65%
Epoch 8/10, Loss: 5466.1237
DEV Accuracy: 79.38%
Epoch 9/10, Loss: 5151.2382
DEV Accuracy: 77.04%
Epoch 10/10, Loss: 4894.4456
DEV Accuracy: 79.26%
Final DEVTEST Accuracy: 81.05%

w = 1
Epoch 1/10, Loss: 35523.9398
DEV Accuracy: 58.78%
Epoch 2/10, Loss: 15516.8664
DEV Accuracy: 77.66%
Epoch 3/10, Loss: 8552.6263
DEV Accuracy: 81.00%
Epoch 4/10, Loss: 5794.2820
DEV Accuracy: 81.44%
Epoch 5/10, Loss: 4566.8702
DEV Accuracy: 81.60%
Epoch 6/10, Loss: 3631.7004
DEV Accuracy: 82.00%
Epoch 7/10, Loss: 2967.7965
DEV Accuracy: 81.75%
Epoch 8/10, Loss: 2314.1690
DEV Accuracy: 81.77%
Epoch 9/10, Loss: 1878.1420
DEV Accuracy: 81.25%
Epoch 10/10, Loss:

# 1.3 .1 Training the pre trained embeddings

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

# Special tokens
START_TOKEN = "<s>"
END_TOKEN = "</s>"
UNKNOWN_TOKEN = "UUUNKKK"

# Load data from file
def load_data(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                word, pos = line.split('\t')
                sentence.append((word, pos))
        if sentence:
            sentences.append(sentence)
    return sentences

# Load pretrained embeddings from twitter-embeddings.txt
def load_pretrained_embeddings(filepath, embedding_dim):
    embeddings = {}
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector

    # Initialize special tokens
    embeddings[START_TOKEN] = embeddings.get(END_TOKEN)
    embeddings[UNKNOWN_TOKEN] = embeddings.get(UNKNOWN_TOKEN, np.random.uniform(-0.01, 0.01, embedding_dim))

    return embeddings

class POSTaggingDataset3(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, pretrained_embeddings, embedding_dim, context_window):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window
        self.embedding_dim = embedding_dim
        self.pretrained_embeddings = pretrained_embeddings

    def __len__(self):
        return sum(len(sentence) for sentence in self.sentences)

    def __getitem__(self, idx):
        for sentence in self.sentences:
            if idx < len(sentence):
                break
            idx -= len(sentence)

        context_indices = []
        for i in range(-self.context_window, self.context_window + 1):
            position = idx + i
            if position < 0:
                word = START_TOKEN
            elif position >= len(sentence):
                word = END_TOKEN
            else:
                word, _ = sentence[position]
            context_indices.append(self.word_to_ix.get(word, self.word_to_ix[UNKNOWN_TOKEN]))

        # Extract word and POS tag for the center word
        center_word, target_tag = sentence[idx]
        target_index = self.tag_to_ix[target_tag]

        return torch.tensor(context_indices, dtype=torch.long), torch.tensor(target_index, dtype=torch.long)

class FFNNTagger3(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window, pretrained_embeddings, word_to_ix):
        super(FFNNTagger3, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window

        # Initialize an embedding layer using pretrained embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize the embedding weights with the pretrained embeddings
        pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
        self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Hidden layer
        self.hidden_layer = nn.Linear(self.context_window_size * embedding_dim, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs):
        # Get the embeddings for the input word indices
        embeds = self.embeddings(inputs).view(-1)

        # Pass through the hidden layer
        hidden_out = torch.tanh(self.hidden_layer(embeds))

        # Output layer
        tag_scores = self.output_layer(hidden_out)
        return tag_scores


def train_model_pretrained(train_data_path, dev_data_path, test_data_path, embedding_path, context_window):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset3(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    dev_dataset = POSTaggingDataset3(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTagger3(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window, pretrained_embeddings, word_to_ix)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_pretrained(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset3(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    test_accuracy = evaluate_model_pretrained(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_pretrained(model, dataset):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for context_indices, target_index in DataLoader(dataset):
            log_probs = model(context_indices)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

print("w = 0")
train_model_pretrained('twpos-train.tsv',
                       'twpos-dev.tsv',
                       'twpos-devtest.tsv',
                       'twitter-embeddings.txt',
                       context_window=0)
print()
print("w = 1")
train_model_pretrained('twpos-train.tsv',
                       'twpos-dev.tsv',
                       'twpos-devtest.tsv',
                       'twitter-embeddings.txt',
                       context_window=1)

w = 0
Epoch 1/10, Loss: 17579.4120
DEV Accuracy: 76.77%
Epoch 2/10, Loss: 7775.4920
DEV Accuracy: 77.39%
Epoch 3/10, Loss: 5768.5907
DEV Accuracy: 76.37%
Epoch 4/10, Loss: 4904.8336
DEV Accuracy: 76.10%
Epoch 5/10, Loss: 4576.7104
DEV Accuracy: 76.87%
Epoch 6/10, Loss: 4263.7991
DEV Accuracy: 76.71%
Epoch 7/10, Loss: 4081.6675
DEV Accuracy: 76.93%
Epoch 8/10, Loss: 3896.2557
DEV Accuracy: 76.91%
Epoch 9/10, Loss: 3836.2148
DEV Accuracy: 76.95%
Epoch 10/10, Loss: 3703.6257
DEV Accuracy: 76.81%
Final DEVTEST Accuracy: 77.39%

w = 1
Epoch 1/10, Loss: 15623.8788
DEV Accuracy: 79.03%
Epoch 2/10, Loss: 5944.0927
DEV Accuracy: 81.04%
Epoch 3/10, Loss: 3575.9568
DEV Accuracy: 80.75%
Epoch 4/10, Loss: 2399.1395
DEV Accuracy: 80.71%
Epoch 5/10, Loss: 1686.6299
DEV Accuracy: 82.43%
Epoch 6/10, Loss: 1266.8942
DEV Accuracy: 81.37%
Epoch 7/10, Loss: 962.5539
DEV Accuracy: 82.02%
Epoch 8/10, Loss: 760.1283
DEV Accuracy: 82.93%
Epoch 9/10, Loss: 567.7690
DEV Accuracy: 82.04%
Epoch 10/10, Loss: 455.35

# 1.3 .2 compare

In [13]:
class POSTaggingDataset4(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, pretrained_embeddings, embedding_dim, context_window):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window
        self.embedding_dim = embedding_dim
        self.pretrained_embeddings = pretrained_embeddings

    def __len__(self):
        return sum(len(sentence) for sentence in self.sentences)

    def __getitem__(self, idx):
        for sentence in self.sentences:
            if idx < len(sentence):
                break
            idx -= len(sentence)

        context_indices = []
        for i in range(-self.context_window, self.context_window + 1):
            position = idx + i
            if position < 0:
                word = START_TOKEN
            elif position >= len(sentence):
                word = END_TOKEN
            else:
                word, _ = sentence[position]
            context_indices.append(self.word_to_ix.get(word, self.word_to_ix[UNKNOWN_TOKEN]))

        # Extract word and POS tag for the center word
        center_word, target_tag = sentence[idx]
        target_index = self.tag_to_ix[target_tag]

        return torch.tensor(context_indices, dtype=torch.long), torch.tensor(target_index, dtype=torch.long)

class FFNNTagger4(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window, pretrained_embeddings, word_to_ix):
        super(FFNNTagger4, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window

        # Initialize an embedding layer using pretrained embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize the embedding weights with the pretrained embeddings
        pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
        self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Set embedding layer to not update (freeze embeddings)
        self.embeddings.weight.requires_grad = False

        # Hidden layer
        self.hidden_layer = nn.Linear(self.context_window_size * embedding_dim, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs):
        # Get the embeddings for the input word indices
        embeds = self.embeddings(inputs).view(-1)

        # Pass through the hidden layer
        hidden_out = torch.tanh(self.hidden_layer(embeds))

        # Output layer
        tag_scores = self.output_layer(hidden_out)
        return tag_scores

def train_model_static_embeddings(train_data_path, dev_data_path, test_data_path, embedding_path, context_window):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset4(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    dev_dataset = POSTaggingDataset4(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTagger4(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window, pretrained_embeddings, word_to_ix)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_static_embeddings(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset4(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    test_accuracy = evaluate_model_static_embeddings(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_static_embeddings(model, dataset):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for context_indices, target_index in DataLoader(dataset):
            log_probs = model(context_indices)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

print("w = 0")
train_model_static_embeddings('twpos-train.tsv',
                              'twpos-dev.tsv',
                              'twpos-devtest.tsv',
                              'twitter-embeddings.txt',
                              context_window=0)
print()
print("w = 1")
train_model_static_embeddings('twpos-train.tsv',
                              'twpos-dev.tsv',
                              'twpos-devtest.tsv',
                              'twitter-embeddings.txt',
                              context_window=1)

w = 0
Epoch 1/10, Loss: 19088.1056
DEV Accuracy: 73.12%
Epoch 2/10, Loss: 10974.5385
DEV Accuracy: 74.94%
Epoch 3/10, Loss: 10303.2349
DEV Accuracy: 76.13%
Epoch 4/10, Loss: 9959.6109
DEV Accuracy: 76.21%
Epoch 5/10, Loss: 9741.4381
DEV Accuracy: 76.29%
Epoch 6/10, Loss: 9489.0405
DEV Accuracy: 76.89%
Epoch 7/10, Loss: 9329.2530
DEV Accuracy: 76.79%
Epoch 8/10, Loss: 9127.5487
DEV Accuracy: 76.64%
Epoch 9/10, Loss: 8931.1695
DEV Accuracy: 77.35%
Epoch 10/10, Loss: 8753.0892
DEV Accuracy: 77.43%
Final DEVTEST Accuracy: 77.60%

w = 1
Epoch 1/10, Loss: 17296.4064
DEV Accuracy: 76.42%
Epoch 2/10, Loss: 9349.2565
DEV Accuracy: 77.99%
Epoch 3/10, Loss: 8416.2451
DEV Accuracy: 78.64%
Epoch 4/10, Loss: 7892.2359
DEV Accuracy: 78.10%
Epoch 5/10, Loss: 7484.8794
DEV Accuracy: 79.78%
Epoch 6/10, Loss: 7164.8380
DEV Accuracy: 80.00%
Epoch 7/10, Loss: 6860.9075
DEV Accuracy: 80.00%
Epoch 8/10, Loss: 6593.1387
DEV Accuracy: 79.86%
Epoch 9/10, Loss: 6282.2943
DEV Accuracy: 79.61%
Epoch 10/10, Loss: 6

# 1.3 .3 embeddings with features

In [14]:
class POSTaggingDataset5(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, pretrained_embeddings, embedding_dim, context_window):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window
        self.embedding_dim = embedding_dim
        self.pretrained_embeddings = pretrained_embeddings

    def __len__(self):
        return sum(len(sentence) for sentence in self.sentences)

    def __getitem__(self, idx):
        for sentence in self.sentences:
            if idx < len(sentence):
                break
            idx -= len(sentence)

        context_indices = []
        for i in range(-self.context_window, self.context_window + 1):
            position = idx + i
            if position < 0:
                word = START_TOKEN
            elif position >= len(sentence):
                word = END_TOKEN
            else:
                word, _ = sentence[position]
            context_indices.append(self.word_to_ix.get(word, self.word_to_ix[UNKNOWN_TOKEN]))

        # Extract word and POS tag for the center word
        center_word, target_tag = sentence[idx]
        target_index = self.tag_to_ix[target_tag]

        # Feature 1: Capitalization (binary)
        is_capitalized = 1 if center_word[0].isupper() else 0

        # Feature 2: Suffixes (binary for common suffixes)
        has_suffix_ing = 1 if center_word.endswith("ing") else 0
        has_suffix_ed = 1 if center_word.endswith("ed") else 0
        has_suffix_ly = 1 if center_word.endswith("ly") else 0

        # Feature 3: Special characters (binary)
        contains_special_char = 1 if any(char in "!@#?&" for char in center_word) else 0

        # Combine features into a tensor
        features = [is_capitalized, has_suffix_ing, has_suffix_ed, has_suffix_ly, contains_special_char]
        features_tensor = torch.tensor(features, dtype=torch.float32)

        return torch.tensor(context_indices, dtype=torch.long), features_tensor, torch.tensor(target_index, dtype=torch.long)

class FFNNTagger5(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window, feature_dim, pretrained_embeddings, word_to_ix):
        super(FFNNTagger5, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window
        self.feature_dim = feature_dim

        # Initialize an embedding layer using pretrained embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize the embedding weights with the pretrained embeddings
        pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
        self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Hidden layer: Adjust input to also include the extra feature dimensions
        input_size = self.context_window_size * embedding_dim + feature_dim
        self.hidden_layer = nn.Linear(input_size, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs, features):
        # Get the embeddings for the input word indices
        embeds = self.embeddings(inputs).view(-1)

        # Ensure that features are not updated during backpropagation
        with torch.no_grad():
            features = features.squeeze(0)

        # Concatenate the features with the embeddings
        combined_input = torch.cat((embeds, features), dim=0)

        # Pass through the hidden layer
        hidden_out = torch.tanh(self.hidden_layer(combined_input))

        # Output layer
        tag_scores = self.output_layer(hidden_out)

        return tag_scores


def train_model_features_with_embeddings(train_data_path, dev_data_path, test_data_path, embedding_path, context_window):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    FEATURE_DIM = 5
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset5(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    dev_dataset = POSTaggingDataset5(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTagger5(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window, FEATURE_DIM, pretrained_embeddings, word_to_ix)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, features, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices, features)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_features_with_embeddings(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset5(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    test_accuracy = evaluate_model_features_with_embeddings(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_features_with_embeddings(model, dataset):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for context_indices, features, target_index in DataLoader(dataset):
            log_probs = model(context_indices, features)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

print("w = 0")
train_model_features_with_embeddings('twpos-train.tsv',
                                     'twpos-dev.tsv',
                                     'twpos-devtest.tsv',
                                     'twitter-embeddings.txt',
                                     context_window=0)  # For w=0 (no context window)
print()
print("w = 1")
train_model_features_with_embeddings('twpos-train.tsv',
                                     'twpos-dev.tsv',
                                     'twpos-devtest.tsv',
                                     'twitter-embeddings.txt',
                                     context_window=1)  # For w=1 (context window of size 1)

w = 0
Epoch 1/10, Loss: 17184.7456
DEV Accuracy: 76.62%
Epoch 2/10, Loss: 7406.7630
DEV Accuracy: 76.69%
Epoch 3/10, Loss: 5572.6315
DEV Accuracy: 77.18%
Epoch 4/10, Loss: 4874.6868
DEV Accuracy: 76.98%
Epoch 5/10, Loss: 4432.6920
DEV Accuracy: 77.31%
Epoch 6/10, Loss: 4177.1593
DEV Accuracy: 76.81%
Epoch 7/10, Loss: 4056.4210
DEV Accuracy: 76.56%
Epoch 8/10, Loss: 3940.1602
DEV Accuracy: 77.00%
Epoch 9/10, Loss: 3781.5037
DEV Accuracy: 77.85%
Epoch 10/10, Loss: 3708.0340
DEV Accuracy: 76.71%
Final DEVTEST Accuracy: 77.54%

w = 1
Epoch 1/10, Loss: 15192.2301
DEV Accuracy: 80.83%
Epoch 2/10, Loss: 5682.0385
DEV Accuracy: 80.61%
Epoch 3/10, Loss: 3459.4999
DEV Accuracy: 82.12%
Epoch 4/10, Loss: 2258.5599
DEV Accuracy: 81.50%
Epoch 5/10, Loss: 1680.1238
DEV Accuracy: 82.02%
Epoch 6/10, Loss: 1234.2076
DEV Accuracy: 81.85%
Epoch 7/10, Loss: 921.9547
DEV Accuracy: 82.00%
Epoch 8/10, Loss: 676.8625
DEV Accuracy: 82.56%
Epoch 9/10, Loss: 536.8922
DEV Accuracy: 82.24%
Epoch 10/10, Loss: 435.18

# 1.4.1 Architecture Engineering

In [7]:
class FFNNTaggerLayers(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim1, hidden_dim2, num_hidden_layers, context_window, feature_dim, pretrained_embeddings, word_to_ix):
        super(FFNNTaggerLayers, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window
        self.feature_dim = feature_dim
        self.num_hidden_layers = num_hidden_layers

        # Initialize an embedding layer using pretrained embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize the embedding weights with the pretrained embeddings
        pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
        self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Calculate input size to the first hidden layer
        input_size = self.context_window_size * embedding_dim + feature_dim

        # Define hidden layers based on number of layers
        if num_hidden_layers == 0:
            # No hidden layer, directly connect to the output layer
            self.output_layer = nn.Linear(input_size, tagset_size)
        elif num_hidden_layers == 1:
            self.hidden_layer1 = nn.Linear(input_size, hidden_dim1)
            self.output_layer = nn.Linear(hidden_dim1, tagset_size)
        elif num_hidden_layers == 2:
            self.hidden_layer1 = nn.Linear(input_size, hidden_dim1)
            self.hidden_layer2 = nn.Linear(hidden_dim1, hidden_dim2)
            self.output_layer = nn.Linear(hidden_dim2, tagset_size)

        # Initialize weights for hidden layers and output layer
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

        if num_hidden_layers >= 1:
            nn.init.uniform_(self.hidden_layer1.weight, -0.01, 0.01)
            nn.init.uniform_(self.hidden_layer1.bias, -0.01, 0.01)

        if num_hidden_layers == 2:
            nn.init.uniform_(self.hidden_layer2.weight, -0.01, 0.01)
            nn.init.uniform_(self.hidden_layer2.bias, -0.01, 0.01)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs, features):
        # Get the embeddings for the input word indices
        embeds = self.embeddings(inputs).view(-1)  # Flatten the embeddings

        # Ensure that features are not updated during backpropagation
        with torch.no_grad():
            features = features.squeeze(0)  # Squeeze features to remove batch dimension

        # Concatenate the features with the embeddings
        combined_input = torch.cat((embeds, features), dim=0)

        # Pass through the hidden layers based on the number of layers
        if self.num_hidden_layers == 0:
            hidden_out = combined_input  # No hidden layer
        elif self.num_hidden_layers == 1:
            hidden_out = torch.tanh(self.hidden_layer1(combined_input))
        elif self.num_hidden_layers == 2:
            hidden_out = torch.tanh(self.hidden_layer1(combined_input))
            hidden_out = torch.tanh(self.hidden_layer2(hidden_out))

        # Output layer
        tag_scores = self.output_layer(hidden_out)

        return tag_scores

def train_model_with_layers(train_data_path, dev_data_path, test_data_path, embedding_path, context_window, num_hidden_layers, hidden_dim1, hidden_dim2):
    # Hyperparameters
    EMBEDDING_DIM = 50
    FEATURE_DIM = 5
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset5(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    dev_dataset = POSTaggingDataset5(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTaggerLayers(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, hidden_dim1, hidden_dim2, num_hidden_layers, context_window, FEATURE_DIM, pretrained_embeddings, word_to_ix)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, features, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices, features)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_with_layers(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset5(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    test_accuracy = evaluate_model_with_layers(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_with_layers(model, dataset):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for context_indices, features, target_index in DataLoader(dataset):
            log_probs = model(context_indices, features)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

# 0 Hidden Layers
print("Running with 0 hidden layers")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=0, hidden_dim1=0, hidden_dim2=0)

# 1 Hidden Layer, size 256
print("\nRunning with 1 hidden layer, size 256")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=1, hidden_dim1=256, hidden_dim2=0)

# 1 Hidden Layer, size 512
print("\nRunning with 1 hidden layer, size 512")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=1, hidden_dim1=512, hidden_dim2=0)

# 2 Hidden Layers, first layer 256, second layer 256
print("\nRunning with 2 hidden layers, first layer 256, second layer 256")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=2, hidden_dim1=256, hidden_dim2=256)

# 2 Hidden Layers, first layer 256, second layer 512
print("\nRunning with 2 hidden layers, first layer 256, second layer 512")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=2, hidden_dim1=256, hidden_dim2=512)

# 2 Hidden Layers, first layer 512, second layer 256
print("\nRunning with 2 hidden layers, first layer 512, second layer 256")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=2, hidden_dim1=512, hidden_dim2=256)

# 2 Hidden Layers, first layer 512, second layer 512
print("\nRunning with 2 hidden layers, first layer 512, second layer 512")
train_model_with_layers('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, num_hidden_layers=2, hidden_dim1=512, hidden_dim2=512)


Running with 0 hidden layers
Epoch 1/10, Loss: 14240.5413
DEV Accuracy: 81.52%
Epoch 2/10, Loss: 6555.8713
DEV Accuracy: 82.74%
Epoch 3/10, Loss: 4714.7967
DEV Accuracy: 82.66%
Epoch 4/10, Loss: 3504.5746
DEV Accuracy: 82.66%
Epoch 5/10, Loss: 2683.3231
DEV Accuracy: 82.85%
Epoch 6/10, Loss: 2131.2976
DEV Accuracy: 82.62%
Epoch 7/10, Loss: 1749.2745
DEV Accuracy: 82.49%
Epoch 8/10, Loss: 1446.7813
DEV Accuracy: 82.74%
Epoch 9/10, Loss: 1219.1930
DEV Accuracy: 82.95%
Epoch 10/10, Loss: 1032.9540
DEV Accuracy: 82.78%
Final DEVTEST Accuracy: 84.09%

Running with 1 hidden layer, size 256
Epoch 1/10, Loss: 14760.9555
DEV Accuracy: 81.58%
Epoch 2/10, Loss: 5678.1015
DEV Accuracy: 81.35%
Epoch 3/10, Loss: 3353.3333
DEV Accuracy: 81.83%
Epoch 4/10, Loss: 2343.7730
DEV Accuracy: 82.49%
Epoch 5/10, Loss: 1704.1010
DEV Accuracy: 81.12%
Epoch 6/10, Loss: 1306.5848
DEV Accuracy: 81.93%
Epoch 7/10, Loss: 956.4136
DEV Accuracy: 81.48%
Epoch 8/10, Loss: 756.4993
DEV Accuracy: 81.39%
Epoch 9/10, Loss: 

# 1.4.2 Different Activation Functions

Created the model with 1 hidden layer and width of 128. Since this is more simple and performed very well.

In [8]:
class FFNNTaggerActivation(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, context_window, feature_dim, pretrained_embeddings, word_to_ix, activation_function):
        super(FFNNTaggerActivation, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_window_size = 1 + 2 * context_window
        self.feature_dim = feature_dim  # Number of additional features
        self.activation_function = activation_function  # Activation function choice

        # Initialize an embedding layer using pretrained embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize the embedding weights with the pretrained embeddings
        pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
        self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Hidden layer: Adjust input to also include the extra feature dimensions
        input_size = self.context_window_size * embedding_dim + feature_dim
        self.hidden_layer = nn.Linear(input_size, hidden_dim)
        nn.init.uniform_(self.hidden_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.hidden_layer.bias, -0.01, 0.01)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, tagset_size)
        nn.init.uniform_(self.output_layer.weight, -0.01, 0.01)
        nn.init.uniform_(self.output_layer.bias, -0.01, 0.01)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs, features):
        # Get the embeddings for the input word indices
        embeds = self.embeddings(inputs).view(-1)  # Flatten the embeddings

        # Ensure that features are not updated during backpropagation
        with torch.no_grad():
            features = features.squeeze(0)  # Squeeze features to remove batch dimension

        # Concatenate the features with the embeddings
        combined_input = torch.cat((embeds, features), dim=0)

        # Apply the hidden layer
        hidden_out = self.hidden_layer(combined_input)

        # Apply the selected activation function
        if self.activation_function == 'identity':
            activated_out = hidden_out  # Identity activation (no transformation)
        elif self.activation_function == 'relu':
            activated_out = torch.relu(hidden_out)  # ReLU activation
        elif self.activation_function == 'sigmoid':
            activated_out = torch.sigmoid(hidden_out)  # Sigmoid activation
        else:
            raise ValueError("Unsupported activation function")

        # Output layer
        tag_scores = self.output_layer(activated_out)

        return tag_scores

def train_model_with_activation(train_data_path, dev_data_path, test_data_path, embedding_path, context_window, activation_function):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128  # Single hidden layer with size 128
    FEATURE_DIM = 5
    LEARNING_RATE = 0.02
    EPOCHS = 10

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix[START_TOKEN]
    word_to_ix[END_TOKEN]
    word_to_ix[UNKNOWN_TOKEN]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset5(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    dev_dataset = POSTaggingDataset5(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = FFNNTaggerActivation(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, context_window, FEATURE_DIM, pretrained_embeddings, word_to_ix, activation_function)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, features, target_index in train_loader:
            model.zero_grad()

            log_probs = model(context_indices, features)

            loss = loss_function(log_probs.view(1, -1), target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_with_activation(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset5(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window)
    test_accuracy = evaluate_model_with_activation(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_with_activation(model, dataset):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for context_indices, features, target_index in DataLoader(dataset):
            log_probs = model(context_indices, features)

            predicted_index = torch.argmax(log_probs).item()
            correct_predictions += (predicted_index == target_index.item())

    accuracy_percentage = (correct_predictions / len(dataset)) * 100
    return accuracy_percentage

# Identity activation
print("Running with Identity activation")
train_model_with_activation('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, activation_function='identity')

# ReLU activation
print("\nRunning with ReLU activation")
train_model_with_activation('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, activation_function='relu')

# Logistic sigmoid activation
print("\nRunning with Sigmoid activation")
train_model_with_activation('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', context_window=1, activation_function='sigmoid')

Running with Identity activation
Epoch 1/10, Loss: 15179.4199
DEV Accuracy: 79.98%
Epoch 2/10, Loss: 5749.4629
DEV Accuracy: 80.25%
Epoch 3/10, Loss: 3566.7782
DEV Accuracy: 81.50%
Epoch 4/10, Loss: 2477.3792
DEV Accuracy: 82.08%
Epoch 5/10, Loss: 1901.3412
DEV Accuracy: 81.60%
Epoch 6/10, Loss: 1351.5979
DEV Accuracy: 82.02%
Epoch 7/10, Loss: 1098.5174
DEV Accuracy: 81.62%
Epoch 8/10, Loss: 891.2500
DEV Accuracy: 82.20%
Epoch 9/10, Loss: 679.6741
DEV Accuracy: 81.23%
Epoch 10/10, Loss: 614.7116
DEV Accuracy: 82.08%
Final DEVTEST Accuracy: 83.40%

Running with ReLU activation
Epoch 1/10, Loss: 15884.5369
DEV Accuracy: 79.07%
Epoch 2/10, Loss: 5845.6046
DEV Accuracy: 81.93%
Epoch 3/10, Loss: 3537.6465
DEV Accuracy: 81.25%
Epoch 4/10, Loss: 2398.7428
DEV Accuracy: 81.87%
Epoch 5/10, Loss: 1585.8450
DEV Accuracy: 82.45%
Epoch 6/10, Loss: 1222.4060
DEV Accuracy: 81.73%
Epoch 7/10, Loss: 779.4733
DEV Accuracy: 81.79%
Epoch 8/10, Loss: 644.7292
DEV Accuracy: 82.33%
Epoch 9/10, Loss: 456.2915

# 1.4.3 Experiment with w = 2

In [11]:
print("For context window = 2")
train_model_features_with_embeddings('twpos-train.tsv',
                                     'twpos-dev.tsv',
                                     'twpos-devtest.tsv',
                                     'twitter-embeddings.txt',
                                     context_window=2)

For context window = 2
Epoch 1/10, Loss: 15196.5382
DEV Accuracy: 79.09%
Epoch 2/10, Loss: 5573.0407
DEV Accuracy: 81.21%
Epoch 3/10, Loss: 3132.5023
DEV Accuracy: 80.48%
Epoch 4/10, Loss: 1848.4706
DEV Accuracy: 81.60%
Epoch 5/10, Loss: 1164.6957
DEV Accuracy: 82.49%
Epoch 6/10, Loss: 699.0670
DEV Accuracy: 81.35%
Epoch 7/10, Loss: 410.9679
DEV Accuracy: 82.10%
Epoch 8/10, Loss: 248.2247
DEV Accuracy: 82.27%
Epoch 9/10, Loss: 150.7192
DEV Accuracy: 82.12%
Epoch 10/10, Loss: 104.4585
DEV Accuracy: 82.08%
Final DEVTEST Accuracy: 83.47%


# 1.5 RNN Taggers

In [12]:
class POSTaggingDataset6(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix, pretrained_embeddings, embedding_dim, context_window=1):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.context_window = context_window
        self.embedding_dim = embedding_dim
        self.pretrained_embeddings = pretrained_embeddings

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]

        # Context window setup (for padding if needed)
        context_indices = []
        labels = []

        for word, tag in sentence:
            # Use <UNK> token if word is not in vocab
            word_index = self.word_to_ix.get(word, self.word_to_ix.get("<UNK>", self.word_to_ix.get("UUUNKKK")))
            context_indices.append(word_index)
            labels.append(self.tag_to_ix[tag])

        # Additional feature extraction for each word
        features = []
        for word, _ in sentence:
            # Feature 1: Capitalization (binary)
            is_capitalized = 1 if word[0].isupper() else 0
            # Feature 2: Common suffixes
            has_suffix_ing = 1 if word.endswith("ing") else 0
            has_suffix_ed = 1 if word.endswith("ed") else 0
            has_suffix_ly = 1 if word.endswith("ly") else 0
            # Feature 3: Special characters
            contains_special_char = 1 if any(char in "!@#?&" for char in word) else 0
            features.append([is_capitalized, has_suffix_ing, has_suffix_ed, has_suffix_ly, contains_special_char])

        # Convert lists to tensors
        context_indices = torch.tensor(context_indices, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)
        features = torch.tensor(features, dtype=torch.float32)

        return context_indices, features, labels


class RNNTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers, rnn_type='rnn', bidirectional=False, pretrained_embeddings=None, word_to_ix=None):
        super(RNNTagger, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type

        # Embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        if pretrained_embeddings is not None:
            pretrained_weight = self.initialize_pretrained_weights(vocab_size, embedding_dim, pretrained_embeddings, word_to_ix)
            self.embeddings.weight.data.copy_(torch.tensor(pretrained_weight))

        # Define the RNN (either standard RNN, LSTM, or GRU)
        input_dim = embedding_dim + 5
        if rnn_type == 'rnn':
            self.rnn = nn.RNN(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'lstm':
            self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        else:
            raise ValueError(f"Unsupported RNN type: {rnn_type}")

        # Fully connected layer to predict POS tags
        if bidirectional:
            self.fc = nn.Linear(hidden_dim * 2, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)

    def initialize_pretrained_weights(self, vocab_size, embedding_dim, pretrained_embeddings, word_to_ix):
        pretrained_weight = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))
        for word, idx in word_to_ix.items():
            if word in pretrained_embeddings:
                pretrained_weight[idx] = pretrained_embeddings[word]
        return pretrained_weight

    def forward(self, inputs, features):
        # Embedding layer
        embeds = self.embeddings(inputs)

        # Concatenate features along the embedding dimension
        combined_input = torch.cat((embeds, features), dim=2)

        # RNN layer (RNN/LSTM/GRU)
        if self.rnn_type == 'lstm':
            rnn_out, (hidden, cell) = self.rnn(combined_input)
        else:
            rnn_out, hidden = self.rnn(combined_input)

        # Output layer
        tag_scores = self.fc(rnn_out)

        return tag_scores


def train_model_rnn(train_data_path, dev_data_path, test_data_path, embedding_path, rnn_type='rnn', bidirectional=False):
    # Hyperparameters
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 128
    FEATURE_DIM = 5  # 5 additional features
    LEARNING_RATE = 0.02
    EPOCHS = 10
    NUM_LAYERS = 1

    # Load data
    train_sentences = load_data(train_data_path)
    dev_sentences = load_data(dev_data_path)
    test_sentences = load_data(test_data_path)

    # Load pretrained embeddings
    pretrained_embeddings = load_pretrained_embeddings(embedding_path, EMBEDDING_DIM)

    # Create vocabulary and tag set mappings
    word_to_ix = defaultdict(lambda: len(word_to_ix))
    tag_to_ix = defaultdict(lambda: len(tag_to_ix))

    # Add special tokens to the vocabulary
    word_to_ix["<PAD>"]
    word_to_ix["<UNK>"]
    word_to_ix["<START>"]
    word_to_ix["<END>"]

    # Build vocab and tag index from training data only
    for sentence in train_sentences:
        for word, tag in sentence:
            word_to_ix[word]
            tag_to_ix[tag]

    # Create datasets and dataloaders
    train_dataset = POSTaggingDataset6(train_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window=1)
    dev_dataset = POSTaggingDataset6(dev_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window=1)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

    # Initialize model and optimizer
    model = RNNTagger(len(word_to_ix), len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, rnn_type=rnn_type, bidirectional=bidirectional, pretrained_embeddings=pretrained_embeddings, word_to_ix=word_to_ix)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Training loop
    best_dev_accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for context_indices, features, target_index in train_loader:
            model.zero_grad()

            # Forward pass
            log_probs = model(context_indices, features)
            log_probs = log_probs.view(-1, len(tag_to_ix))
            target_index = target_index.view(-1)

            # Compute loss
            loss = loss_function(log_probs, target_index)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

        # Evaluate on DEV set after each epoch
        dev_accuracy = evaluate_model_rnn(model, dev_dataset)
        print(f"DEV Accuracy: {dev_accuracy:.2f}%")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy

    # Final evaluation on DEVTEST set
    test_dataset = POSTaggingDataset6(test_sentences, word_to_ix, tag_to_ix, pretrained_embeddings, EMBEDDING_DIM, context_window=1)
    test_accuracy = evaluate_model_rnn(model, test_dataset)
    print(f"Final DEVTEST Accuracy: {test_accuracy:.2f}%")

def evaluate_model_rnn(model, dataset):
    model.eval()
    correct_predictions = 0
    total_predictions = 0  # Track total predictions to calculate accuracy correctly

    with torch.no_grad():
        for context_indices, features, target_index in DataLoader(dataset):
            log_probs = model(context_indices, features)  # Output logits for each token in sequence
            log_probs = log_probs.view(-1, log_probs.size(-1))  # Reshape for comparison
            target_index = target_index.view(-1)

            # Print shapes to debug
            # print("log_probs shape:", log_probs.shape)
            # print("target_index shape:", target_index.shape)

            # Get predictions
            predicted_index = torch.argmax(log_probs, dim=1)

            # Compare predictions with true labels
            correct_predictions += (predicted_index == target_index).sum().item()
            total_predictions += target_index.size(0)

    accuracy_percentage = (correct_predictions / total_predictions) * 100
    print(f"Total correct predictions: {correct_predictions}")
    print(f"Total predictions: {total_predictions}")
    print(f"Calculated Accuracy: {accuracy_percentage}%")
    return accuracy_percentage


# Standard RNN
print("Running Standard RNN")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='rnn', bidirectional=False)

# Bidirectional RNN
print("\nRunning Bidirectional RNN")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='rnn', bidirectional=True)

# LSTM
print("\nRunning LSTM")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='lstm', bidirectional=False)

# Bidirectional LSTM
print("\nRunning Bidirectional LSTM")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='lstm', bidirectional=True)

# GRU
print("\nRunning GRU")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='gru', bidirectional=False)

# Bidirectional GRU
print("\nRunning Bidirectional GRU")
train_model_rnn('twpos-train.tsv', 'twpos-dev.tsv', 'twpos-devtest.tsv', 'twitter-embeddings.txt', rnn_type='gru', bidirectional=True)

Running Standard RNN
Epoch 1/10, Loss: 2485.8465
Total correct predictions: 3009
Total predictions: 4821
Calculated Accuracy: 62.41443683883012%
DEV Accuracy: 62.41%
Epoch 2/10, Loss: 1351.8945
Total correct predictions: 3412
Total predictions: 4821
Calculated Accuracy: 70.773698402821%
DEV Accuracy: 70.77%
Epoch 3/10, Loss: 970.3292
Total correct predictions: 3625
Total predictions: 4821
Calculated Accuracy: 75.1918689068658%
DEV Accuracy: 75.19%
Epoch 4/10, Loss: 785.5980
Total correct predictions: 3773
Total predictions: 4821
Calculated Accuracy: 78.26177141671853%
DEV Accuracy: 78.26%
Epoch 5/10, Loss: 669.7385
Total correct predictions: 3826
Total predictions: 4821
Calculated Accuracy: 79.36112839659822%
DEV Accuracy: 79.36%
Epoch 6/10, Loss: 588.5102
Total correct predictions: 3893
Total predictions: 4821
Calculated Accuracy: 80.75088155984236%
DEV Accuracy: 80.75%
Epoch 7/10, Loss: 527.3471
Total correct predictions: 3910
Total predictions: 4821
Calculated Accuracy: 81.103505496