In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.

In [None]:
# Fetch the conlleval scoring script; the evaluation cells import `evaluate` from it.
# NOTE(review): the URL points at the `master` branch, so the downloaded script is not pinned.
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2023-11-08 04:26:22--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py’


2023-11-08 04:26:22 (93.7 MB/s) - ‘conlleval.py’ saved [7502/7502]



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam,SGD
from torch.nn.functional import cross_entropy
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
import datasets

# Load the "conll2003" dataset; the recorded output shows train/validation/test splits being generated.
dataset = datasets.load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
import itertools
from collections import Counter
# REFERENCE CODE PROVIDED BY SHOUMIK

# Words seen fewer than this many times in the training split get no id of their
# own; they fall back to '[UNK]' when the tokens are encoded.
MIN_WORD_FREQUENCY = 2

# Occurrence count of every token in the training split.
word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))  # type: ignore

# Assign ids (starting at 2) only to words that reach the frequency threshold.
# The previous version built the filtered dict and then overwrote it with a dict
# over *all* words, so the threshold was never applied and no training token was
# ever mapped to '[UNK]'.
word2idx = {
    word: index
    for index, word in enumerate(
        (
            word
            for word, frequency in word_frequency.items()
            if frequency >= MIN_WORD_FREQUENCY
        ),
        start=2,
    )
}

# Reserved ids: 0 for padding, 1 for out-of-vocabulary words.
word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

In [None]:
def _tokens_to_ids(example):
    """Map the tokens of one example to vocabulary ids, using the '[UNK]' id for unknown words."""
    unk_id = word2idx['[UNK]']
    return {'input_ids': [word2idx.get(token, unk_id) for token in example['tokens']]}


# Add an 'input_ids' column to every split.
dataset = dataset.map(_tokens_to_ids)

# Peek at the first three encoded training sentences.
dataset['train']['input_ids'][:3]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

[[2, 3, 4, 5, 6, 7, 8, 9, 10], [11, 12], [13, 14]]

In [None]:
# Display the splits and their columns.
# NOTE(review): the recorded output already lists 'labels', which only exists after the
# rename cell below has run, so this output was produced out of order.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3453
    })
})

In [None]:
# Drop the tag columns this notebook does not use and expose the NER tags as `labels`.
# Every step is skipped when it has already been applied, so the cell can be re-run
# safely (the recorded run of the unguarded version ended in a ValueError).
columns_to_remove = ['pos_tags', 'chunk_tags']
for split in dataset.keys():
    present = [
        column for column in columns_to_remove
        if column in dataset[split].column_names
    ]
    if present:
        dataset[split] = dataset[split].remove_columns(present)

    # Rename ner_tags to labels
    if 'ner_tags' in dataset[split].column_names:
        dataset[split] = dataset[split].rename_column('ner_tags', 'labels')

print(dataset)

ValueError: ignored

In [None]:
# Feature object describing the tag set; despite its name it is not a dict
# (only its `.names` list is used here).
label2id = dataset["train"].features["labels"].feature
# Despite its name this maps tag string -> integer id (see the displayed output);
# the evaluation cells invert it to get id -> tag.
id2label = {
    tag_name: tag_index
    for tag_index, tag_name in enumerate(label2id.names)
}
# Extra entry for padded label positions (the collate function pads labels with -1).
id2label['PAD'] = -1
id2label

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8,
 'PAD': -1}

 Task 1: Bidirectional LSTM model

In [None]:
class BiLSTMNER(nn.Module):
    """Bidirectional LSTM tagger: embedding -> BiLSTM -> dropout -> linear + ELU -> classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        tagset_size: number of classes scored for every token.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction (the BiLSTM output is 2 * hidden_dim).
        linear_dim: width of the linear layer between the LSTM and the classifier.
        dropout: dropout probability applied to the LSTM output, and between LSTM
            layers when num_layers > 1.
        num_layers: number of stacked LSTM layers.
        padding_idx: vocabulary id used for padding; its embedding row is
            initialised to zeros and receives no gradient.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=256, linear_dim=128, dropout=0.33, num_layers=1, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            # nn.LSTM only applies dropout between stacked layers
                            dropout=dropout if num_layers > 1 else 0,
                            batch_first=True,
                            bidirectional=True)
        self.linear = nn.Linear(2 * hidden_dim, linear_dim)
        self.elu = nn.ELU(alpha=0.75)
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(linear_dim, tagset_size)

    def forward(self, sentence, lengths):
        """Score every token position of a padded batch.

        Args:
            sentence: padded token ids, batch first: (batch, seq_len).
            lengths: true length of every sequence in the batch.

        Returns:
            Scores of shape (batch, tagset_size, seq_len); the class axis is moved
            to position 1, which is the layout the training cell feeds to
            CrossEntropyLoss.
        """
        embedded = self.embedding(sentence)

        # Pack so the LSTM skips the padded positions; lengths must live on the CPU.
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)

        packed_lstm_out, _ = self.lstm(packed_embedded)

        # Unpack back to the input's padded length. Without total_length the time
        # axis would shrink to max(lengths) and could disagree with the targets
        # whenever the batch is padded beyond its longest sequence.
        lstm_out, _ = pad_packed_sequence(
            packed_lstm_out, batch_first=True, total_length=sentence.size(1)
        )
        lstm_out = self.dropout(lstm_out)
        linear_out = self.elu(self.linear(lstm_out))
        tag_space = self.classifier(linear_out)

        return tag_space.permute(0, 2, 1)


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

def preprocess_data(data):
    """Pair up the 'input_ids' and 'labels' columns of `data` as tensors.

    Returns a list with one (input_ids_tensor, labels_tensor) tuple per example.
    """
    pairs = []
    for ids, tags in zip(data['input_ids'], data['labels']):
        pairs.append((torch.tensor(ids), torch.tensor(tags)))
    return pairs

def dynamic_padding(batch):
    """Collate (input_ids, labels) pairs into tensors padded to the batch's longest sequence.

    Inputs are padded with 0 and labels with -1. Returns
    (padded_inputs, padded_labels, lengths), where lengths holds the unpadded
    length of every input sequence.
    """
    token_seqs = []
    tag_seqs = []
    seq_lens = []
    for sample in batch:
        token_seqs.append(sample[0])
        tag_seqs.append(sample[1])
        seq_lens.append(len(sample[0]))

    # Pad only up to the longest sequence in this batch
    padded_tokens = pad_sequence(token_seqs, batch_first=True)
    padded_tags = pad_sequence(tag_seqs, batch_first=True, padding_value=-1)

    return padded_tokens, padded_tags, torch.tensor(seq_lens)


# Hyperparameters
BATCH_SIZE = 32

# Convert every split into a list of (input_ids, labels) tensor pairs.
train_data, val_data, test_data = (
    preprocess_data(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Options shared by all three loaders; only the training loader shuffles.
_loader_options = dict(batch_size=BATCH_SIZE, collate_fn=dynamic_padding, num_workers=2)
train_loader = DataLoader(train_data, shuffle=True, **_loader_options)
val_loader = DataLoader(val_data, **_loader_options)
test_loader = DataLoader(test_data, **_loader_options)

In [None]:
# Materialise the training label column once; it is needed for both the tag-set
# size and the class frequencies.
train_labels = dataset['train']['labels']

# Sizes are derived from the largest id present in the training split.
vocab_size = max(max(seq) for seq in dataset['train']['input_ids']) + 1
tagset_size = max(max(seq) for seq in train_labels) + 1
# Device definition
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Count every training label in a single pass (previously one full list.count
# scan per tag).
all_labels = list(itertools.chain.from_iterable(train_labels))
observed_counts = Counter(all_labels)
label_counts = {
    tag_id: observed_counts[tag_id]
    for tag_id in id2label.values()
    if tag_id != -1
}
# Nothing is subtracted here: the raw labels contain no padding value, so this is
# simply the number of labelled tokens.
total_labels = len(all_labels)

# Inverse-frequency weight per tag. id2label was built with enumerate, so iterating
# it yields ids 0, 1, 2, ... and position i of the list is the weight of class i.
inverse_frequency = [
    total_labels / label_counts[tag_id]
    for tag_name, tag_id in id2label.items()
    if tag_name != 'PAD'
]
# Normalise so the weights sum to 1 (dividing by the max weight was the other
# option considered).
weight_sum = sum(inverse_frequency)
weights = [weight / weight_sum for weight in inverse_frequency]

weights_tensor = torch.tensor(weights).to(device)

In [None]:
# Pull one batch as a sanity check; the collate function returns (inputs, labels, lengths),
# hence the displayed length of 3.
sample_batch = next(iter(train_loader))
len(sample_batch)

3

In [None]:
# Build the tagger with its default hyperparameters; the bare `model` expression displays it.
model = BiLSTMNER(vocab_size, tagset_size)
model

In [None]:
# Step size for plain SGD. Other values tried earlier: 0.75, 0.3, 1.0, 0.1.
LEARNING_RATE = 5e-1

# Move model to the device *before* constructing the optimizer, as the torch.optim
# documentation advises, so the optimizer is built over the on-device parameters.
model = model.to(device)

# Class-weighted cross entropy; targets equal to -1 (the label padding value) are ignored.
loss_function = torch.nn.CrossEntropyLoss(weight=weights_tensor, ignore_index=-1).to(device)

# Optimizer (Adam was tried as an alternative)
optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

In [None]:
# NOTE(review): an earlier note here read "120 so far" -- re-running this cell keeps
# training the same `model` object, so the recorded losses depend on how many times
# the cell was executed before.
EPOCHS = 20


def _batch_loss(inputs, targets, lengths):
    """Run the model on one batch and return its loss tensor.

    `lengths` is deliberately left where it is: BiLSTMNER.forward converts it with
    `lengths.cpu()` before packing, so sending it to the device first would only
    add a transfer that is undone immediately.
    """
    outputs = model(inputs.to(device), lengths)
    return loss_function(outputs, targets.to(device))


for epoch in range(EPOCHS):
    # Training pass
    model.train()
    total_loss = 0

    for inputs, targets, lengths in train_loader:
        optimizer.zero_grad()

        # Compute loss and backpropagate
        loss = _batch_loss(inputs, targets, lengths)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets, lengths in val_loader:
            val_loss += _batch_loss(inputs, targets, lengths).item()

    # Both figures are means over batches, not over tokens.
    print(f"Epoch {epoch + 1}/{EPOCHS}, Training Loss: {total_loss / len(train_loader)} Validation Loss: {val_loss / len(val_loader)}")


Epoch 1/20, Training Loss: 0.005469360117599525 Validation Loss: 2.287004227712331
Epoch 2/20, Training Loss: 0.005165160213441191 Validation Loss: 2.2760457327584573
Epoch 3/20, Training Loss: 0.0043842436359324945 Validation Loss: 2.337395059049817
Epoch 4/20, Training Loss: 0.004425346395035289 Validation Loss: 2.3204613802518588
Epoch 5/20, Training Loss: 0.003768849798078025 Validation Loss: 2.412500930118569
Epoch 6/20, Training Loss: 0.003826590034966718 Validation Loss: 2.3090612552867547
Epoch 7/20, Training Loss: 0.003562867568888286 Validation Loss: 2.3641352345741447
Epoch 8/20, Training Loss: 0.0035704300531498017 Validation Loss: 2.3746119245032515
Epoch 9/20, Training Loss: 0.003481599730786874 Validation Loss: 2.4231946685686325
Epoch 10/20, Training Loss: 0.0029924438989804504 Validation Loss: 2.418409530529567
Epoch 11/20, Training Loss: 0.0033467773290150556 Validation Loss: 2.3444884500752265
Epoch 12/20, Training Loss: 0.0027910100784341555 Validation Loss: 2.43353

In [None]:
def get_predictions(model, loader, device):
    """Return the predicted class ids for every sequence produced by `loader`.

    The model is switched to eval mode and run without gradient tracking. For each
    batch the arg-max over dimension 1 of the model output is taken, and every row
    is cut back to its own length, so the result is a list of plain Python lists
    (one per sequence, in loader order).
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch_inputs, _, batch_lengths in loader:  # targets are not needed here
            scores = model(batch_inputs.to(device), batch_lengths)
            best_tags = torch.argmax(scores, dim=1)

            # Drop the predictions made for padded positions
            for row, n_tokens in zip(best_tags, batch_lengths):
                collected.append(row[:n_tokens].tolist())

    return collected

In [None]:
# Predict tag ids for every validation and test sentence. Neither loader is created with
# shuffle=True, which is what lets the evaluation cells pair predictions with dataset labels by position.
val_predictions = get_predictions(model, val_loader, device)
test_predictions = get_predictions(model, test_loader, device)

In [None]:
from conlleval import evaluate
import itertools
# Invert the tag -> id table so integer ids can be turned back into tag strings.
idx2tag = {tag_id: tag_name for tag_name, tag_id in id2label.items()}
# Gold tags of the validation split, one list per sentence
labels = [
    [idx2tag.get(tag_id) for tag_id in sentence]
    for sentence in dataset['validation']['labels']
]
# Tags predicted by the model, one list per sentence
preds = [
    [idx2tag.get(tag_id) for tag_id in sentence]
    for sentence in val_predictions
]
# conlleval receives two flat tag streams
precision, recall, f1 = evaluate(
    itertools.chain.from_iterable(labels),
    itertools.chain.from_iterable(preds),
)

processed 51362 tokens with 5942 phrases; found: 5347 phrases; correct: 4392.
accuracy:  75.56%; (non-O)
accuracy:  95.06%; precision:  82.14%; recall:  73.91%; FB1:  77.81
              LOC: precision:  82.14%; recall:  85.85%; FB1:  83.95  1920
             MISC: precision:  89.00%; recall:  76.36%; FB1:  82.19  791
              ORG: precision:  79.58%; recall:  67.11%; FB1:  72.82  1131
              PER: precision:  80.47%; recall:  65.74%; FB1:  72.36  1505


In [None]:
from conlleval import evaluate
import itertools
# Invert the tag -> id table so integer ids can be turned back into tag strings.
idx2tag = {tag_id: tag_name for tag_name, tag_id in id2label.items()}
# Gold tags of the test split, one list per sentence
labels = [
    [idx2tag.get(tag_id) for tag_id in sentence]
    for sentence in dataset['test']['labels']
]
# Tags predicted by the model, one list per sentence
preds = [
    [idx2tag.get(tag_id) for tag_id in sentence]
    for sentence in test_predictions
]
# conlleval receives two flat tag streams
precision, recall, f1 = evaluate(
    itertools.chain.from_iterable(labels),
    itertools.chain.from_iterable(preds),
)

processed 46435 tokens with 5648 phrases; found: 5206 phrases; correct: 3665.
accuracy:  68.69%; (non-O)
accuracy:  92.81%; precision:  70.40%; recall:  64.89%; FB1:  67.53
              LOC: precision:  71.74%; recall:  79.14%; FB1:  75.26  1840
             MISC: precision:  74.51%; recall:  64.53%; FB1:  69.16  608
              ORG: precision:  69.66%; recall:  60.69%; FB1:  64.86  1447
              PER: precision:  67.43%; recall:  54.67%; FB1:  60.38  1311


In [None]:
# Persist only the weights (state_dict). The model is moved to the CPU first, and it
# stays on the CPU afterwards because `.to` moves the module in place.
torch.save(model.to('cpu').state_dict(), 'model_hw4_task1.pth')