In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [2]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2023-11-09 23:59:07--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py’


2023-11-09 23:59:07 (87.3 MB/s) - ‘conlleval.py’ saved [7502/7502]



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam,SGD,AdamW
from torch.nn.functional import cross_entropy
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
import pandas as pd

In [4]:
import datasets

dataset = datasets.load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [5]:
import itertools
from collections import Counter
# REFERENCE CODE PROVIDED BY SHOUMIK
word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))  # type: ignore

# Remove words below threshold 2
word2idx = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 2
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

In [6]:
dataset = (
    dataset
    .map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        }
    )
)

dataset['train']['input_ids'][:3]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

[[2, 3, 4, 5, 6, 7, 8, 9, 10], [11, 12], [13, 14]]

In [7]:
dataset['train']['tokens'][:3]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22']]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 3453
    })
})

In [9]:
columns_to_remove = ['pos_tags', 'chunk_tags']
for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_remove)

# Rename ner_tags to labels
for split in dataset.keys():
    dataset[split] = dataset[split].rename_column('ner_tags', 'labels')

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3453
    })
})


In [10]:
label2id = dataset["train"].features["labels"].feature
id2label = {id: label for label, id in enumerate(label2id.names)}
id2label['PAD'] = -1
id2label

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8,
 'PAD': -1}

 Task 3: Transformer model

In [38]:
import math
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerNER(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_size=128, nhead=8, ff_size=512, num_layers=6, dropout=0.1, max_seq_length=128):
        super(TransformerNER, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_size)
        self.pos_encoder = PositionalEncoding(emb_size, max_seq_length)
        self.dropout = nn.Dropout(dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model=emb_size, nhead=nhead, dim_feedforward=ff_size, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.classifier = nn.Linear(emb_size, tagset_size)
        self.emb_size = emb_size
    def forward(self, src, src_mask):
        src = self.token_embedding(src) * math.sqrt(self.emb_size)
        src = self.pos_encoder(src)
        src = self.dropout(src)
        output = self.transformer_encoder(src, src_key_padding_mask=src_mask)
        logits = self.classifier(output)
        return logits


In [14]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

def preprocess_data(data):
    input_ids = [torch.tensor(seq) for seq in data['input_ids']]
    labels = [torch.tensor(label) for label in data['labels']]
    return list(zip(input_ids, labels))

def dynamic_padding(batch):
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Dynamic padding in the batch
    inputs_padded = pad_sequence(inputs, batch_first=False)  # Transformer expects (seq_len, batch, feature)
    labels_padded = pad_sequence(labels, batch_first=False, padding_value=-1)  # padding as -1
    # Create the source mask for the transformer
    # `True` values are where the attention should NOT focus (i.e., padding)
    src_mask = (inputs_padded == 0).transpose(0, 1)

    return inputs_padded, labels_padded, src_mask


# Hyperparameters
BATCH_SIZE = 4#32
# Preprocess the train, val, and test data
train_data = preprocess_data(dataset['train'])
val_data = preprocess_data(dataset['validation'])
test_data = preprocess_data(dataset['test'])

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=dynamic_padding, num_workers=2)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=dynamic_padding, num_workers=2)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=dynamic_padding, num_workers=2)

In [15]:
from collections import Counter

vocab_size = max([max(seq) for seq in dataset['train']['input_ids']]) + 1
tagset_size = max([max(seq) for seq in dataset['train']['labels']]) + 1
# Device definition
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
all_labels = [label for sublist in dataset['train']['labels'] for label in sublist]
label_counts = {label: all_labels.count(label) for label in id2label.values() if label != -1}
total_labels = len(all_labels) # We subtract the count of 'PAD' labels
weights = [total_labels / label_counts[id2label[key]] for key in id2label.keys() if key != 'PAD']
weights = [weight / sum(weights) for weight in weights]

weights_tensor = torch.tensor(weights).to(device)

In [16]:
label_counts

{0: 169578,
 1: 6600,
 2: 4528,
 3: 6321,
 4: 3704,
 5: 7140,
 6: 1157,
 7: 3438,
 8: 1155}

In [17]:
weights

[0.0019872122378459785,
 0.05105870861658262,
 0.07442302934395877,
 0.05331236780089311,
 0.09097934040751764,
 0.04719712561196713,
 0.2912597034308084,
 0.09801846331281132,
 0.291764049237615]

In [18]:
sample_batch = next(iter(train_loader))
len(sample_batch)

3

In [45]:
model = TransformerNER(vocab_size, tagset_size)
model

TransformerNER(
  (token_embedding): Embedding(23625, 128)
  (pos_encoder): PositionalEncoding()
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)

In [46]:

LEARNING_RATE = 1e-4 #75e-2 #3e-1 #75e-2#1.0 #5e-1#1e-1
loss_function = torch.nn.CrossEntropyLoss(weight=weights_tensor, ignore_index=-1).to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
# Move model to the device
model = model.to(device)

In [26]:
for param_group in optimizer.param_groups:
        param_group['lr'] = 75e-5

In [51]:
# # Loss function
# # loss_function = torch.nn.CrossEntropyLoss(ignore_index=-1).to(device)
# #120 so far
# EPOCHS = 20#100
# for epoch in range(EPOCHS):
#     model.train()
#     total_loss = 0

#     # Wrap your training loader with tqdm for progress bar
#     for inputs, targets, src_masks in train_loader: #tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
#         optimizer.zero_grad()

#         # Fetch inputs and targets and move them to the current device
#         inputs = inputs.to(device)
#         targets = targets.to(device)
#         src_masks = src_masks.to(device)

#         # Forward pass
#         outputs = model(inputs, src_masks)

#         # Compute loss and backpropagate
#         loss = loss_function(outputs.view(-1, tagset_size), targets.view(-1))
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#     # Validation
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for inputs, targets, src_masks in val_loader:
#             inputs = inputs.to(device)
#             targets = targets.to(device)
#             src_masks = src_masks.to(device)

#             outputs = model(inputs, src_masks)

#             loss = loss_function(outputs.view(-1, tagset_size), targets.view(-1))
#             val_loss += loss.item()

#     print(f"Epoch {epoch + 1}/{EPOCHS}, Training Loss: {total_loss / len(train_loader)} Validation Loss: {val_loss / len(val_loader)}")
from torch.nn.utils import clip_grad_norm_
EPOCHS = 20
max_grad_norm = 1.0  # Gradient clipping to avoid exploding gradients
# best_val_loss = float('inf')  # Initialize best validation loss to infinity

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for inputs, targets, src_masks in train_loader:
        optimizer.zero_grad()
        inputs = inputs.to(device)
        targets = targets.to(device)
        src_masks = src_masks.to(device)

        # Forward pass
        outputs = model(inputs, src_masks)

        # Compute loss and backpropagate
        loss = loss_function(outputs.view(-1, tagset_size), targets.view(-1))
        loss.backward()

        # Clip gradients to prevent exploding gradient issues
        clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        total_loss += loss.item()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets, src_masks in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            src_masks = src_masks.to(device)

            outputs = model(inputs, src_masks)

            loss = loss_function(outputs.view(-1, tagset_size), targets.view(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    # Save model if validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Saved best model at epoch {epoch+1}")

    print(f"Epoch {epoch + 1}/{EPOCHS}, Training Loss: {total_loss / len(train_loader)}, Validation Loss: {avg_val_loss}")



Epoch 1/20, Training Loss: 0.25672982032419006, Validation Loss: 1.3076018528075748
Epoch 2/20, Training Loss: 0.2380272679884781, Validation Loss: 1.3175036643977136
Epoch 3/20, Training Loss: 0.21996602557405748, Validation Loss: 1.507679769863923
Epoch 4/20, Training Loss: 0.21389504836245954, Validation Loss: 1.3976437898990324
Epoch 5/20, Training Loss: 0.20527116575259186, Validation Loss: 1.625894182066957
Epoch 6/20, Training Loss: 0.18477508292132552, Validation Loss: 1.6557512455895216
Epoch 7/20, Training Loss: 0.18166712979951888, Validation Loss: 1.5729816806742916
Epoch 8/20, Training Loss: 0.17552787759987484, Validation Loss: 1.641558542251261
Epoch 9/20, Training Loss: 0.16980772537564712, Validation Loss: 1.570450601972129
Epoch 10/20, Training Loss: 0.17381892107437868, Validation Loss: 1.9158893123859742
Epoch 11/20, Training Loss: 0.15373687123350555, Validation Loss: 1.8380141960133847
Epoch 12/20, Training Loss: 0.1462394629470486, Validation Loss: 1.678482179925

In [34]:
def get_predictions(model, loader, device):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for inputs, _, src_mask in loader: # We don't need targets, but we do need the mask
            inputs = inputs.to(device)
            src_mask = src_mask.to(device)

            # Forward pass, get logits for each token in the sequence
            outputs = model(inputs, src_mask)

            # Get predictions
            predictions = torch.argmax(outputs, dim=2)  # dim=2 because outputs are (seq_length, batch, num_tags)

            # Transpose predictions to match inputs shape
            predictions = predictions.transpose(0, 1)  # Now predictions are (batch, seq_length)

            # Remove padding (convert masks to indices and select non-padded elements)
            for batch_idx, batch in enumerate(predictions):
                # Get the indices where src_mask is False (meaning valid tokens, not padding)
                valid_indices = ~src_mask[batch_idx]
                valid_predictions = batch[valid_indices]
                all_predictions.append(valid_predictions.tolist())

    return all_predictions


In [52]:
val_predictions = get_predictions(model, val_loader, device)
test_predictions = get_predictions(model, test_loader, device)

In [53]:
from conlleval import evaluate
import itertools
# labels = ner_tags
# Map the labels back to their corresponding tag strings
idx2tag  = {id:tag for (tag,id) in id2label.items()}
labels = [
list(map(idx2tag.get, labels))
for labels in dataset['validation']['labels']
]
# This is the prediction by your model
preds = [
list(map(idx2tag.get, labels))
for labels in val_predictions
]
precision, recall, f1 = evaluate(itertools.chain(*labels),itertools.chain(*preds))

processed 51362 tokens with 5942 phrases; found: 5926 phrases; correct: 3983.
accuracy:  67.22%; (non-O)
accuracy:  92.97%; precision:  67.21%; recall:  67.03%; FB1:  67.12
              LOC: precision:  79.77%; recall:  78.77%; FB1:  79.27  1814
             MISC: precision:  74.73%; recall:  76.36%; FB1:  75.54  942
              ORG: precision:  54.21%; recall:  63.39%; FB1:  58.44  1568
              PER: precision:  61.30%; recall:  53.31%; FB1:  57.03  1602


In [54]:
from conlleval import evaluate
import itertools
# labels = ner_tags
# Map the labels back to their corresponding tag strings
idx2tag  = {id:tag for (tag,id) in id2label.items()}
labels = [
list(map(idx2tag.get, labels))
for labels in dataset['test']['labels']
]
# This is the prediction by your model
preds = [
list(map(idx2tag.get, labels))
for labels in test_predictions
]
precision, recall, f1 = evaluate(itertools.chain(*labels),itertools.chain(*preds))

processed 46435 tokens with 5648 phrases; found: 5249 phrases; correct: 3018.
accuracy:  54.23%; (non-O)
accuracy:  90.06%; precision:  57.50%; recall:  53.43%; FB1:  55.39
              LOC: precision:  74.43%; recall:  71.04%; FB1:  72.70  1592
             MISC: precision:  63.24%; recall:  65.67%; FB1:  64.43  729
              ORG: precision:  46.68%; recall:  51.17%; FB1:  48.82  1821
              PER: precision:  47.15%; recall:  32.28%; FB1:  38.33  1107
