### 1.  Reading the CoNLL corpus

In [277]:
def read_data(file_path, has_tags=True):
    sentences = []
    tags = []
    original_sentences = []  
    
    with open(file_path, 'r') as file:
        sentence = []
        tag = []
        original_sentence = []  
        for line in file:
            if line.strip() == "":  
                if sentence:  
                    sentences.append(sentence)
                    if has_tags: 
                        tags.append(tag)
                    original_sentences.append(original_sentence) 
                    sentence = []
                    tag = []
                    original_sentence = []  
                continue
            
            components = line.strip().split()
            if has_tags:
                index, word, ner_tag = components
                tag.append(ner_tag)
            else:
                index, word = components
            
            sentence.append(word)
            original_sentence.append(word) 
        
        if sentence:
            sentences.append(sentence)
            if has_tags: 
                tags.append(tag)
            original_sentences.append(original_sentence) 
    
    if has_tags:
        return sentences, tags, original_sentences
    else:
        return sentences, original_sentences

In [278]:
train_file_path = 'data/train'
dev_file_path = 'data/dev'
test_file_path = 'data/test'

train_sentences, train_tags, train_original_sentences = read_data(train_file_path)
dev_sentences, dev_tags, dev_original_sentences = read_data(dev_file_path)
test_sentences, test_original_sentences = read_data(test_file_path, has_tags=False)



### 2. Datasets and Dataloaders

#### 2.1 Create a Vocabulary and convert Text to Indices

In [279]:
from collections import Counter

def build_vocab(sentences, min_freq=2):
    word_counts = Counter(word for sentence in sentences for word in sentence)
    
    vocab = [word for word, count in word_counts.items() if count >= min_freq]
    
    vocab.append('<UNK>')
    
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    
    return vocab, word_to_idx


vocab, word_to_idx = build_vocab(train_sentences)

print(vocab[:10])
print(word_to_idx['<UNK>']) 

['EU', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.', 'Peter', 'Blackburn']
11983


#### 2.2 Encode Labels

In [280]:
def encode_sentences(sentences, word_to_idx):
    encoded_sentences = []
    
    for sentence in sentences:
        encoded_sentence = [word_to_idx.get(word, word_to_idx['<UNK>']) for word in sentence]
        encoded_sentences.append(encoded_sentence)
    
    return encoded_sentences

train_encoded_sentences = encode_sentences(train_sentences, word_to_idx)
dev_encoded_sentences = encode_sentences(dev_sentences, word_to_idx)
test_encoded_sentences = encode_sentences(test_sentences, word_to_idx)


In [281]:
def build_tag_vocab(tags):
    unique_tags = set(tag for tag_list in tags for tag in tag_list)
    tag_to_idx = {tag: idx for idx, tag in enumerate(unique_tags)}
    return unique_tags, tag_to_idx

unique_tags, tag_to_idx = build_tag_vocab(train_tags)

def encode_tags(tags, tag_to_idx):
    encoded_tags = [[tag_to_idx[tag] for tag in tag_list] for tag_list in tags]
    return encoded_tags

train_encoded_tags = encode_tags(train_tags, tag_to_idx)
dev_encoded_tags = encode_tags(dev_tags, tag_to_idx)


#### 2.3 Create PyTorch Datasets

In [282]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences, tags=None):
        self.sentences = sentences
        self.tags = tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)
        
        if self.tags is not None:
            tag = torch.tensor(self.tags[idx], dtype=torch.long)
            return sentence, tag, len(sentence)
        else:
            return sentence, len(sentence)


train_dataset = NERDataset(train_encoded_sentences, train_encoded_tags)
dev_dataset = NERDataset(dev_encoded_sentences, dev_encoded_tags)
test_dataset = NERDataset(test_encoded_sentences)


In [283]:

from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    if len(batch[0]) == 3: 
        sentences, tags, lengths = zip(*batch)
        sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_idx['<UNK>'])
        tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag_to_idx.get('O', 0))
        return sentences_padded, tags_padded, torch.tensor(lengths)
    else: 
        sentences, lengths = zip(*batch)
        sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_idx['<UNK>'])
        return sentences_padded, torch.tensor(lengths)


#### 2.4 Create DataLoaders

In [284]:
from torch.utils.data import DataLoader

BATCH_SIZE= 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)


### 3. Model 

In [285]:
# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
DROPOUT = 0.33

In [286]:
import torch

device = "mps"

## TASK 2


In [287]:
import gzip
import numpy as np

def load_glove_embeddings(file_path):
    embeddings_index = {}
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings_path = 'glove.6B.100d.gz'
glove_embeddings = load_glove_embeddings(glove_embeddings_path)


In [288]:
def prepare_embeddings_matrix(vocab, word_to_idx, glove_embeddings, embedding_dim):
    num_words = len(vocab)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, idx in word_to_idx.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

embedding_matrix = prepare_embeddings_matrix(vocab, word_to_idx, glove_embeddings, EMBEDDING_DIM)
embedding_matrix_tensor = torch.FloatTensor(embedding_matrix).to(device)



In [289]:
import torch
import torch.nn as nn

class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim, dropout, embeddings):
        super(BiLSTM_NER, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True, bidirectional=True)
        
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(lstm_hidden_dim * 2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, len(tag_to_idx))

    def forward(self, sentence):
        embedded = self.embedding(sentence)
        lstm_out, _ = self.bilstm(embedded)
        
        lstm_out = self.dropout(lstm_out)
        
        linear_out = self.linear(lstm_out)
        elu_out = self.elu(linear_out)
        scores = self.classifier(elu_out)
        return scores

model = BiLSTM_NER(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, embedding_matrix_tensor).to(device)


In [290]:
# Hyperparameters
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau


N_EPOCHS = 40

# Loss Function
loss_function = nn.CrossEntropyLoss()

optimizer = optim.RMSprop(model.parameters(), lr=0.0025, alpha=0.99)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=2)


# Training loop
for epoch in range(N_EPOCHS):
    model.train()
    total_loss = 0
    
    for sentence, tags, lengths in train_loader:
        sentence, tags = sentence.to(device), tags.to(device)
        model.zero_grad()
        
        tag_scores = model(sentence)
        
        tag_scores = tag_scores.view(-1, tag_scores.shape[-1]) 
        tags = tags.view(-1) 
        
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{N_EPOCHS}, Loss: {total_loss/len(train_loader)}")
    
    scheduler.step(total_loss/len(train_loader))

Epoch 1/40, Loss: 0.13455049336147207
Epoch 2/40, Loss: 0.05663739720474619
Epoch 3/40, Loss: 0.035877034489089236
Epoch 4/40, Loss: 0.024050709157030403
Epoch 5/40, Loss: 0.017513726052564026
Epoch 6/40, Loss: 0.012738870379370031
Epoch 7/40, Loss: 0.010463419123125801
Epoch 8/40, Loss: 0.008816287012473901
Epoch 9/40, Loss: 0.0073117489103294376
Epoch 10/40, Loss: 0.006505328522218681
Epoch 11/40, Loss: 0.005903841208056934
Epoch 12/40, Loss: 0.005216828587459155
Epoch 13/40, Loss: 0.004843631145530202
Epoch 14/40, Loss: 0.004306939234626966
Epoch 15/40, Loss: 0.004181493900632282
Epoch 16/40, Loss: 0.004066386770169466
Epoch 17/40, Loss: 0.0036569800873129154
Epoch 18/40, Loss: 0.0037084387664563768
Epoch 19/40, Loss: 0.0031682204449304074
Epoch 20/40, Loss: 0.0032181271551474026
Epoch 21/40, Loss: 0.0029513299575590936
Epoch 22/40, Loss: 0.0030349240246740393
Epoch 23/40, Loss: 0.0029052025128777055
Epoch 24/40, Loss: 0.0028517244742355566
Epoch 25/40, Loss: 0.002917672924855081
Ep

In [291]:
idx_to_vocab = {idx: word for word, idx in word_to_idx.items()}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

def write_predictions_to_file_with_glove(model, data_loader, idx_to_tag, output_file_path, original_sentences):
    model.eval()
    predictions = []
    sentence_counter = 0 

    with torch.no_grad():
        for batch in data_loader:
            if len(batch) == 3: 
                sentences, lengths = batch[0], batch[2]
            else: 
                sentences, lengths = batch[0], batch[1]

            sentences = sentences.to(device)
            outputs = model(sentences)
            predicted_tag_indices = torch.argmax(outputs, dim=2)

            for i, length in enumerate(lengths):
                original_sentence = original_sentences[sentence_counter]
                sentence_counter += 1  

                for j in range(length.item()): 
                    original_word = original_sentence[j] if j < len(original_sentence) else "<PAD>"
                    predicted_tag_index = predicted_tag_indices[i][j].item()
                    predicted_tag = idx_to_tag[predicted_tag_index]
                    predictions.append(f"{j+1} {original_word} {predicted_tag}\n")
                predictions.append("\n")

    with open(output_file_path, 'w') as writer:
        writer.writelines(predictions)

    print(f"Predictions written to {output_file_path}")


In [292]:
output_file_path_dev = 'dev2.out'
write_predictions_to_file_with_glove(model, dev_loader, idx_to_tag, output_file_path_dev, dev_sentences)

output_file_path_test = 'test2.out'
write_predictions_to_file_with_glove(model, test_loader, idx_to_tag, output_file_path_test, test_sentences)


predicted_file_path_glove = output_file_path_dev
gold_standard_file_path = 'data/dev'

!python eval.py -p {predicted_file_path_glove} -g {gold_standard_file_path}

Predictions written to dev2.out
Predictions written to test2.out
processed 51578 tokens with 5942 phrases; found: 5517 phrases; correct: 4649.
accuracy:  96.27%; precision:  84.27%; recall:  78.24%; FB1:  81.14
              LOC: precision:  89.76%; recall:  83.51%; FB1:  86.52  1709
             MISC: precision:  84.22%; recall:  75.27%; FB1:  79.50  824
              ORG: precision:  76.92%; recall:  74.05%; FB1:  75.46  1291
              PER: precision:  84.35%; recall:  77.52%; FB1:  80.79  1693


In [293]:
import torch
torch.save(model.state_dict(), 'blstm2.pt')