### 1.  Reading the CoNLL corpus

In [246]:
def read_data(file_path, has_tags=True):
    sentences = []
    tags = []
    original_sentences = []  
    
    with open(file_path, 'r') as file:
        sentence = []
        tag = []
        original_sentence = [] 
        for line in file:
            if line.strip() == "":  
                if sentence:  
                    sentences.append(sentence)
                    if has_tags: 
                        tags.append(tag)
                    original_sentences.append(original_sentence)  
                    sentence = []
                    tag = []
                    original_sentence = [] 
                continue
            
            components = line.strip().split()
            if has_tags:
                index, word, ner_tag = components
                tag.append(ner_tag)
            else:
                index, word = components
            
            sentence.append(word)
            original_sentence.append(word)  
        
        if sentence:
            sentences.append(sentence)
            if has_tags: 
                tags.append(tag)
            original_sentences.append(original_sentence)  
    
    if has_tags:
        return sentences, tags, original_sentences
    else:
        return sentences, original_sentences

In [247]:
train_file_path = 'data/train'
dev_file_path = 'data/dev'
test_file_path = 'data/test'

train_sentences, train_tags, train_original_sentences = read_data(train_file_path)
dev_sentences, dev_tags, dev_original_sentences = read_data(dev_file_path)
test_sentences, test_original_sentences = read_data(test_file_path, has_tags=False)



### 2. Datasets and Dataloaders

#### 2.1 Create a Vocabulary and convert Text to Indices

In [248]:
from collections import Counter

def build_vocab(sentences, min_freq=2):

    word_counts = Counter(word for sentence in sentences for word in sentence)
    

    vocab = [word for word, count in word_counts.items() if count >= min_freq]
    
    vocab.append('<UNK>')
    
  
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    
    return vocab, word_to_idx


vocab, word_to_idx = build_vocab(train_sentences)

print(vocab[:10])
print(word_to_idx['<UNK>'])  

['EU', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.', 'Peter', 'Blackburn']
11983


#### 2.2 Encode Labels

In [249]:
def encode_sentences(sentences, word_to_idx):
    encoded_sentences = []
    
    for sentence in sentences:
        encoded_sentence = [word_to_idx.get(word, word_to_idx['<UNK>']) for word in sentence]
        encoded_sentences.append(encoded_sentence)
    
    return encoded_sentences

train_encoded_sentences = encode_sentences(train_sentences, word_to_idx)
dev_encoded_sentences = encode_sentences(dev_sentences, word_to_idx)
test_encoded_sentences = encode_sentences(test_sentences, word_to_idx)


In [250]:
def build_tag_vocab(tags):
    unique_tags = set(tag for tag_list in tags for tag in tag_list)
    tag_to_idx = {tag: idx for idx, tag in enumerate(unique_tags)}
    return unique_tags, tag_to_idx

unique_tags, tag_to_idx = build_tag_vocab(train_tags)

def encode_tags(tags, tag_to_idx):
    encoded_tags = [[tag_to_idx[tag] for tag in tag_list] for tag_list in tags]
    return encoded_tags

train_encoded_tags = encode_tags(train_tags, tag_to_idx)
dev_encoded_tags = encode_tags(dev_tags, tag_to_idx)


#### 2.3 Create PyTorch Datasets

In [251]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences, tags=None, char_sentences=None):
        self.sentences = sentences
        self.tags = tags
        self.char_sentences = char_sentences  
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)


        if self.tags is not None:
            tag = torch.tensor(self.tags[idx], dtype=torch.long)
            if self.char_sentences is not None:
                char_sentence = torch.tensor(self.char_sentences[idx], dtype=torch.long)
                return sentence, tag, char_sentence, len(sentence)
            else:
                return sentence, tag, len(sentence)
        else:
            if self.char_sentences is not None:
                char_sentence = torch.tensor(self.char_sentences[idx], dtype=torch.long)
                return sentence, char_sentence, len(sentence)
            else:
                return sentence, len(sentence)


train_char_sequences = []
for sentence in train_sentences:
    for word in sentence:
        char_sequence = []
        for char in word:
            if char in word_to_idx:
                char_sequence.append(word_to_idx[char])
            else:
                char_sequence.append(word_to_idx['<UNK>']) 
        train_char_sequences.append(char_sequence)

dev_char_sequences = []
for sentence in dev_sentences:
    for word in sentence:
        char_sequence = []
        for char in word:
            if char in word_to_idx:
                char_sequence.append(word_to_idx[char])
            else:
                char_sequence.append(word_to_idx['<UNK>'])  
        dev_char_sequences.append(char_sequence)

test_char_sequences = []
for sentence in test_sentences:
    for word in sentence:
        char_sequence = []
        for char in word:
            if char in word_to_idx:
                char_sequence.append(word_to_idx[char])
            else:
                char_sequence.append(word_to_idx['<UNK>']) 
        test_char_sequences.append(char_sequence)

train_dataset = NERDataset(train_encoded_sentences, train_encoded_tags, train_char_sequences)
dev_dataset = NERDataset(dev_encoded_sentences, dev_encoded_tags, dev_char_sequences)
test_dataset = NERDataset(test_encoded_sentences, char_sentences=test_char_sequences)


#### 2.4 Create DataLoaders

In [252]:
from torch.utils.data import DataLoader

BATCH_SIZE= 8


In [253]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


def pad_collate(batch):
    if len(batch[0]) == 4: 
        sentences, tags, char_sentences, lengths = zip(*batch)
        tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag_to_idx['O'])
    else:  
        sentences, char_sentences, lengths = zip(*batch)
        tags_padded = None  

    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_idx['<UNK>'])
    char_sentences_padded = pad_sequence(char_sentences, batch_first=True, padding_value=0) 
    
    return (sentences_padded, tags_padded, char_sentences_padded, lengths) if tags_padded is not None else (sentences_padded, char_sentences_padded, lengths)



train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)


### 3. Model 

In [254]:
# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
# DROPOUT = 0.33



In [255]:
import torch

device = "mps"



## TASK 2


In [256]:
import gzip
import numpy as np

def load_glove_embeddings(file_path):
    embeddings_index = {}
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings_path = 'glove.6B.100d.gz'
glove_embeddings = load_glove_embeddings(glove_embeddings_path)


In [257]:
def prepare_embeddings_matrix(vocab, word_to_idx, glove_embeddings, embedding_dim):
    num_words = len(vocab)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, idx in word_to_idx.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

embedding_matrix = prepare_embeddings_matrix(vocab, word_to_idx, glove_embeddings, EMBEDDING_DIM)
embedding_matrix_tensor = torch.FloatTensor(embedding_matrix).to(device)



In [258]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class BiLSTM_CNN_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim, dropout, embeddings, char_vocab_size, char_embedding_dim, num_filters, kernel_sizes):
        super(BiLSTM_CNN_NER, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=char_embedding_dim, out_channels=num_filters, kernel_size=kernel_size, padding=1)
            for kernel_size in kernel_sizes
        ])
        
        self.bilstm = nn.LSTM(embedding_dim + num_filters * len(kernel_sizes), lstm_hidden_dim, batch_first=True, bidirectional=True)
        
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(lstm_hidden_dim*2, output_dim)
        
        self.elu = nn.ELU()
        
        self.classifier = nn.Linear(output_dim, len(tag_to_idx))
        
    def forward(self, sentence, char_sentence):

        embedded = self.embedding(sentence)

        char_embedded = self.char_embedding(char_sentence)
        char_embedded = char_embedded.permute(0, 2, 1) 
        char_conv_outputs = [self.elu(conv(char_embedded)) for conv in self.conv_layers]
        char_pooled = [F.max_pool1d(conv_output, conv_output.size(2)).squeeze(2) for conv_output in char_conv_outputs]
        char_output = torch.cat(char_pooled, dim=1)

        combined_embedded = torch.cat((embedded, char_output.unsqueeze(1).repeat(1, embedded.size(1), 1)), dim=2)

        lstm_out, _ = self.bilstm(combined_embedded)
        
        lstm_out = self.dropout(lstm_out)

        linear_out = self.linear(lstm_out)
        
        elu_out = self.elu(linear_out)
        
        scores = self.classifier(elu_out)
        
        return scores


In [259]:
CHAR_EMBEDDING_DIM = 30
NUM_FILTERS = 125
KERNEL_SIZES = [1, 2]  
DROPOUT = 0.5

char_vocab = set()
for sentence in train_sentences:
    for word in sentence:
        for char in word:
            char_vocab.add(char)
            
char_vocab_size = len(char_vocab)

model = BiLSTM_CNN_NER(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, embedding_matrix_tensor,
                       len(char_vocab), CHAR_EMBEDDING_DIM, NUM_FILTERS, KERNEL_SIZES).to(device)


In [260]:
# Hyperparameters
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau


N_EPOCHS = 20

# Loss Function
loss_function = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.SGD(model.parameters(), lr=1)

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

for epoch in range(N_EPOCHS):
    model.train()
    total_loss = 0
    
    for sentence, tags, char_sentence, lengths in train_loader:
        sentence, tags, char_sentence = sentence.to(device), tags.to(device), char_sentence.to(device)
        model.zero_grad()
        
        tag_scores = model(sentence, char_sentence)
        
        tag_scores = tag_scores.view(-1, tag_scores.shape[-1]) 
        tags = tags.view(-1)  
        
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{N_EPOCHS}, Loss: {total_loss/len(train_loader)}")
    
    scheduler.step(total_loss/len(train_loader))

Epoch 1/20, Loss: 0.1845587582904905
Epoch 2/20, Loss: 0.12110883954976036
Epoch 3/20, Loss: 0.09920751727101658
Epoch 4/20, Loss: 0.08324684729012427
Epoch 5/20, Loss: 0.07344832893792966
Epoch 6/20, Loss: 0.0650795348627958
Epoch 7/20, Loss: 0.05706630968175661
Epoch 8/20, Loss: 0.051433596504716
Epoch 9/20, Loss: 0.045055111765510775
Epoch 10/20, Loss: 0.04209915597676106
Epoch 11/20, Loss: 0.03824761379713922
Epoch 12/20, Loss: 0.03505686715116904
Epoch 13/20, Loss: 0.03272666550936113
Epoch 14/20, Loss: 0.02948963659299231
Epoch 15/20, Loss: 0.026937129281593183
Epoch 16/20, Loss: 0.0252216634872496
Epoch 17/20, Loss: 0.023576928900788413
Epoch 18/20, Loss: 0.02205564538088821
Epoch 19/20, Loss: 0.02029309493741178
Epoch 20/20, Loss: 0.019358200460580812


In [261]:
idx_to_vocab = {idx: word for word, idx in word_to_idx.items()}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

def write_predictions_to_file(model, data_loader, idx_to_tag, output_file_path, original_sentences):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(data_loader):
            if len(batch) == 4: 
                sentence_tensors, _, char_sentence_tensors, lengths = batch
            else:  
                sentence_tensors, char_sentence_tensors, lengths = batch

            sentence_tensors = sentence_tensors.to(device)
            char_sentence_tensors = char_sentence_tensors.to(device)
            outputs = model(sentence_tensors, char_sentence_tensors)
            predicted_tag_indices = torch.argmax(outputs, dim=2)

            for i, length in enumerate(lengths):
                original_sentence = original_sentences[batch_idx * data_loader.batch_size + i]
                for j in range(length):
                    original_word = original_sentence[j]
                    predicted_tag_index = predicted_tag_indices[i][j].item()
                    predicted_tag = idx_to_tag[predicted_tag_index]
                    predictions.append(f"{j+1} {original_word} {predicted_tag}\n")
                predictions.append("\n")

    with open(output_file_path, 'w') as writer:
        writer.writelines(predictions)

    print(f"Predictions written to {output_file_path}")

In [262]:
dev_output_file_path = 'dev3.out'
write_predictions_to_file(model, dev_loader, idx_to_tag, dev_output_file_path, dev_sentences)

test_output_file_path = 'test3.out'
write_predictions_to_file(model, test_loader, idx_to_tag, test_output_file_path, test_sentences)

predicted_file_path_glove = output_file_path_glove
gold_standard_file_path = 'data/dev'

!python eval.py -p {dev_output_file_path} -g {gold_standard_file_path}

Predictions written to dev3.out
Predictions written to test3.out
processed 51578 tokens with 5942 phrases; found: 5426 phrases; correct: 4455.
accuracy:  95.90%; precision:  82.10%; recall:  74.97%; FB1:  78.38
              LOC: precision:  91.01%; recall:  82.14%; FB1:  86.35  1658
             MISC: precision:  78.39%; recall:  75.16%; FB1:  76.74  884
              ORG: precision:  75.35%; recall:  68.83%; FB1:  71.94  1225
              PER: precision:  80.17%; recall:  72.20%; FB1:  75.98  1659


In [263]:
import torch
torch.save(model.state_dict(), 'blstm3.pt')