## TASK1

### 1.  Reading the CoNLL corpus

In [297]:
def read_data(file_path, has_tags=True):
    """Read a CoNLL-style file into per-sentence word (and tag) lists.

    Every non-blank line is split on whitespace into ``index word tag`` when
    ``has_tags`` is true, or ``index word`` otherwise. Blank (or
    whitespace-only) lines end the current sentence; consecutive blank lines
    do not produce empty sentences.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.
        has_tags: Whether each token line carries a third (tag) column.

    Returns:
        ``(sentences, tags, original_sentences)`` when ``has_tags`` is true,
        otherwise ``(sentences, original_sentences)``. ``sentences`` and
        ``original_sentences`` hold equal but independent word lists.

    Raises:
        ValueError: If a token line does not have the expected column count.
    """
    expected_columns = 3 if has_tags else 2

    sentences = []
    tags = []
    original_sentences = []

    sentence = []
    tag = []

    def _flush():
        # Store the sentence collected so far (if any) and reset the buffers.
        if not sentence:
            return
        sentences.append(list(sentence))
        original_sentences.append(list(sentence))
        if has_tags:
            tags.append(list(tag))
        sentence.clear()
        tag.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            components = line.split()
            if not components:
                # Blank line: sentence boundary.
                _flush()
                continue

            if len(components) != expected_columns:
                raise ValueError(
                    f"{file_path}:{line_number}: expected {expected_columns} "
                    f"columns, got {len(components)}: {line.rstrip()!r}"
                )

            sentence.append(components[1])
            if has_tags:
                tag.append(components[2])

    # The file may not end with a blank line; keep the trailing sentence.
    _flush()

    if has_tags:
        return sentences, tags, original_sentences
    return sentences, original_sentences

In [298]:
# Corpus locations, relative to the notebook's working directory.
train_file_path = 'data/train'
dev_file_path = 'data/dev'
test_file_path = 'data/test'

# The train and dev files are read with a tag column; the test file is read
# without one (has_tags=False), so read_data returns two values for it.
train_sentences, train_tags, train_original_sentences = read_data(train_file_path)
dev_sentences, dev_tags, dev_original_sentences = read_data(dev_file_path)
test_sentences, test_original_sentences = read_data(test_file_path, has_tags=False)



### 2. Datasets and Dataloaders

#### 2.1 Create a Vocabulary and convert Text to Indices

In [299]:
from collections import Counter

def build_vocab(sentences, min_freq=2):
    """Collect the words seen at least ``min_freq`` times, plus ``'<UNK>'``.

    Args:
        sentences: Iterable of sentences, each an iterable of words.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        ``(vocab, word_to_idx)``: the kept words in first-seen order with
        ``'<UNK>'`` appended last, and a dict mapping each entry of ``vocab``
        to its position in that list.
    """
    frequencies = Counter()
    for tokens in sentences:
        for token in tokens:
            frequencies[token] += 1

    kept = []
    for token, seen in frequencies.items():
        if seen >= min_freq:
            kept.append(token)

    # Placeholder entry used for every word that is not in the vocabulary.
    kept.append('<UNK>')

    lookup = dict(zip(kept, range(len(kept))))
    return kept, lookup

# The vocabulary is built from the training sentences only.
vocab, word_to_idx = build_vocab(train_sentences)

#### 2.2 Encode Sentences and Labels

In [300]:
def encode_sentences(sentences, word_to_idx):
    """Replace every word with its vocabulary index.

    Args:
        sentences: Iterable of sentences, each an iterable of words.
        word_to_idx: Mapping from word to index; it must contain ``'<UNK>'``,
            whose index is used for any word missing from the mapping.

    Returns:
        A list with one list of indices per input sentence.
    """
    def to_index(token):
        # Fall back to the '<UNK>' index for out-of-vocabulary words.
        return word_to_idx.get(token, word_to_idx['<UNK>'])

    return [list(map(to_index, tokens)) for tokens in sentences]

# All three splits are encoded with the same word_to_idx mapping; words that
# are missing from it are mapped to the '<UNK>' index by encode_sentences.
train_encoded_sentences = encode_sentences(train_sentences, word_to_idx)
dev_encoded_sentences = encode_sentences(dev_sentences, word_to_idx)
test_encoded_sentences = encode_sentences(test_sentences, word_to_idx)


In [301]:
def build_tag_vocab(tags):
    """Build the set of distinct tags and a deterministic tag-to-index map.

    Indices are assigned in sorted tag order. Enumerating the set directly
    would give an order that can change between interpreter runs (string
    hashing is randomised), which would make a saved model's output indices
    impossible to decode reliably in a later session.

    Args:
        tags: Iterable of per-sentence tag lists.

    Returns:
        ``(unique_tags, tag_to_idx)``: the set of distinct tags and a dict
        mapping each tag to its index in sorted order.
    """
    unique_tags = {tag for tag_list in tags for tag in tag_list}
    tag_to_idx = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
    return unique_tags, tag_to_idx

# The tag inventory is taken from the training tags only.
unique_tags, tag_to_idx = build_tag_vocab(train_tags)

def encode_tags(tags, tag_to_idx):
    """Replace every tag with its index.

    Args:
        tags: Iterable of per-sentence tag lists.
        tag_to_idx: Mapping from tag to index.

    Returns:
        A list with one list of indices per input tag list.

    Raises:
        KeyError: If a tag is not present in ``tag_to_idx``.
    """
    encoded_tags = []
    for tag_list in tags:
        encoded_tags.append([tag_to_idx[label] for label in tag_list])
    return encoded_tags

# encode_tags raises KeyError for a tag missing from tag_to_idx, so every dev
# tag must also occur in the training data.
train_encoded_tags = encode_tags(train_tags, tag_to_idx)
dev_encoded_tags = encode_tags(dev_tags, tag_to_idx)


#### 2.3 Create PyTorch Datasets

In [302]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    """Dataset over index-encoded sentences with optional index-encoded tags.

    Items are ``(sentence, tag, length, index)`` when tags were supplied and
    ``(sentence, length, index)`` otherwise. ``sentence`` and ``tag`` are
    ``torch.long`` tensors, ``length`` is the number of tokens in the
    sentence, and ``index`` is the item's position in the input list.
    """

    def __init__(self, sentences, tags=None):
        self.sentences, self.tags = sentences, tags
        # Position of every sentence in the input list, handed back with each item.
        self.indices = [position for position in range(len(sentences))]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        token_ids = torch.tensor(self.sentences[idx], dtype=torch.long)
        original_position = self.indices[idx]

        if self.tags is None:
            return token_ids, len(token_ids), original_position

        tag_ids = torch.tensor(self.tags[idx], dtype=torch.long)
        return token_ids, tag_ids, len(token_ids), original_position


# The test dataset is built without tags, so its items are 3-tuples
# (sentence, length, index) instead of 4-tuples.
train_dataset = NERDataset(train_encoded_sentences, train_encoded_tags)
dev_dataset = NERDataset(dev_encoded_sentences, dev_encoded_tags)
test_dataset = NERDataset(test_encoded_sentences)


In [303]:
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    """Collate ``NERDataset`` items into padded batch tensors.

    Items are either ``(sentence, tag, length, index)`` or, for untagged data,
    ``(sentence, length, index)``. The length and index are read from the
    position that matches the item layout; reading the length from position 2
    unconditionally would return the index for untagged items.

    Sentences are padded with the ``'<UNK>'`` word index and tags with the
    ``'O'`` tag index (both looked up in the notebook-level ``word_to_idx`` /
    ``tag_to_idx``), so padded positions cannot be told apart from real ones
    by value; use ``lengths`` to identify them.

    Args:
        batch: Non-empty list of items that all share the same layout.

    Returns:
        ``(sentences_padded, tags_padded, lengths, indices)``.
        ``tags_padded`` is ``None`` for untagged batches; ``lengths`` and
        ``indices`` are 1-D tensors with one entry per item.
    """
    has_tags = len(batch[0]) == 4

    sentences = [item[0] for item in batch]
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_idx['<UNK>'])

    if has_tags:
        tags = [item[1] for item in batch]
        tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag_to_idx['O'])
        length_position, index_position = 2, 3
    else:
        tags_padded = None
        length_position, index_position = 1, 2

    lengths = torch.tensor([item[length_position] for item in batch])
    indices = torch.tensor([item[index_position] for item in batch])

    return sentences_padded, tags_padded, lengths, indices





#### 2.4 Create DataLoaders

In [304]:
from torch.utils.data import DataLoader

# Number of sentences per batch for all three loaders.
BATCH_SIZE= 8
# Only the training loader shuffles; dev and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)


### 3. Model 

In [305]:
import torch

# Pick the compute device: Apple's Metal backend when present (the notebook's
# original target), otherwise CUDA, otherwise the CPU. The getattr guard keeps
# this working on PyTorch builds that have no `torch.backends.mps` module.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [306]:
import torch.nn as nn

class BiLSTM_NER(nn.Module):
    """Token tagger: embedding -> BiLSTM -> linear -> ELU -> classifier.

    Dropout is applied to the embeddings and to the BiLSTM output. The forward
    pass returns one score per tag for every position of every sentence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        lstm_hidden_dim: Hidden size of the LSTM per direction.
        output_dim: Width of the linear layer between BiLSTM and classifier.
        dropout_rate: Dropout probability.
        num_tags: Number of output classes. When omitted, the size of the
            notebook-level ``tag_to_idx`` is used, so existing five-argument
            calls behave as before.
    """

    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim, dropout_rate, num_tags=None):
        super().__init__()
        if num_tags is None:
            num_tags = len(tag_to_idx)

        # Submodule names and creation order are kept as they were, so
        # state_dict keys and seeded initialisation are unaffected.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True,
                              bidirectional=True)
        # The BiLSTM concatenates both directions, hence the doubled input width.
        self.linear = nn.Linear(lstm_hidden_dim*2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, num_tags)

    def forward(self, sentence):
        """Return tag scores for a batch-first tensor of word indices."""
        embedded = self.embedding(sentence)
        embedded = self.dropout(embedded)
        lstm_out, _ = self.bilstm(embedded)
        lstm_out = self.dropout(lstm_out)
        linear_out = self.linear(lstm_out)
        elu_out = self.elu(linear_out)
        scores = self.classifier(elu_out)
        return scores

#### 3.1 Initializing hyperparameters

In [307]:
EMBEDDING_DIM = 100  # size of each word embedding
HIDDEN_DIM = 256     # LSTM hidden size per direction
OUTPUT_DIM = 128     # width of the linear layer placed before the classifier
DROPOUT = 0.33       # dropout probability

# The embedding table gets one row per vocabulary entry (including '<UNK>').
model = BiLSTM_NER(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT).to(device)


In [308]:
import torch.optim as optim

# NOTE(review): no ignore_index is set, and pad_collate pads tag sequences with
# the 'O' index, so padded positions are scored as ordinary 'O' targets.
loss_function = nn.CrossEntropyLoss()

# Plain SGD over all model parameters with an initial learning rate of 1.
optimizer = optim.SGD(model.parameters(), lr=1)

In [309]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Multiply the learning rate by 0.75 once the monitored value (passed to
# scheduler.step) has failed to decrease for more than 6 consecutive epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=6)


### 4. Training the model

In [310]:
# Number of passes over the training loader.
N_EPOCHS = 30

for epoch in range(N_EPOCHS):
    model.train()  # training mode, so the dropout layers are active
    total_loss = 0
    
    # Batches come from pad_collate; the lengths and index tensors are not used here.
    for sentence, tags, lengths, _ in train_loader:  
        sentence, tags = sentence.to(device), tags.to(device)
        model.zero_grad()
        
        tag_scores = model(sentence)
        
        # Collapse all leading dimensions so scores are (positions, classes) and
        # tags are (positions,), the layout CrossEntropyLoss takes. Padded
        # positions are included.
        tag_scores = tag_scores.view(-1, tag_scores.shape[-1])
        tags = tags.view(-1)
        
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{N_EPOCHS}, Loss: {total_loss/len(train_loader)}")
    
    # The scheduler monitors the mean training loss, not a dev-set metric.
    scheduler.step(total_loss/len(train_loader))



Epoch 1/30, Loss: 0.2591114677866501
Epoch 2/30, Loss: 0.18519444295116716
Epoch 3/30, Loss: 0.1515665519465727
Epoch 4/30, Loss: 0.12803727640445356
Epoch 5/30, Loss: 0.11448936686943521
Epoch 6/30, Loss: 0.10434963478256522
Epoch 7/30, Loss: 0.09812651221688118
Epoch 8/30, Loss: 0.09198295711335928
Epoch 9/30, Loss: 0.08548992523787347
Epoch 10/30, Loss: 0.08288807847801488
Epoch 11/30, Loss: 0.0771229753947619
Epoch 12/30, Loss: 0.07527470086197287
Epoch 13/30, Loss: 0.07226784489581733
Epoch 14/30, Loss: 0.06932442297283341
Epoch 15/30, Loss: 0.06702786403421905
Epoch 16/30, Loss: 0.06430369081410998
Epoch 17/30, Loss: 0.0634121158669167
Epoch 18/30, Loss: 0.05967053835600288
Epoch 19/30, Loss: 0.058559700190919585
Epoch 20/30, Loss: 0.05607296030791009
Epoch 21/30, Loss: 0.05589241783453694
Epoch 22/30, Loss: 0.05291236705582076
Epoch 23/30, Loss: 0.05369510544263216
Epoch 24/30, Loss: 0.05181910722526913
Epoch 25/30, Loss: 0.05054031701357449
Epoch 26/30, Loss: 0.0496195637520473

#### 4.1 Writing predictions to a file

In [311]:
# Inverse lookup tables: index -> word and index -> tag.
idx_to_vocab = {idx: word for word, idx in word_to_idx.items()}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

def write_predictions_to_file(model, data_loader, idx_to_tag, output_file_path, original_sentences, original_sentence_lengths):
    """Tag every sentence served by ``data_loader`` and write ``index word tag`` lines.

    Each sentence is written as one line per token, ``"<position> <word> <tag>"``
    with positions starting at 1, followed by a blank line. The file is
    written as UTF-8.

    Args:
        model: Module mapping a batch of word indices to per-position tag
            scores; it is switched to eval mode. Inputs are moved to the
            notebook-level ``device``.
        data_loader: Iterable of batches, either
            ``(sentences, tags, lengths, indices)`` or
            ``(sentences, lengths, indices)``; ``indices`` is a tensor of
            positions into ``original_sentences``.
        idx_to_tag: Mapping from predicted class index to tag string.
        output_file_path: Destination path; an existing file is overwritten.
        original_sentences: Word lists, addressed by the batch indices.
        original_sentence_lengths: Number of tokens to emit per sentence,
            addressed by the same indices, so padded positions are skipped.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            if len(batch) == 4:  # Tags are included in the batch
                sentences, _, _, indices = batch
            else:  # No tags are included
                sentences, _, indices = batch

            outputs = model(sentences.to(device))
            # Convert the whole batch to Python lists once instead of calling
            # .item() for every token, which forces one device sync per call.
            predicted_tag_indices = torch.argmax(outputs, dim=2).cpu().tolist()

            for row, original_index in zip(predicted_tag_indices, indices.tolist()):
                words = original_sentences[original_index]
                sentence_length = original_sentence_lengths[original_index]
                for j in range(sentence_length):
                    predictions.append(f"{j+1} {words[j]} {idx_to_tag[row[j]]}\n")
                predictions.append("\n")

    # Explicit encoding so the output does not depend on the platform default.
    with open(output_file_path, 'w', encoding='utf-8') as writer:
        writer.writelines(predictions)

    print(f"Predictions written to {output_file_path}")



# Unpadded token counts per sentence; they bound how many positions of each
# padded prediction row are written out.
dev_original_lengths = [len(sentence) for sentence in dev_original_sentences]
test_original_lengths = [len(sentence) for sentence in test_original_sentences]

# Dev-set predictions (this file is the input of the evaluation step below).
output_file_path = 'dev1.out'
write_predictions_to_file(model, dev_loader, idx_to_tag, output_file_path, dev_original_sentences, dev_original_lengths)

# Test-set predictions.
output_file_path = 'test1.out'
write_predictions_to_file(model, test_loader, idx_to_tag, output_file_path, test_original_sentences, test_original_lengths)





Predictions written to dev1.out
Predictions written to test1.out


### 5. Evaluation

In [312]:
# Files handed to the evaluation script: the predictions and the gold tags.
predicted_file_path = 'dev1.out'
gold_standard_file_path = 'data/dev'

# Shell escape: runs eval.py from the working directory, with the two Python
# variables interpolated into the command line.
!python eval.py -p {predicted_file_path} -g {gold_standard_file_path}


processed 51578 tokens with 5942 phrases; found: 5592 phrases; correct: 4613.
accuracy:  95.89%; precision:  82.49%; recall:  77.63%; FB1:  79.99
              LOC: precision:  86.97%; recall:  86.50%; FB1:  86.74  1827
             MISC: precision:  83.72%; recall:  74.73%; FB1:  78.97  823
              ORG: precision:  76.85%; recall:  66.59%; FB1:  71.35  1162
              PER: precision:  81.01%; recall:  78.28%; FB1:  79.62  1780


#### 5.1 Saving the model

In [313]:
import torch
# Only the parameters (state_dict) are saved, not the model object or the
# word_to_idx / tag_to_idx mappings needed to use it.
model_save_path = 'blstm1.pt'
torch.save(model.state_dict(), model_save_path)