In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time
import math

from collections import defaultdict


In [3]:
word2idx = {}
tag2idx={}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1
tag2idx['<PAD>'] = 0

def read_data(file_path):
    """
    Reads the data from the file and returns the sentences, words and tags in separate lists.
    """
    with open(file_path, "r") as f:
        data = f.read().strip().split("\n\n")
    sentences = []
    words = []
    tags = []
    for sentence in data:
        sentence_words = []
        sentence_tags = []
        for line in sentence.strip().split("\n"):
            line = line.strip().split()
            sentence_words.append(line[1])
            sentence_tags.append(line[2])
            # add words and tags to the dictionaries
            if line[1] not in word2idx:
                word2idx[line[1]] = len(word2idx)
            if line[2] not in tag2idx:
                tag2idx[line[2]] = len(tag2idx)
        words.append(sentence_words)
        tags.append(sentence_tags)
        sentences.append(sentence_words)
    return sentences, words, tags

# read the train and dev data
train_sentences, train_words, train_tags = read_data("data/train")
dev_sentences, dev_words, dev_tags = read_data("data/dev")

In [154]:
class NERModel(nn.Module):
    def __init__(self, word2idx, tag2idx, pretrained_embeddings,embedding_dim=100, hidden_dim=256, output_dim=128, dropout=0.33):
        super(NERModel, self).__init__()
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.word_embeddings = pretrained_embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,num_layers=1,batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(hidden_dim, len(tag2idx))
        
    def forward(self, sentence, length):
#         embeds = self.word_embeddings(sentence)
#         embeds = nn.utils.rnn.pack_padded_sequence(embeds, length, batch_first=True, enforce_sorted=False)
#         lstm_out, _ = self.lstm(embeds) ##.view(len(sentence), 1, -1)
#         lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
#         lstm_out = self.dropout(lstm_out)
#         fc_out = self.fc(lstm_out)
#         elu_out = self.elu(fc_out)
#         tag_space = self.classifier(elu_out.view(1, -1, hidden_dim*2)) ##
# #         tag_scores = nn.functional.log_softmax(tag_space, dim=1)
#         return tag_space

        embeds = self.word_embeddings(sentence)
        embeds = nn.utils.rnn.pack_padded_sequence(embeds, length, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(embeds)  # shape: batch_size x sequence_length x (hidden_dim * 2)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.dropout(lstm_out)
        fc_out = self.fc(lstm_out)  # shape: batch_size x sequence_length x hidden_dim
        elu_out = self.elu(fc_out)  # shape: batch_size x sequence_length x hidden_dim
        tag_space = self.classifier(elu_out)  # shape: batch_size x sequence_length x len(tag2idx)
        return tag_space


class NERDataset(Dataset):
    def __init__(self, file_path, word_to_idx, label_to_idx):
        self.word_to_idx = word_to_idx
        self.label_to_idx = label_to_idx
        self.sentences, self.labels = self.read_data(file_path)
        self.max_len=113
        
    def read_data(self, file_path):
        sentences = []
        labels = []
        with open(file_path, 'r') as f:
            words = []
            tags = []
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split()
                    word = parts[1]
                    tag = parts[2]
                    words.append(word)
                    tags.append(tag)
                else:
                    if words:
                        sentences.append(words)
                        labels.append(tags)
                        words = []
                        tags = []
        return sentences, labels
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        sentence = self.sentences[index]
        label = self.labels[index]
        sentence_idx = [self.word_to_idx.get(word, 0) for word in sentence]
        label_idx = [self.label_to_idx[tag] for tag in label]
        
        pad_len = self.max_len - len(sentence)
        sentence_idx += [self.word_to_idx['<PAD>']] * pad_len
        label_idx += [self.label_to_idx['<PAD>']] * pad_len
        
        return torch.LongTensor(sentence_idx), torch.LongTensor(label_idx), len(sentence)


In [155]:
def collate_fn(batch):
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    lengths = [item[2] for item in batch]
    
    # Pad sequences to max_len
    max_len = max(lengths)
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=word2idx['<PAD>'])
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=word2idx['<PAD>'])
    
    return inputs, labels, lengths

In [158]:
train_dataset = NERDataset('data/train', word2idx,tag2idx)
dev_dataset = NERDataset('data/dev', word2idx, tag2idx)
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True,drop_last=True)
dev_loader = DataLoader(dev_dataset, batch_size=20, drop_last=True)

In [163]:
def train(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total = 0
    num_classes = len(model.tag2idx)
    for batch in data_loader:
        sentences, tags, lengths = batch
        print(len(sentences), len(tags), lengths)        
        sentences = sentences.to(device)
        tags = tags.to(device)
        lengths = torch.tensor(lengths, dtype=torch.int64).to(device)
        optimizer.zero_grad()
        output = model(sentences, lengths)
        # create a mask to ignore the padding tokens while computing the loss
        mask = (sentences != word2idx['<PAD>'])
        mask = mask.float()
        loss = criterion(output.view(-1, num_classes), tags.view(-1)) * mask.view(-1)
        loss = loss.sum() / mask.sum()  # average the loss over non-padding tokens
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        predictions = torch.argmax(output, dim=1)
        total_correct += torch.sum(predictions == tags)
        total += tags.numel()
        
    return total_loss / len(data_loader), total_correct / total

def evaluate(model, data_loader, device):
    model.eval()
    total_correct = 0
    total = 0
    with torch.no_grad():
        for X, y in data_loader:
            X = X.to(device)
            y = y.to(device)
            output = model(X)
            predictions = torch.argmax(output, dim=2)
            total_correct += torch.sum(predictions == y)
            total += y.numel()
    return total_correct / total

# embedding = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).to(device, dtype=torch.float), padding_idx=0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NERModel(word2idx, tag2idx).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
#     dev_acc = evaluate(model, dev_loader, device)
    print(f'Epoch {epoch + 1}, Train Acc {train_acc}')

20 20 [5, 5, 9, 43, 1, 6, 28, 8, 7, 27, 10, 7, 7, 7, 1, 8, 24, 8, 36, 10]


ValueError: Expected input batch_size (860) to match target batch_size (2400).

In [4]:
len(train_words)

14987

10

In [147]:
tag2idx

{'B-ORG': 0,
 'O': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8,
 '<PAD>': 0}

In [None]:
# embeddings = np.zeros((len(word2idx), 100))
# with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
#     for line in f:
#         parts = line.split()
#         word = parts[0]
#         if word in word2idx:
#             idx = word2idx[word]
#             embedding = np.array(parts[1:], dtype=np.float32)
#             embeddings[idx] = embedding