In [16]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.utils.data as data_utils
import torch.nn.utils.rnn as rnn_utils
import time
import math

from collections import defaultdict


In [3]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
tag2idx = {'<PAD>': 0}

def read_data(file_path):
    """
    Reads the data from the file and returns the sentences, words and tags in separate lists.
    """
    with open(file_path, "r") as f:
        data = f.read().strip().split("\n\n")
    sentences = []
    words = []
    tags = []
    for sentence in data:
        sentence_words = []
        sentence_tags = []
        for line in sentence.strip().split("\n"):
            line = line.strip().split()
            sentence_words.append(line[1])
            sentence_tags.append(line[2])
            # add words and tags to the dictionaries
            if line[1] not in word2idx:
                word2idx[line[1]] = len(word2idx)
            if line[2] not in tag2idx:
                tag2idx[line[2]] = len(tag2idx)
        words.append(sentence_words)
        tags.append(sentence_tags)
        sentences.append(sentence_words)
    return sentences, words, tags

# read the train and dev data
train_sentences, train_words, train_tags = read_data("data/train")

In [9]:
sentences_idx = [[word2idx[word] for word in sentence] for sentence in train_sentences]
tags_idx = [[tag2idx[tag] for tag in tag_seq] for tag_seq in train_tags]

In [12]:
padded_sentences = rnn_utils.pad_sequence([torch.tensor(seq) for seq in sentences_idx], batch_first=True, padding_value=word2idx["<PAD>"])
padded_tags = rnn_utils.pad_sequence([torch.tensor(seq) for seq in tags_idx], batch_first=True, padding_value=tag2idx["<PAD>"])

In [17]:
dataset = data_utils.TensorDataset(padded_sentences, padded_tags)
dataloader = data_utils.DataLoader(dataset, batch_size=20, shuffle=True)

In [9]:
# train_data = []
# for i in range(len(train_tags)):
#     train_data.append((train_words[i],train_tags[i] ))

In [18]:
# class NERDataset(Dataset):
#     def __init__(self, data, word2idx, tag2idx):
#         self.data = data
#         self.word2idx = word2idx
#         self.tag2idx = tag2idx

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sentence, tags = self.data[idx]
#         sentence_tensor = torch.tensor([self.word2idx[word] for word in sentence])
#         tags_tensor = torch.tensor([self.tag2idx[tag] for tag in tags])
#         return sentence_tensor, tags_tensor

# train_dataset = NERDataset(train_data, word2idx, tag2idx)
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [19]:
# Define hyperparameters
embedding_dim = 100
num_lstm_layers = 1
lstm_hidden_dim = 256
lstm_dropout = 0.33
linear_output_dim = 128
num_epochs = 10
learning_rate = 0.01

# Define the BLSTM model
class BLSTM(nn.Module):
    def __init__(self, embedding_dim, num_lstm_layers, lstm_hidden_dim, lstm_dropout, linear_output_dim):
        super(BLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_dim = lstm_hidden_dim
        self.lstm_dropout = lstm_dropout
        self.linear_output_dim = linear_output_dim
        
        # Define the layers of the model
        self.embedding = nn.Embedding(num_embeddings=len(word2idx), embedding_dim=self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.lstm_hidden_dim, num_layers=self.num_lstm_layers, batch_first=True, dropout=self.lstm_dropout, bidirectional=True)
        self.linear = nn.Linear(in_features=self.lstm_hidden_dim*2, out_features=self.linear_output_dim)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(in_features=self.linear_output_dim, out_features=len(tag2idx))
    
    def forward(self, sentence):
        embedded = self.embedding(sentence)
        lstm_output, _ = self.lstm(embedded)
        linear_output = self.activation(self.linear(lstm_output))
        logits = self.classifier(linear_output)
        return logits

In [23]:
# Initialize the BLSTM model and define the loss function and optimizer
model = BLSTM(embedding_dim, num_lstm_layers, lstm_hidden_dim, lstm_dropout, linear_output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
print_every = 1000

# Train the BLSTM model
for epoch in range(2):
    total_loss = 0
    total_correct=0
    total_samples=0
    for i, (sentence, tags_tensor) in enumerate(dataloader):
        # Convert data to PyTorch tensors
        optimizer.zero_grad()
        logits = model(sentence)
        loss = criterion(logits.view(-1, len(tag2idx)), tags_tensor.unsqueeze(0).view(-1))
        _, predictions = torch.max(logits, 2)
        total_correct += (predictions == tags_tensor).sum().item()       
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_samples += predictions.numel()
        
        if i % print_every == 0:
            print(f"Epoch {epoch+1}/{2}, Step {i+1}/{len(dataloader)}, Loss: {loss.item():.4f} ")
    
    accuracy = total_correct / total_samples
    print(f"Epoch {epoch+1}/{2}, Accuracy: {accuracy:.4f}")
    print()

Epoch 1/2, Step 1/750, Loss: 2.4585 
Epoch 1/2, Accuracy: 0.9196
total_samples =  1693531
Epoch 2/2, Step 1/750, Loss: 0.2206 
Epoch 2/2, Accuracy: 0.9779
total_samples =  1693531


In [42]:
import torch
tensor1 = torch.tensor([2, 2, 2, 1, 2, 1, 1, 2, 2, 2])
tensor2 = torch.tensor([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]])
t=tensor2.squeeze(0)
torch.sum(t == tensor1)

tensor(7)

In [47]:
tensor1 = torch.tensor([[6, 6, 2, 2, 2, 5, 5, 2]])
tensor2 = torch.tensor([[3, 8, 2, 2, 2, 2, 2, 2]])

matching = (tensor1 == tensor2).sum().item()

In [22]:
len(dataloader)

750

In [24]:
len(padded_sentences)

14987