In [130]:
import lzma
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from seqeval.metrics import classification_report, f1_score

In [131]:
def parse_train_file(filepath):
    """Read an xz-compressed, tab-separated NER file into parallel lists.

    Each useful line has the form ``<tags>\\t<tokens>``, where both halves are
    whitespace-separated sequences. Only the first tab is treated as the
    separator. Lines that are blank, contain no tab, or whose tag and token
    counts disagree are skipped silently.

    Args:
        filepath: Path of the ``.xz`` file, opened as UTF-8 text.

    Returns:
        A ``(sentences, labels)`` tuple: two lists of equal length where
        ``sentences[i]`` is a list of tokens and ``labels[i]`` is the list of
        tags for those tokens.
    """
    sentences, labels = [], []
    with lzma.open(filepath, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            tags_part, tab, tokens_part = raw_line.strip().partition('\t')
            if not tab:
                # Blank line, or a line without the tags/tokens separator.
                continue
            tag_seq, token_seq = tags_part.split(), tokens_part.split()
            if len(tag_seq) == len(token_seq):
                sentences.append(token_seq)
                labels.append(tag_seq)
    return sentences, labels

In [132]:
def train_test_split(sentences, labels, test_ratio=0.1, seed=42):
    """Shuffle sentence/label pairs deterministically and split them in two.

    Args:
        sentences: Sequence of token lists.
        labels: Sequence of tag lists, parallel to ``sentences``.
        test_ratio: Fraction of the pairs assigned to the test split.
        seed: Seed for the shuffle; the same seed gives the same split.

    Returns:
        ``(train_sents, train_tags, test_sents, test_tags)`` as four lists.
        Either side may be empty (e.g. empty input or ``test_ratio=0``).
    """
    # A private generator produces the same permutation as seeding the global
    # ``random`` module, without reseeding it for the rest of the notebook.
    rng = random.Random(seed)
    combined = list(zip(sentences, labels))
    rng.shuffle(combined)

    split_idx = int(len(combined) * (1 - test_ratio))
    train_data = combined[:split_idx]
    test_data = combined[split_idx:]

    # Comprehensions instead of ``zip(*pairs)``: unpacking an empty split
    # would raise ValueError.
    train_sents = [sent for sent, _ in train_data]
    train_tags = [tags for _, tags in train_data]
    test_sents = [sent for sent, _ in test_data]
    test_tags = [tags for _, tags in test_data]
    return train_sents, train_tags, test_sents, test_tags

In [133]:
class NERDataset(Dataset):
    """Fixed-length, index-encoded view over tokenised sentences and their tags.

    Every item is a pair of tensors built from one sentence: token ids and tag
    ids. Sequences shorter than ``max_len`` are right-padded with the
    ``'<PAD>'`` entries of the respective mapping; longer ones are truncated.
    Unknown words map to ``word2idx['<UNK>']`` and unknown tags to
    ``tag2idx['O']``.
    """

    def __init__(self, sentences, labels, word2idx, tag2idx, max_len=50):
        self.sentences, self.labels = sentences, labels
        self.word2idx, self.tag2idx = word2idx, tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        vocab, tagset, limit = self.word2idx, self.tag2idx, self.max_len
        tokens = self.sentences[idx]
        tags = self.labels[idx]

        token_ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens]
        tag_ids = [tagset.get(tag, tagset['O']) for tag in tags]

        # The amount of padding is derived from the token sequence and applied
        # to both sequences.
        missing = limit - len(token_ids)
        if missing > 0:
            token_ids = token_ids + [vocab['<PAD>']] * missing
            tag_ids = tag_ids + [tagset['<PAD>']] * missing
        else:
            token_ids = token_ids[:limit]
            tag_ids = tag_ids[:limit]

        return torch.tensor(token_ids), torch.tensor(tag_ids)

In [134]:
class BiLSTMTagger(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> per-token linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output classes per token.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Target width of the concatenated BiLSTM output. Each
            direction gets ``hidden_dim // 2`` units, so for an odd value the
            actual width is ``hidden_dim - 1``.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        per_direction = hidden_dim // 2
        # Index 0 is the padding row; its embedding is not updated by gradients.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=True)
        # Size the classifier from what the BiLSTM really emits (two directions
        # concatenated). Using ``hidden_dim`` directly mismatches the LSTM
        # output whenever ``hidden_dim`` is odd.
        self.fc = nn.Linear(2 * per_direction, tagset_size)

    def forward(self, x):
        """Return per-token logits for a batch-first tensor of token indices.

        Args:
            x: Integer tensor of token ids, batch dimension first.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a trailing
            dimension of size ``tagset_size``.
        """
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)
        logits = self.fc(lstm_out)
        return logits

In [135]:
# Dataset source: https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
# Parse the raw file and split it straight away into train / held-out parts.
train_sents, train_tags, test_sents, test_tags = train_test_split(
    *parse_train_file('data/en-ner-conll-2003/train/train.tsv.xz'),
    test_ratio=0.1,
)

print(f"Train sentences: {len(train_sents)}")
print(f"Test sentences: {len(test_sents)}")
print("Sample sentence:", train_sents[0], sep="\n")
print("Sample labels:", train_tags[0], sep="\n")

# Build the vocabularies from the training split only.
all_words = {word for sentence in train_sents for word in sentence}
all_tags = {tag for tag_seq in train_tags for tag in tag_seq}
all_tags.add('<PAD>')  # the padding label is part of the tag set

# Word ids start at 2; ids 0 and 1 are reserved for padding and unknown words.
word2idx = {word: index for index, word in enumerate(sorted(all_words), start=2)}
word2idx.update({'<PAD>': 0, '<UNK>': 1})

tag2idx = {tag: index for index, tag in enumerate(sorted(all_tags))}
idx2tag = {index: tag for tag, index in tag2idx.items()}

Train sentences: 850
Test sentences: 95
Sample sentence:
['Saudi', 'Arabia', 'executes', 'Pakistani', 'man', '.', '</S>', 'DUBAI', '1996-08-25', '</S>', 'Saudi', 'Arabia', 'executed', 'on', 'Sunday', 'a', 'Pakistani', 'man', 'accused', 'of', 'belonging', 'to', 'an', 'armed', 'gang', 'of', 'robbers', ',', 'Saudi', 'television', 'reported', '.', '</S>', 'It', 'quoted', 'an', 'Interior', 'Ministry', 'statement', 'as', 'saying', 'Shabir', 'Ahmad', 'Muhammad', 'Jalil', 'was', 'executed', 'in', 'Mecca', '.', '</S>', 'He', 'was', 'the', '26th', 'person', 'executed', 'this', 'year', 'in', 'the', 'kingdom', '.', '</S>', 'Saudi', 'Arabia', 'beheads', 'convicted', 'drug', 'smugglers', ',', 'rapists', ',', 'murderers', 'and', 'other', 'criminals', '.', '</S>']
Sample labels:
['B-LOC', 'I-LOC', 'O', 'B-MISC', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', '

In [136]:
# Every sequence is padded or truncated to this many tokens.
max_len = 100

train_dataset = NERDataset(
    sentences=train_sents,
    labels=train_tags,
    word2idx=word2idx,
    tag2idx=tag2idx,
    max_len=max_len,
)
test_dataset = NERDataset(
    sentences=test_sents,
    labels=test_tags,
    word2idx=word2idx,
    tag2idx=tag2idx,
    max_len=max_len,
)

# Only the training batches are reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

In [137]:
# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation and the DataLoader's shuffling order are repeatable across
# "Restart & Run All" (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTMTagger(len(word2idx), len(tag2idx)).to(device)
# Padded positions carry the '<PAD>' label and must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx['<PAD>'])
optimizer = optim.Adam(model.parameters())

In [138]:
epochs = 30
for epoch in range(epochs):
    model.train()  # switch back to training mode at the start of every epoch
    total_loss = 0

    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Collapse all leading dimensions so every token position becomes one
        # row of class scores with a matching target label.
        outputs = model(x_batch)
        outputs = outputs.view(-1, outputs.size(-1))
        y_batch = y_batch.view(-1)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/30 - Loss: 1.6933
Epoch 2/30 - Loss: 0.8315
Epoch 3/30 - Loss: 0.7484
Epoch 4/30 - Loss: 0.7033
Epoch 5/30 - Loss: 0.6580
Epoch 6/30 - Loss: 0.6035
Epoch 7/30 - Loss: 0.5466
Epoch 8/30 - Loss: 0.4910
Epoch 9/30 - Loss: 0.4370
Epoch 10/30 - Loss: 0.3825
Epoch 11/30 - Loss: 0.3334
Epoch 12/30 - Loss: 0.2898
Epoch 13/30 - Loss: 0.2501
Epoch 14/30 - Loss: 0.2179
Epoch 15/30 - Loss: 0.1896
Epoch 16/30 - Loss: 0.1654
Epoch 17/30 - Loss: 0.1455
Epoch 18/30 - Loss: 0.1278
Epoch 19/30 - Loss: 0.1122
Epoch 20/30 - Loss: 0.0980
Epoch 21/30 - Loss: 0.0861
Epoch 22/30 - Loss: 0.0760
Epoch 23/30 - Loss: 0.0664
Epoch 24/30 - Loss: 0.0582
Epoch 25/30 - Loss: 0.0519
Epoch 26/30 - Loss: 0.0455
Epoch 27/30 - Loss: 0.0406
Epoch 28/30 - Loss: 0.0355
Epoch 29/30 - Loss: 0.0316
Epoch 30/30 - Loss: 0.0280


In [139]:
def evaluate(model, loader):
    """Print seqeval's classification report and F1 score for ``model`` on ``loader``.

    Uses the notebook-level ``device``, ``tag2idx`` and ``idx2tag``. The model
    is left in eval mode afterwards.

    Args:
        model: Tagger returning per-token logits for a batch of token ids.
        loader: Iterable of ``(x_batch, y_batch)`` tensor pairs in which the
            label sequences are right-padded with ``tag2idx['<PAD>']``.
    """
    model.eval()
    pad_idx = tag2idx['<PAD>']
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x_batch, y_batch in loader:
            outputs = model(x_batch.to(device))
            # '<PAD>' is one of the output classes but is never a valid label
            # for a real token. Rule it out before the argmax so it cannot be
            # handed to seqeval as a predicted tag.
            outputs[..., pad_idx] = float('-inf')
            preds = torch.argmax(outputs, dim=-1).cpu().numpy()
            y_true = y_batch.numpy()

            for pred_seq, true_seq in zip(preds, y_true):
                # Number of real tokens; padding sits at the end of the sequence.
                length = int((true_seq != pad_idx).sum())
                pred_tags = [idx2tag[idx] for idx in pred_seq[:length]]
                true_tags = [idx2tag[idx] for idx in true_seq[:length]]

                all_preds.append(pred_tags)
                all_labels.append(true_tags)

    print(classification_report(all_labels, all_preds))
    print("F1 score:", f1_score(all_labels, all_preds))

evaluate(model, test_loader)


              precision    recall  f1-score   support

         LOC       0.70      0.75      0.72       360
        MISC       0.65      0.66      0.66       211
         ORG       0.47      0.58      0.52       303
         PER       0.62      0.47      0.54       206

   micro avg       0.60      0.63      0.62      1080
   macro avg       0.61      0.61      0.61      1080
weighted avg       0.61      0.63      0.62      1080

F1 score: 0.6171273221567739
