In [None]:
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# from torchcrf import CRF
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, cohen_kappa_score
from sklearn.model_selection import train_test_split

In [None]:
from functions.file import *

lines = get_lines_from_ner_corpus("../../Corpus/korpusi.txt")
write_lines_to_csv(lines, "korpusi.csv")

In [None]:
import pandas as pd

data_df = pd.read_csv("korpusi.csv")

In [None]:
print("Total number of sentences in the dataset: {:,}".format(data_df["Sentence #"].nunique()))
print("Total tokens in the dataset: {:,}".format(data_df.shape[0]))

In [None]:
ner_counts = data_df["NER_Tag"].value_counts()
ner_counts

In [None]:
word_counts = data_df.groupby("Sentence #")["Word"].agg(["count"])
word_counts = word_counts.rename(columns={"count": "Word count"})

In [None]:
# all_words = list(set(data_df["Word"].values))
# all_tags = list(set(data_df["NER_Tag"].values))
#
# print("Number of unique words: {}".format(data_df["Word"].nunique()))
# print("Number of unique tags : {}".format(data_df["NER_Tag"].nunique()))
unique_words = sorted(list(set(data_df["Word"].str.lower().values)))
unique_tags = sorted(list(set(data_df["NER_Tag"].values)))

print(f"Unique words: {len(unique_words)}")
print(f"Unique tags: {len(unique_tags)}")
print(f"Tags: {unique_tags}")

In [None]:
word2index = {word:idx + 2 for idx, word in enumerate(all_words) }
word2index["<UNK>"]=0
word2index["<PAD>"]=1
word2index = dict([(k, v) for k, v in sorted(word2index.items(), key=lambda item: item[1])])

index2word = {idx:word for word,idx in word2index.items()}

In [None]:
tag2index = {tag: idx + 1 for idx, tag in enumerate(all_tags)}
tag2index["<PAD>"] = 0
tag2index = dict([(k, v) for k, v in sorted(tag2index.items(), key=lambda item: item[1])])

index2tag = {idx: word for word, idx in tag2index.items()}

In [None]:
def to_tuples(data):
    iterator = zip(data["Word"].values.tolist(),
                   data["NER_Tag"].values.tolist()
                   )
    return [(word, tag) for word, tag in iterator]

sentences = data_df.groupby("Sentence #").apply(to_tuples, include_groups=False).tolist()

print(sentences[0])

In [None]:
X = [[word[0] for word in sentence] for sentence in sentences]
Y = [[word[1] for word in sentence] for sentence in sentences]
print("X[0]:", X[0])
print("Y[0]:", Y[0])

X = [[word2index[word] for word in sentence] for sentence in X]
Y = [[tag2index[tag] for tag in sentence] for sentence in Y]
print("X[0]:", X[0])
print("Y[0]:", Y[0])

In [None]:
MAX_SENTENCE = word_counts.max().iloc[0]
print("Longest sentence in the corpus contains {} words.".format(MAX_SENTENCE))

In [None]:
X = [sentence + [word2index["<PAD>"]] * (MAX_SENTENCE - len(sentence)) for sentence in X]
Y = [sentence + [tag2index["<PAD>"]] * (MAX_SENTENCE - len(sentence)) for sentence in Y]
print("X[0]:", X[0])
print("Y[0]:", Y[0])

In [38]:
# Group by Sentence ID and calculate the length of each sentence
sentence_lengths = data_df.groupby("Sentence #")["Word"].count().reset_index()
sentence_lengths.columns = ["Sentence ID", "Length"]


In [None]:
TAG_COUNT = len(tag2index)

y = np.array(Y)
Y = [np.eye(TAG_COUNT)[sentence] for sentence in Y]
print("X[0]:", X[0])
print("y[0]:", y[0])
print("Y[0]:", Y[0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1234)

print("Number of sentences in the training dataset: {}".format(len(X_train)))
print("Number of sentences in the test dataset : {}".format(len(X_test)))

In [None]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
class NERDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)  # one-hot labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets
train_dataset = NERDataset(X_train, y_train)
test_dataset  = NERDataset(X_test, y_test)

# DataLoaders
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tag_count, embedding_dim=100, hidden_dim=64):
        super(BiLSTM_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2index["<PAD>"])
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            batch_first=True,
                            bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim*2, tag_count)  # bidirectional -> *2
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.softmax(x)
        return x

In [None]:
VOCAB_SIZE = len(word2index)
TAG_COUNT  = len(tag2index)

model = BiLSTM_NER(VOCAB_SIZE, TAG_COUNT, embedding_dim=200, hidden_dim=64).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=tag2index["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)

        y_batch_indices = torch.argmax(y_batch, dim=2)

        outputs = outputs.view(-1, TAG_COUNT)
        y_batch_indices = y_batch_indices.view(-1)

        loss = criterion(outputs, y_batch_indices)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")


In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from functions.file import *

In [2]:
lines = get_lines_from_ner_corpus("../../Corpus/korpusi.txt")
write_lines_to_csv(lines, "korpusi.csv")

In [3]:
data_df = pd.read_csv("korpusi.csv")

In [4]:
print("Dataset shape:", data_df.shape)
print("Sample data:")
print(data_df.head())

Dataset shape: (1003903, 3)
Sample data:
   Sentence #   Word NER_Tag
0           1  Bujar   B-PER
1           1  Gjoka   I-PER
2           1      :       O
3           1  Gjuha       O
4           1  është       O


In [5]:
data_df = data_df.dropna()  # Remove any NaN values
print(f"After cleaning: {len(data_df)} rows")

After cleaning: 1003903 rows


In [6]:
# Analyze class distribution
ner_counts = data_df["NER_Tag"].value_counts()
print("\nNER Tag Distribution:")
print(ner_counts)


NER Tag Distribution:
NER_Tag
O           914052
B-PER        20190
I-DATE_0     13790
I-PER        12599
B-VEND_0     11689
B-VEND_1      8764
I-ORG         6459
B-DATE_0      6135
B-ORG         4526
I-VEND_1      1615
I-EVENT       1038
B-EVENT        648
I-DATE_1       470
B-DATE_1       459
B-PRO          343
I-VEND_0       323
I-RRUGE        279
I-SHESH        171
B-RRUGE        140
B-SHESH        112
I-PRO          101
Name: count, dtype: int64


In [7]:
unique_words = sorted(list(set(data_df["Word"].str.lower().values)))
unique_tags = sorted(list(set(data_df["NER_Tag"].values)))

print(f"Unique words: {len(unique_words)}")
print(f"Unique tags: {len(unique_tags)}")
print(f"Tags: {unique_tags}")

Unique words: 55578
Unique tags: 21
Tags: ['B-DATE_0', 'B-DATE_1', 'B-EVENT', 'B-ORG', 'B-PER', 'B-PRO', 'B-RRUGE', 'B-SHESH', 'B-VEND_0', 'B-VEND_1', 'I-DATE_0', 'I-DATE_1', 'I-EVENT', 'I-ORG', 'I-PER', 'I-PRO', 'I-RRUGE', 'I-SHESH', 'I-VEND_0', 'I-VEND_1', 'O']


In [8]:
word2idx = {"<PAD>": 0, "<UNK>": 1}
for i, word in enumerate(unique_words):
    word2idx[word] = i + 2

In [9]:
tag2idx = {"<PAD>": 0}
for i, tag in enumerate(unique_tags):
    tag2idx[tag] = i + 1

In [10]:
idx2word = {v: k for k, v in word2idx.items()}
idx2tag = {v: k for k, v in tag2idx.items()}
print(f"Final vocab sizes - Words: {len(word2idx)}, Tags: {len(tag2idx)}")

Final vocab sizes - Words: 55580, Tags: 22


In [11]:
# Convert to sentences
def to_tuples(group):
    return list(zip(group["Word"].values, group["NER_Tag"].values))

In [12]:
sentences = data_df.groupby("Sentence #").apply(to_tuples,include_groups=False).tolist()
print(f"Number of sentences: {len(sentences)}")
print(f"Sample sentence: {sentences[0][:5]}...")

Number of sentences: 37785
Sample sentence: [('Bujar', 'B-PER'), ('Gjoka', 'I-PER'), (':', 'O'), ('Gjuha', 'O'), ('është', 'O')]...


In [13]:
# Convert to sequences
X_words = [[word for word, tag in sentence] for sentence in sentences]
y_tags = [[tag for word, tag in sentence] for sentence in sentences]

In [15]:
# Convert to indices with careful bounds checking
def words_to_indices(word_sequences):
    result = []
    for sequence in word_sequences:
        indices = []
        for word in sequence:
            word_lower = word.lower()
            if word_lower in word2idx:
                indices.append(word2idx[word_lower])
            else:
                indices.append(word2idx["<UNK>"])
        result.append(indices)
    return result

def tags_to_indices(tag_sequences):
    result = []
    for sequence in tag_sequences:
        indices = []
        for tag in sequence:
            if tag in tag2idx:
                indices.append(tag2idx[tag])
            else:
                print(f"Warning: Unknown tag '{tag}', using <PAD>")
                indices.append(tag2idx["<PAD>"])
        result.append(indices)
    return result

In [17]:
X_indices = words_to_indices(X_words)
y_indices = tags_to_indices(y_tags)

In [18]:
# Verify indices are in correct range
max_word_idx = max([max(seq) if seq else 0 for seq in X_indices])
max_tag_idx = max([max(seq) if seq else 0 for seq in y_indices])

In [19]:
print(f"Max word index: {max_word_idx} (vocab size: {len(word2idx)})")
print(f"Max tag index: {max_tag_idx} (tag count: {len(tag2idx)})")

assert max_word_idx < len(word2idx), f"Word index out of range: {max_word_idx} >= {len(word2idx)}"
assert max_tag_idx < len(tag2idx), f"Tag index out of range: {max_tag_idx} >= {len(tag2idx)}"

Max word index: 55579 (vocab size: 55580)
Max tag index: 21 (tag count: 22)


In [20]:
# Calculate sequence lengths
seq_lengths = [len(seq) for seq in X_indices]
max_len = max(seq_lengths)
avg_len = np.mean(seq_lengths)
p95_len = int(np.percentile(seq_lengths, 95))

print(f"Sequence lengths - Max: {max_len}, Avg: {avg_len:.1f}, 95th percentile: {p95_len}")

Sequence lengths - Max: 333, Avg: 26.6, 95th percentile: 56


In [21]:
# Use 95th percentile to reduce padding
# MAX_LEN = p95_len

# Use absolute max length
MAX_LEN = max_len

In [22]:
# Pad sequences
def pad_sequences(sequences, max_len, pad_value):
    padded = []
    for seq in sequences:
        if len(seq) >= max_len:
            padded.append(seq[:max_len])  # Truncate
        else:
            padded.append(seq + [pad_value] * (max_len - len(seq)))  # Pad
    return padded

In [23]:
X_padded = pad_sequences(X_indices, MAX_LEN, word2idx["<PAD>"])
y_padded = pad_sequences(y_indices, MAX_LEN, tag2idx["<PAD>"])

In [24]:
# Verify padded sequences
print(f"Padded sequence length: {len(X_padded[0])}")
print(f"All X sequences same length: {len(set(len(seq) for seq in X_padded)) == 1}")
print(f"All y sequences same length: {len(set(len(seq) for seq in y_padded)) == 1}")

Padded sequence length: 333
All X sequences same length: True
All y sequences same length: True


In [25]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_padded, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [26]:
print(f"Data splits - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Data splits - Train: 27205, Val: 3023, Test: 7557


In [27]:
# Convert to tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [28]:
X_train_t = torch.tensor(X_train, dtype=torch.long).to(device)
X_val_t = torch.tensor(X_val, dtype=torch.long).to(device)
X_test_t = torch.tensor(X_test, dtype=torch.long).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
y_val_t = torch.tensor(y_val, dtype=torch.long).to(device)
y_test_t = torch.tensor(y_test, dtype=torch.long).to(device)

In [29]:
# Dataset class
class NERDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [30]:
# Model definition
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tag_count, embedding_dim=128, hidden_dim=64, dropout=0.3,num_layers=1):
        super(BiLSTM_NER, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx["<PAD>"])
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim,
            num_layers=num_layers,
            batch_first=True, bidirectional=True, dropout=dropout
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim * 2, tag_count)

    def forward(self, x):
        # x shape: (batch, seq_len)
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        lstm_out, _ = self.lstm(embedded)  # (batch, seq_len, hidden_dim*2)
        lstm_out = self.dropout(lstm_out)
        output = self.classifier(lstm_out)  # (batch, seq_len, tag_count)
        return output


In [31]:
# Create datasets and loaders
train_dataset = NERDataset(X_train_t, y_train_t)
val_dataset = NERDataset(X_val_t, y_val_t)
test_dataset = NERDataset(X_test_t, y_test_t)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [32]:
vocab_size = len(word2idx)
tag_count = len(tag2idx)

In [33]:
# Initialize model
model = BiLSTM_NER(vocab_size, tag_count, embedding_dim=200, hidden_dim=128,num_layers=1,dropout=0).to(device)

In [34]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [35]:
print("Model initialized successfully!")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Model initialized successfully!
Model parameters: 11,459,574


In [36]:
# Check if any index in X_batch is out of range
invalid_indices = (X_batch >= vocab_size).nonzero(as_tuple=True)
if len(invalid_indices[0]) > 0:
    print(f"Invalid indices found: {X_batch[invalid_indices]}")
    print(f"Max index in X_batch: {X_batch.max()}, Vocab size: {vocab_size}")

NameError: name 'X_batch' is not defined

In [37]:
 # Training loop
epochs = 3
best_val_f1 = 0

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()

        outputs = model(X_batch)  # (batch, seq_len, tag_count)
        outputs = outputs.view(-1, tag_count)  # (batch*seq_len, tag_count)
        targets = y_batch.view(-1)  # (batch*seq_len,)

        loss = criterion(outputs, targets)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    y_true, y_pred = [], []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            outputs_flat = outputs.view(-1, tag_count)
            targets_flat = y_batch.view(-1)

            loss = criterion(outputs_flat, targets_flat)
            val_loss += loss.item()

            # Get predictions
            predictions = torch.argmax(outputs, dim=2)
            y_true.extend(y_batch.cpu().numpy().flatten())
            y_pred.extend(predictions.cpu().numpy().flatten())

    # Calculate metrics (excluding padding)
    mask = np.array(y_true) != tag2idx["<PAD>"]
    y_true_clean = np.array(y_true)[mask]
    y_pred_clean = np.array(y_pred)[mask]

    val_acc = accuracy_score(y_true_clean, y_pred_clean)
    val_f1 = f1_score(y_true_clean, y_pred_clean, average='weighted', zero_division=0)

    print(f"Epoch {epoch+1}/{epochs}:")
    print(f"  Train Loss: {train_loss/len(train_loader):.4f}")
    print(f"  Val Loss: {val_loss/len(val_loader):.4f}")
    print(f"  Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pth')

    print("-" * 40)

KeyboardInterrupt: 

In [None]:
# Load best model and test
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

y_true_test, y_pred_test = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        predictions = torch.argmax(outputs, dim=2)

        y_true_test.extend(y_batch.cpu().numpy().flatten())
        y_pred_test.extend(predictions.cpu().numpy().flatten())

In [None]:
# Final test metrics
mask = np.array(y_true_test) != tag2idx["<PAD>"]
y_true_final = np.array(y_true_test)[mask]
y_pred_final = np.array(y_pred_test)[mask]

test_acc = accuracy_score(y_true_final, y_pred_final)
test_f1 = f1_score(y_true_final, y_pred_final, average='weighted', zero_division=0)

print("=" * 50)
print("FINAL TEST RESULTS:")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print("=" * 50)

# Classification report
tag_names = [idx2tag[i] for i in range(1, len(idx2tag))]  # Exclude <PAD>
print("\nClassification Report:")
print(classification_report(y_true_final, y_pred_final,
                          labels=list(range(1, len(tag2idx))),
                          target_names=tag_names, zero_division=0))

In [None]:
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        predicted = torch.argmax(outputs, dim=2)

        y_true.extend(torch.argmax(y_batch, dim=2).cpu().numpy().flatten())
        y_pred.extend(predicted.cpu().numpy().flatten())

# Compute metrics
from sklearn.metrics import accuracy_score, f1_score

mask = np.array(y_true) != tag2index["<PAD>"]  # ignore padding
y_true_masked = np.array(y_true)[mask]
y_pred_masked = np.array(y_pred)[mask]

print("Accuracy:", accuracy_score(y_true_masked, y_pred_masked))
print("F1 Score:", f1_score(y_true_masked, y_pred_masked, average='weighted'))
