In [49]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [50]:
class NERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super(NERModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply hidden_dim by 2 for bidirectional LSTM

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.dropout(output)
        output = self.dropout(output)
        output = self.fc(output)
        return output

In [51]:
# Определение кастомного класса для загрузки датасета
class NERDataset(Dataset):
    def __init__(self, filepath):
        self.data = pd.read_csv(filepath, encoding='ISO-8859-1').fillna(method='ffill')

        # Create a list of unique words and tags
        self.words = list(set(self.data["Word"].values))
        self.tags = list(set(self.data["Tag"].values))

        # Create dictionaries for mapping words and tags to integers
        self.word2idx = {w: i+1 for i, w in enumerate(self.words)}
        self.tag2idx = {t: i for i, t in enumerate(self.tags)}

        self.sentences = self._get_sentences()

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        words = [word[0] for word in sentence]
        tags = [word[1] for word in sentence]

        # Convert words and tags to numerical values using the dictionaries
        x = [self.word2idx[w] for w in words]
        y = [self.tag2idx[t] for t in tags]

        return torch.tensor(x), torch.tensor(y)

    def _get_sentences(self):
        # группировка данных по номеру предложения
        grouped = self.data.groupby("Sentence #")
        sentences = []
        for _, group in grouped:
            words = group["Word"].values.tolist()
            tags = group["Tag"].values.tolist()
            sentence = list(zip(words, tags))
            sentences.append(sentence)
        return sentences

In [52]:
# определение гиперпараметров модели
BATCH_SIZE = 32
EMBEDDING_DIM = 32
HIDDEN_DIM = 64
LEARNING_RATE = 0.1
EPOCHS = 4

dataset = NERDataset("C:/Users/qwe11/PycharmProjects/Ner_task/ner_datasetreference.csv")


In [53]:
# Разделение датасета на тренировочный и тестовый
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [54]:
def collate_fn(batch):
    x = [item[0] for item in batch]
    y = [item[1] for item in batch]
    x_lengths = [len(seq) for seq in x]
    y_lengths = [len(seq) for seq in y]

    # Добавление паддингов для одинаковой длинны текстов
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = nn.utils.rnn.pad_sequence(y, batch_first=True)

    # Создание маски, чтобы игнорировать значения заполнения при расчете потерь
    x_mask = torch.arange(x.size(1))[None, :] < torch.tensor(x_lengths)[:, None]
    y_mask = torch.arange(y.size(1))[None, :] < torch.tensor(y_lengths)[:, None]

    return x, y, x_mask, y_mask

In [55]:
#Определение загрузчиков батчей данных дляя обучения модели
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,collate_fn = collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [56]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#Обучение модели проверка качетва
for epoch in range(EPOCHS):
    train_loss = 0.0
    val_loss = 0.0
    model.to(device)

    model.train()  # Set the model to training mode

    for batch in train_loader:
        x, y, x_mask, y_mask = batch
        x = x.to(device)
        y = y.to(device)
        x_mask = x_mask.to(device)
        y_mask = y_mask.to(device)

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output.view(-1, len(dataset.tags)), y.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)

    train_loss /= len(train_dataset)

    model.eval()  # Перевод в режим валидации
    with torch.no_grad():
        val_loss = 0.0
        predictions = []
        targets = []
        for batch in val_loader:
            x, y, x_mask, y_mask = batch
            x = x.to(device)
            y = y.to(device)
            x_mask = x_mask.to(device)
            y_mask = y_mask.to(device)

            output = model(x)
            loss = criterion(output.view(-1, len(dataset.tags)), y.view(-1))
            val_loss += loss.item() * x.size(0)

            # Calculate additional metrics
            predicted_labels = torch.argmax(output, dim=2)
            true_labels = y.view(-1).cpu().numpy()
            predicted_labels = predicted_labels.view(-1).cpu().numpy()

            predictions.extend(predicted_labels)
            targets.extend(true_labels)

        val_loss /= len(val_dataset)

        accuracy = accuracy_score(targets, predictions)
        precision = precision_score(targets, predictions, average='macro',zero_division=1)
        recall = recall_score(targets, predictions, average='macro',zero_division=1)
        f1 = f1_score(targets, predictions, average='macro',zero_division=1)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print("Accuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1-Score: {:.4f}".format(f1))


Epoch 1, Train Loss: 0.1806, Val Loss: 0.1296
Accuracy: 0.9675
Precision: 0.8687
Recall: 0.4504
F1-Score: 0.4818
Epoch 2, Train Loss: 0.1794, Val Loss: 0.1261
Accuracy: 0.9688
Precision: 0.8614
Recall: 0.4481
F1-Score: 0.4822
Epoch 3, Train Loss: 0.1815, Val Loss: 0.1293
Accuracy: 0.9684
Precision: 0.8674
Recall: 0.4378
F1-Score: 0.4638
Epoch 4, Train Loss: 0.1800, Val Loss: 0.1271
Accuracy: 0.9683
Precision: 0.8720
Recall: 0.4475
F1-Score: 0.4841


In [57]:
text = "This is Vladlena Toloknova, i ;live in Russia Moscow and study in university"

# Tokenize the text
tokenized_text = tokenizer.encode(text, add_special_tokens=True)

# Convert the tokenized text to a tensor and add a batch dimension
input_tensor = torch.tensor(tokenized_text).unsqueeze(0).to(device)

# Pass the tensor through the model to get the predicted tags
model.eval()
with torch.no_grad():
    output = model(input_tensor)[0]

# Convert the predicted tags to their corresponding tag labels using the dataset's tag list
predicted_tags = torch.argmax(output, dim=-1).squeeze().tolist()
predicted_labels = [dataset.tags[idx] for idx in predicted_tags]
print(predicted_labels)

['I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-per', 'I-org', 'B-geo', 'O']
