In [1]:
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
import torch

# Загрузка предобученной модели и токенизатора
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [2]:
import json
import torch
from transformers import DistilBertTokenizerFast

from sklearn.model_selection import train_test_split

# Загрузка данных
with open("/kaggle/input/semeval-2025-task-3-mu-shroom-dataset/qa_data_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Разделение на train и val в отношении 9:1
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Проверка размеров
print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}")

# Сохранение данных для проверки
with open("train_data.json", "w", encoding="utf-8") as train_file:
    json.dump(train_data, train_file, ensure_ascii=False, indent=4)

with open("val_data.json", "w", encoding="utf-8") as val_file:
    json.dump(val_data, val_file, ensure_ascii=False, indent=4)

# Инициализация токенизатора
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

import torch

import torch

def create_dataset(data, tokenizer, max_length=512):
    input_ids = []
    attention_masks = []
    labels_list = []
    offset_mappings = []  # To store the offset mappings

    for item in data:
        # Конкатенация question и hallucinated_answer
        text = 'query: ' + item["question"] + "\n answer: " + item["hallucinated_answer"]
        if 'hallucination' not in item.keys():
            continue
        hallucinations = item["hallucination"].split('\n')  # Разделение галлюцинаций
        hallucinations = [h.strip("- ").strip('"').strip() for h in hallucinations]  # Убираем маркеры списка и пробелы

        # Токенизация текста
        encoding = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Метки (все токены по умолчанию O: 0)
        labels = [0] * max_length  # 0: O, 1: B-HALLUCINATION, 2: I-HALLUCINATION
        offsets = encoding["offset_mapping"][0].tolist()

        # Проставление меток для каждой галлюцинации
        for hallucination in hallucinations:
            hallucination_start = text.rfind(hallucination)
            if hallucination_start == -1:
                continue  # Если галлюцинация не найдена в тексте
            hallucination_end = hallucination_start + len(hallucination)

            for idx, (offset_start, offset_end) in enumerate(offsets):
                if offset_start >= hallucination_start and offset_end <= hallucination_end:
                    if labels[idx] == 0:
                        labels[idx] = 1  # B-HALLUCINATION
                    else:
                        labels[idx] = 2  # I-HALLUCINATION

        # Учитываем padding токены
        for idx, mask_value in enumerate(encoding["attention_mask"][0].tolist()):
            if mask_value == 0:
                labels[idx] = 0  # Метка для padding токенов

        # Сохраняем токены, маску и метки
        input_ids.append(encoding["input_ids"][0])
        attention_masks.append(encoding["attention_mask"][0])
        labels_list.append(torch.tensor(labels))
        offset_mappings.append(offsets)  # Save the offsets

    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_masks),
        "labels": torch.stack(labels_list),
        "offset_mappings": offset_mappings  # Return offset mappings
    }



# Создание обучающего набора
train_dataset = create_dataset(train_data, tokenizer)
val_dataset = create_dataset(val_data, tokenizer)

# Проверка размеров
print(f"Input IDs shape: {train_dataset['input_ids'].shape}")
print(f"Attention mask shape: {train_dataset['attention_mask'].shape}")
print(f"Labels shape: {train_dataset['labels'].shape}")


Train size: 9000, Validation size: 1000
Input IDs shape: torch.Size([8905, 512])
Attention mask shape: torch.Size([8905, 512])
Labels shape: torch.Size([8905, 512])


In [3]:
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, dataset):
        self.input_ids = dataset["input_ids"]
        self.attention_mask = dataset["attention_mask"]
        self.labels = dataset["labels"]
        self.offset_mappings = dataset["offset_mappings"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        if idx >= len(self.input_ids) or idx < 0:
            raise IndexError(f"Invalid index: {idx}")
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
            "offset_mappings": self.offset_mappings[idx]
        }


In [4]:
from transformers import Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

training_args = TrainingArguments(
    output_dir="./results",          # Папка для сохранения моделей
    num_train_epochs=3,              # Количество эпох
    per_device_train_batch_size=16,  # Размер батча
    per_device_eval_batch_size=16,   # Размер батча для валидации
    warmup_steps=500,                # Количество шагов для прогрева
    weight_decay=0.01,               # Коэффициент L2-регуляризации
    logging_dir="./logs",            # Папка для логов
    evaluation_strategy="epoch",     # Валидация после каждой эпохи
    save_strategy="epoch",           # Сохранение модели после каждой эпохи
    logging_steps=10,                # Логирование каждые 10 шагов
    load_best_model_at_end=True      # Загрузка лучшей модели в конце
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=QADataset(train_dataset),
    eval_dataset=QADataset(val_dataset)
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111301841111122, max=1.0)…

Epoch,Training Loss,Validation Loss
1,0.5234,0.537923
2,0.4081,0.512118
3,0.2301,0.676637


TrainOutput(global_step=1671, training_loss=0.4508122793130715, metrics={'train_runtime': 768.726, 'train_samples_per_second': 34.752, 'train_steps_per_second': 2.174, 'total_flos': 3490460678016000.0, 'train_loss': 0.4508122793130715, 'epoch': 3.0})

In [19]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [20]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.5121184587478638, 'eval_runtime': 8.6005, 'eval_samples_per_second': 114.993, 'eval_steps_per_second': 7.209, 'epoch': 3.0}


In [5]:
from transformers import TrainingArguments
from datetime import datetime

# Получение текущей даты и времени
current_datetime = datetime.now()

training_args = TrainingArguments(
    run_name=f"RUN {current_datetime}",
    output_dir=f"./results",
    evaluation_strategy="steps",
    eval_steps=500,           # Запуск валидации каждые 500 шагов
    logging_steps=5,        # Логи каждые 500 шагов
    save_strategy="steps",    # Сохранение модели каждые 500 шагов
    save_steps=500,
    warmup_steps=500,                # Количество шагов для прогрева
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    load_best_model_at_end=True,  # Загрузка лучшей модели в конце
    #metric_for_best_model="f1_macro",  # Метрика для выбора лучшей модели
    #greater_is_better=True
)




In [8]:
from sklearn.metrics import f1_score, jaccard_score

def compute_metrics(pred, attention_mask):
    predictions, labels = pred
    
    # Предсказания и метки
    predictions = np.argmax(predictions, axis=2)

    # Убираем padding токены на основе attention_mask
    true_predictions = [
        [p for (p, m) in zip(prediction, mask) if m == 1]
        for prediction, mask in zip(predictions, attention_mask)
    ]
    true_labels = [
        [l for (l, m) in zip(label, mask) if m == 1]
        for label, mask in zip(labels, attention_mask)
    ]

    # Переводим списки в одномерные массивы
    true_predictions = [item for sublist in true_predictions for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Метрики
    f1_macro = f1_score(true_labels, true_predictions, average="macro")
    f1_micro = f1_score(true_labels, true_predictions, average="micro")
    iou = jaccard_score(true_labels, true_predictions, average="macro")

    return {
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "iou": iou,
    }


In [9]:
from transformers import Trainer
import wandb
wandb.init(project='your_project_name', name='new_run_name')  # Set unique name for each run

del model
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=QADataset(train_dataset),
    eval_dataset=QADataset(val_dataset),
    #compute_metrics=compute_metrics
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111276015555581, max=1.0)…

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
trainer.train()


Step,Training Loss,Validation Loss
500,0.0098,0.011726
1000,0.011,0.010699


TrainOutput(global_step=1114, training_loss=0.05715645355451685, metrics={'train_runtime': 509.9673, 'train_samples_per_second': 34.924, 'train_steps_per_second': 2.184, 'total_flos': 2326973785344000.0, 'train_loss': 0.05715645355451685, 'epoch': 2.0})

In [11]:
from torch.utils.data import DataLoader
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Evaluate on the validation dataset
eval_dataloader = DataLoader(QADataset(val_dataset), batch_size=16)

# Collect predictions and labels
predictions = []
labels = []
attention_masks = []

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    for batch in eval_dataloader:
        # Get batch data
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        label_ids = batch["labels"].to(device)

        # Make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(logits.detach().cpu().numpy())
        labels.append(label_ids.detach().cpu().numpy())
        attention_masks.append(attention_mask.detach().cpu().numpy())

# Combine batches into arrays
predictions = np.concatenate(predictions, axis=0)
labels = np.concatenate(labels, axis=0)
attention_masks = np.concatenate(attention_masks, axis=0)


Using device: cuda


In [28]:
def compute_metrics(pred, attention_mask, dataset):
    predictions, labels = pred

    # Predicted labels
    predictions = np.argmax(predictions, axis=2)

    # Filter predictions and labels using attention_mask
    true_predictions = []
    true_labels = []

    for i, (prediction, label, mask) in enumerate(zip(predictions, labels, attention_masks)):
        # Filter by attention mask
        filtered_preds = [p for p, m in zip(prediction, mask) if m == 1]
        filtered_labels = [l for l, m in zip(label, mask) if m == 1]

        # Find start index of 'answer: ' for further filtering
        text = tokenizer.decode(dataset[i]["input_ids"], skip_special_tokens=True)
        answer_start_index = text.find("answer: ") + len("answer: ")

        # Apply additional filtering for tokens after 'answer: '
        offsets = dataset[i]["offset_mappings"]
        filtered_preds = [
            p for p, (start, _) in zip(filtered_preds, offsets) if start >= answer_start_index
        ]
        filtered_labels = [
            l for l, (start, _) in zip(filtered_labels, offsets) if start >= answer_start_index
        ]

        true_predictions.extend(filtered_preds)
        true_labels.extend(filtered_labels)

    # Compute metrics
    print(set(true_labels), set(true_predictions))
    f1_macro = f1_score(true_labels, true_predictions, average="macro")
    f1_micro = f1_score(true_labels, true_predictions, average="micro")
    iou = jaccard_score(true_labels, true_predictions, average="macro")

    return {
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "iou": iou,
    }


In [39]:
from sklearn.metrics import classification_report, f1_score, jaccard_score

def compute_metrics(pred, attention_mask, dataset):
    predictions, labels = pred

    # Predicted labels
    predictions = np.argmax(predictions, axis=2)

    # Filter predictions and labels using attention_mask
    true_predictions = []
    true_labels = []

    for i, (prediction, label, mask) in enumerate(zip(predictions, labels, attention_mask)):
        # Filter by attention mask
        filtered_preds = [p for p, m in zip(prediction, mask) if m == 1]
        filtered_labels = [l for l, m in zip(label, mask) if m == 1]

        # Find start index of 'answer: ' for further filtering
        text = tokenizer.decode(dataset[i]["input_ids"], skip_special_tokens=True)
        answer_start_index = text.find("answer: ") + len("answer: ")

        # Apply additional filtering for tokens after 'answer: '
        offsets = dataset[i]["offset_mappings"]
        filtered_preds = [
            p for p, (start, _) in zip(filtered_preds, offsets) if start >= answer_start_index
        ]
        filtered_labels = [
            l for l, (start, _) in zip(filtered_labels, offsets) if start >= answer_start_index
        ]

        true_predictions.extend(filtered_preds)
        true_labels.extend(filtered_labels)

    # Compute overall metrics
    true_labels = [min(1, i) for i in true_labels]
    f1_macro = f1_score(true_labels, true_predictions, average="macro")
    f1_micro = f1_score(true_labels, true_predictions, average="micro")
    iou = jaccard_score(true_labels, true_predictions, average="macro")

    # Detailed class-specific metrics
    class_report = classification_report(true_labels, true_predictions, output_dict=True)
    print(classification_report(true_labels, true_predictions))

    return {
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "iou": iou,
        "class_report": class_report,
    }


In [40]:
# Pack predictions, labels, and attention_mask into the expected format
pred = (predictions, labels)
#print(predictions)
metrics = compute_metrics(pred, attention_masks, QADataset(val_dataset))

# Print the results
print(f"F1 Macro: {metrics['f1_macro']}")
print(f"F1 Micro: {metrics['f1_micro']}")
print(f"IoU: {metrics['iou']}")
#print(f"Report: {metrics['class_report']}")


              precision    recall  f1-score   support

           0       0.95      0.98      0.97     34944
           1       0.77      0.57      0.65      3911

    accuracy                           0.94     38855
   macro avg       0.86      0.77      0.81     38855
weighted avg       0.93      0.94      0.94     38855

F1 Macro: 0.8103671921170389
F1 Micro: 0.9396988804529661
IoU: 0.7108394632004718


In [None]:
QADataset(train_dataset)[2]

In [30]:
train_data[2]

{'knowledge': 'Robert Albert Diaco (born February 19, 1973) is an American football coach and former player. He is currently the defensive coordinator at Nebraska. Nebraska also has the most wins and the highest winning percentage of any program over the last 50 years.',
 'question': 'What position does Bob Diaco hold with the football team hat has the most wins and the highest winning percentage of any program over the last 50 years?',
 'right_answer': 'defensive coordinator',
 'hallucinated_answer': 'Bob Diaco is a quarterback.',
 'hallucination': '"quarterback"'}