https://youtu.be/u--UVvH-LIQ

# Выбираем пакет данных для обучения

In [17]:
from datasets import load_dataset

# Question-answering dataset (SQuAD).
# Extractive QA: a practice task for models that must find the answer
# inside the given context.
dataset_squad = load_dataset("squad")
dataset_squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [18]:
# Show the first raw training record.
dataset_squad["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

Разбиваю часть Train на Train/Test

In [19]:
# Split the original train split into train/test exactly once.
# The DatasetDict is updated in place, so without this guard re-running the
# cell would split the already reduced train split again and shrink it further.
if "test" not in dataset_squad:
    # 1. Take the existing train split.
    train_data = dataset_squad["train"]

    # 2. Split it in two parts.
    # test_size=0.1 holds out 10% of the rows; seed=42 keeps the split reproducible.
    test_split = train_data.train_test_split(test_size=0.1, seed=42)

    # 3. Store both parts back into the original DatasetDict.
    dataset_squad["train"] = test_split["train"]
    dataset_squad["test"] = test_split["test"]

dataset_squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 78839
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8760
    })
})

In [20]:

# pandas gives a convenient tabular preview of the train split.
datframe_squad = dataset_squad["train"].to_pandas()
datframe_squad.head()

Unnamed: 0,id,title,context,question,answers
0,57263127ec44d21400f3dbf9,Korean_War,After the formation of the People's Republic o...,To show their strength in the international Co...,"{'text': ['promoted Communist revolutions'], '..."
1,5733f93e4776f41900661602,"Punjab,_Pakistan",There are 48 departments in Punjab government....,Who heads each government department?,{'text': ['a Provincial Minister (Politician) ...
2,56bfae97a10cfb1400551236,Beyoncé,Beyoncé and husband Jay Z are friends with Pre...,What did they attend in July 2013?,"{'text': ['a rally'], 'answer_start': [840]}"
3,56bfa087a10cfb14005511da,Beyoncé,"On January 7, 2012, Beyoncé gave birth to her ...",What was the child's name?,"{'text': ['Blue Ivy Carter'], 'answer_start': ..."
4,570ceadbfed7b91900d45ad5,Gymnastics,General gymnastics enables people of all ages ...,What kind of routines do general gymnastic gro...,"{'text': ['synchronized, choreographed routine..."


# Токенизация данных
BERT — это популярная модель, но существуют более лёгкие и быстрые модели с сопоставимой точностью, и лучше использовать их (здесь — MiniLM).
https://youtu.be/u--UVvH-LIQ?t=563

In [21]:
from transformers import AutoTokenizer

# Checkpoint id shared by the tokenizer here and the model loaded later.
modelcheckpoint = "microsoft/miniLM-L12-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(modelcheckpoint)

In [22]:
# Example: raw text before tokenization.
# Slice the rows first so only one record is read, instead of materializing
# the entire "context" column just to keep its first element.
test1 = dataset_squad["train"][:1]["context"]
print(test1)
# Example: the same text after tokenization.
test2 = tokenizer(test1)
print(test2)

["After the formation of the People's Republic of China in 1949, the Chinese government named the Western nations, led by the United States, as the biggest threat to its national security. Basing this judgment on China's century of humiliation beginning in the early 19th century, American support for the Nationalists during the Chinese Civil War, and the ideological struggles between revolutionaries and reactionaries, the Chinese leadership believed that China would become a critical battleground in the United States' crusade against Communism. As a countermeasure and to elevate China's standing among the worldwide Communist movements, the Chinese leadership adopted a foreign policy that actively promoted Communist revolutions throughout territories on China's periphery."]
{'input_ids': [[101, 2044, 1996, 4195, 1997, 1996, 2111, 1005, 1055, 3072, 1997, 2859, 1999, 4085, 1010, 1996, 2822, 2231, 2315, 1996, 2530, 3741, 1010, 2419, 2011, 1996, 2142, 2163, 1010, 2004, 1996, 5221, 5081, 200

In [23]:
# Before preprocessing: the first raw record of the test split.
dataset_squad['test'][0]

{'id': '573173d8497a881900248f0c',
 'title': 'Egypt',
 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.',
 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?',
 'answers': {'text': ['84%'], 'answer_start': [468]}}

In [24]:
from datasets import DatasetDict

# Tokenizes training data and attaches answer-span labels.
def preprocess_training_examples(examples):
    """Tokenize question/context pairs and label each feature with its answer span.

    Contexts longer than ``max_length=384`` tokens are split into overlapping
    features (``stride=128``), so one example may yield several features.

    Args:
        examples: A batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns. Each answer is a mapping with parallel
            ``"text"`` and ``"answer_start"`` lists, where ``answer_start``
            holds character offsets into the context.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` /
        ``end_positions`` added (one token index per feature). Both labels
        are 0 when the feature has no context tokens, the example has no
        answer, or the answer is not fully contained in the feature.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps every produced feature back to the example it was cut from.
    sample_map = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[sample_map[i]]

        # Token indices that belong to the context (sequence id 1).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_indices = [j for j, sid in enumerate(sequence_ids) if sid == 1]

        # No context tokens or no annotated answer: label the feature (0, 0).
        if not context_indices or not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start, context_end = context_indices[0], context_indices[-1]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            # The answer lies outside the context window of this feature.
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Find the start token. The scan must stop at context_end: the tokens
        # after the context (separator, padding) carry a (0, 0) offset, which
        # would satisfy `<= start_char` and drag the label to the end of the
        # padded sequence when the answer starts at the last context token.
        start_token_idx = context_start
        while start_token_idx <= context_end and offsets[start_token_idx][0] <= start_char:
            start_token_idx += 1
        start_positions.append(start_token_idx - 1)

        # Find the end token by scanning backwards from the last context token.
        end_token_idx = context_end
        while end_token_idx >= context_start and offsets[end_token_idx][1] >= end_char:
            end_token_idx -= 1
        end_positions.append(end_token_idx + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

def preprocess_evaluation_examples(examples):
    """Tokenize evaluation examples for both loss computation and post-processing.

    Besides the tokenizer output, every produced feature receives:

    * ``example_id`` - id of the example the feature was cut from;
    * ``offset_mapping`` - character offsets, with every token that is not
      part of the context replaced by ``None`` so post-processing can skip it;
    * ``start_positions`` / ``end_positions`` - answer token labels, (0, 0)
      when the batch has no usable answer or the answer is outside the feature.

    Args:
        examples: A batch mapping with ``"id"``, ``"question"`` and
            ``"context"`` columns and, optionally, ``"answers"``.

    Returns:
        The tokenizer output extended with the fields described above
        (``overflow_to_sample_mapping`` is removed).
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    context_texts = examples["context"]
    # Looked up defensively: the answers column may be missing.
    gold_answers = examples.get("answers", None)

    # return_offsets_mapping=True is mandatory: offsets drive everything below.
    encoded = tokenizer(
        stripped_questions,
        context_texts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the source example inside this batch.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded["offset_mapping"]

    example_ids, start_labels, end_labels = [], [], []
    encoded["example_id"] = example_ids
    encoded["start_positions"] = start_labels  # required for the validation loss
    encoded["end_positions"] = end_labels      # required for the validation loss

    feature_count = len(encoded["input_ids"])
    for feature_idx in range(feature_count):
        example_idx = feature_to_example[feature_idx]

        # 1. Remember which example this feature belongs to.
        example_ids.append(examples["id"][example_idx])

        # 2. Keep offsets only for context tokens (sequence id 1).
        seq_ids = encoded.sequence_ids(feature_idx)
        masked_offsets = [
            span if seq_ids[pos] == 1 else None
            for pos, span in enumerate(all_offsets[feature_idx])
        ]
        all_offsets[feature_idx] = masked_offsets

        # 3. Derive the labels; (0, 0) is the fallback for every "no answer" case.
        label_start, label_end = 0, 0
        if gold_answers and gold_answers[example_idx].get("answer_start"):
            answer = gold_answers[example_idx]
            answer_begin = answer["answer_start"][0]
            answer_stop = answer_begin + len(answer["text"][0])

            context_positions = [pos for pos, seq_id in enumerate(seq_ids) if seq_id == 1]
            first_ctx = context_positions[0] if context_positions else None
            last_ctx = context_positions[-1] if context_positions else None

            answer_inside = (
                first_ctx is not None
                and masked_offsets[first_ctx] is not None
                and masked_offsets[first_ctx][0] <= answer_begin
                and masked_offsets[last_ctx][1] >= answer_stop
            )
            if answer_inside:
                # Scan forward to the token in which the answer starts.
                cursor = first_ctx
                while (
                    cursor < len(masked_offsets)
                    and masked_offsets[cursor] is not None
                    and masked_offsets[cursor][0] <= answer_begin
                ):
                    cursor += 1
                label_start = cursor - 1

                # Scan backward to the token in which the answer ends.
                cursor = last_ctx
                while (
                    cursor >= first_ctx
                    and masked_offsets[cursor] is not None
                    and masked_offsets[cursor][1] >= answer_stop
                ):
                    cursor -= 1
                label_end = cursor + 1

        start_labels.append(label_start)
        end_labels.append(label_end)

    return encoded

# Tokenize the training split (labels only; raw columns are dropped).
tokenized_train = dataset_squad["train"].map(
    preprocess_training_examples,
    remove_columns=dataset_squad["train"].column_names,
    batched=True,
)

# Tokenize the validation and test splits with the evaluation preprocessor,
# which additionally keeps example ids and offsets.
tokenized_val, tokenized_test = (
    dataset_squad[split_name].map(
        preprocess_evaluation_examples,
        remove_columns=dataset_squad[split_name].column_names,
        batched=True,
    )
    for split_name in ("validation", "test")
)

# Bundle the tokenized splits under the same names as the raw ones.
tokenized_dataset = DatasetDict({
    "train": tokenized_train,
    "validation": tokenized_val,
    "test": tokenized_test,
})

print(tokenized_dataset)

Map: 100%|██████████| 78839/78839 [00:21<00:00, 3678.79 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 79675
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions'],
        num_rows: 8849
    })
})





In [25]:
# After preprocessing: the first tokenized training feature.
tokenized_dataset['train'][0]

{'input_ids': [101,
  2000,
  2265,
  2037,
  3997,
  1999,
  1996,
  2248,
  4750,
  2929,
  1010,
  2054,
  2106,
  2859,
  2079,
  1029,
  102,
  2044,
  1996,
  4195,
  1997,
  1996,
  2111,
  1005,
  1055,
  3072,
  1997,
  2859,
  1999,
  4085,
  1010,
  1996,
  2822,
  2231,
  2315,
  1996,
  2530,
  3741,
  1010,
  2419,
  2011,
  1996,
  2142,
  2163,
  1010,
  2004,
  1996,
  5221,
  5081,
  2000,
  2049,
  2120,
  3036,
  1012,
  6403,
  2290,
  2023,
  8689,
  2006,
  2859,
  1005,
  1055,
  2301,
  1997,
  21171,
  2927,
  1999,
  1996,
  2220,
  3708,
  2301,
  1010,
  2137,
  2490,
  2005,
  1996,
  17934,
  2076,
  1996,
  2822,
  2942,
  2162,
  1010,
  1998,
  1996,
  17859,
  11785,
  2090,
  24517,
  1998,
  4668,
  12086,
  1010,
  1996,
  2822,
  4105,
  3373,
  2008,
  2859,
  2052,
  2468,
  1037,
  4187,
  2645,
  16365,
  1999,
  1996,
  2142,
  2163,
  1005,
  16282,
  2114,
  15523,
  1012,
  2004,
  1037,
  4675,
  4168,
  3022,
  5397,
  1998,
  2000,
  34

In [None]:
# After preprocessing: the first tokenized validation feature.
tokenized_dataset['validation'][0]

# Метрики

In [26]:
# Для метрик
import collections 
import numpy as np
import evaluate # Загрузка метрики

# Post-processing function.
# Maps the logits back onto the original texts, picks the best answer span
# for every example and converts it to text.
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    """Convert start/end logits into one answer string per example.

    Args:
        examples: The raw examples. Must support column access
            (``examples["id"]``) and iteration over rows that provide
            ``"id"`` and ``"context"``.
        features: The tokenized features, in the same order as the logits.
            Each feature provides ``"example_id"`` and ``"offset_mapping"``,
            where tokens outside the context have a ``None`` offset.
        raw_predictions: A pair ``(all_start_logits, all_end_logits)`` with
            one row of logits per feature.
        n_best_size: How many top start and top end positions are combined.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        An ``OrderedDict`` mapping example id to the predicted answer text
        (an empty string when no valid span was found).

    Raises:
        ValueError: If the number of logit rows differs from the number of
            features; the logits would otherwise be paired with the wrong
            features without any visible error.
    """
    all_start_logits, all_end_logits = raw_predictions

    feature_count = len(features)
    if len(all_start_logits) != feature_count or len(all_end_logits) != feature_count:
        raise ValueError(
            f"Got {len(all_start_logits)} start / {len(all_end_logits)} end "
            f"predictions for {feature_count} features."
        )

    # Group the tokenized features (windows) by the example they came from.
    example_id_to_index = {example_id: index for index, example_id in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(feature_index)

    predictions = collections.OrderedDict()

    for example_index, example in enumerate(examples):
        context = example["context"]
        valid_answers = []

        # Visit every window that was cut from this example.
        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the n_best_size highest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # 1. The end must not precede the start.
                    if end_index < start_index:
                        continue
                    # 2. The span must not exceed max_answer_length tokens.
                    if end_index - start_index + 1 > max_answer_length:
                        continue

                    # 3. Both ends must be context tokens: preprocessing set
                    #    the offsets of all other tokens to None.
                    start_char_span = offset_mapping[start_index]
                    end_char_span = offset_mapping[end_index]
                    if start_char_span is None or end_char_span is None:
                        continue

                    # Cut the answer text out of the context by character offsets.
                    valid_answers.append({
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char_span[0]:end_char_span[1]],
                        "start_logit": start_logits[start_index],
                        "end_logit": end_logits[end_index],
                    })

        if valid_answers:
            # max() returns the first highest-scoring candidate, the same
            # element a stable descending sort would place first.
            best_answer = max(valid_answers, key=lambda candidate: candidate["score"])
            predictions[example["id"]] = best_answer["text"]
        else:
            # No valid span was found: return an empty string.
            predictions[example["id"]] = ""

    return predictions

# Keep references to the tokenized and the raw validation data
# (both are needed for post-processing) and load the metric.
validation_features = tokenized_dataset["validation"]
validation_set = dataset_squad["validation"] 
metric = evaluate.load("squad") 

import traceback

# Factory for the compute_metrics callback (wrapped in try/except for robustness).
def make_qa_compute_metrics(raw_examples, tokenized_features, squad_metric):
    """Build a ``compute_metrics`` callback bound to one evaluation set.

    Args:
        raw_examples: The raw examples the predictions are scored against;
            every row provides ``"id"`` and ``"answers"``.
        tokenized_features: The tokenized features of the same examples, as
            expected by ``postprocess_qa_predictions``.
        squad_metric: An object whose ``compute(predictions=..., references=...)``
            method returns the metric dictionary.

    Returns:
        A function that takes an object with a ``predictions`` attribute
        holding ``(start_logits, end_logits)`` and returns the metric
        dictionary. If anything fails, it prints the error together with the
        full traceback and returns zeroed ``f1`` / ``exact_match`` so the
        surrounding run is not aborted.
    """

    def compute_qa_metrics(p):
        try:
            start_logits, end_logits = p.predictions

            predictions = postprocess_qa_predictions(
                examples=raw_examples,
                features=tokenized_features,
                raw_predictions=(start_logits, end_logits),
                n_best_size=20,
                max_answer_length=30
            )

            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
            references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_examples]

            return squad_metric.compute(predictions=formatted_predictions, references=references)

        except Exception as e:
            # Report the failure with its traceback: zeroed metrics on their
            # own are indistinguishable from a model that predicts nothing right.
            print(f"\n!!! ОШИБКА ВЫЧИСЛЕНИЯ МЕТРИК: {e} !!!")
            traceback.print_exc()
            return {"f1": 0.0, "exact_match": 0.0}

    return compute_qa_metrics

# Build the callback that will be passed to the Trainer; it is bound to the
# raw and the tokenized validation data and to the loaded metric.
qa_compute_metrics_fn = make_qa_compute_metrics(
    validation_set, 
    validation_features, 
    metric
)

# Сбор всех данных воедино

In [27]:
# 1. Prepare the model and its dependencies.
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer  # question-answering model class plus the training utilities

# Load the checkpoint with a question-answering head on top.
model = AutoModelForQuestionAnswering.from_pretrained(modelcheckpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/miniLM-L12-H384-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# 2. Define the training arguments.
training_args = TrainingArguments(
    output_dir="local_models/minilm_squad",  # Directory where the results are saved
    eval_strategy="epoch",          # Evaluate after every epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for validation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Regularization
    push_to_hub=False,              # Do not upload to the Hugging Face Hub
    
)

In [29]:
# 3. Build the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=qa_compute_metrics_fn,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `processing_class` replaces the deprecated `tokenizer=` argument; passing
    # the tokenizer here still lets the Trainer save it alongside the model.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [30]:
# Start training (video reference: https://youtu.be/u--UVvH-LIQ?t=1068)
trainer.train()

Epoch,Training Loss,Validation Loss,Exact Match,F1
1,1.0142,0.969763,81.02176,88.863876
2,0.8235,0.92608,83.150426,90.226364
3,0.7165,0.931661,83.472091,90.456945


TrainOutput(global_step=14940, training_loss=0.9447985811246288, metrics={'train_runtime': 2902.5403, 'train_samples_per_second': 82.35, 'train_steps_per_second': 5.147, 'total_flos': 1.17275044876416e+16, 'train_loss': 0.9447985811246288, 'epoch': 3.0})

| Метрика | Значение | Интерпретация |
| --- | --- | --- |
| **Training Loss** | **1.0142** | Уровень ошибки модели на обучающем наборе данных (значение после 1-й эпохи). Это число снижается с каждой эпохой: 1.0142 → 0.8235 → 0.7165. |
| **Validation Loss** | **0.969763** | Уровень ошибки модели на **валидационном** наборе данных после 1-й эпохи. На этом этапе оно **меньше**, чем `Training Loss`, то есть модель **не переобучается** (пока) и хорошо обобщает данные. К 3-й эпохе `Validation Loss` (0.931661) уже выше `Training Loss` (0.7165) и немного вырос по сравнению со 2-й эпохой (0.92608) — это ранний признак начинающегося переобучения. |
| **Exact Match (EM)** | **81.02%** | После 1-й эпохи в **81.02%** случаев предсказанный ответ **абсолютно точно** совпал с одним из правильных ответов золотого стандарта (после 3-й эпохи — 83.47%). |
| **F1 Score** | **88.86%** | Среднее гармоническое между точностью и полнотой совпадения слов (после 3-й эпохи — 90.46%). Это высокий показатель, говорящий о том, что модель очень хорошо находит правильные границы ответа в контексте. |

# Сохранение модели

In [31]:
# Save the fine-tuned model, then the tokenizer files, into the same directory.
trainer.save_model("local_models/minilm_squad")
tokenizer.save_pretrained("local_models/minilm_squad")

('local_models/minilm_squad\\tokenizer_config.json',
 'local_models/minilm_squad\\special_tokens_map.json',
 'local_models/minilm_squad\\vocab.txt',
 'local_models/minilm_squad\\added_tokens.json',
 'local_models/minilm_squad\\tokenizer.json')

In [32]:
# Run the trained model over the tokenized validation split and keep the result.
raw_predictions = trainer.predict(tokenized_dataset["validation"])

# Использование обученной модели

In [33]:
# Baseline: the original checkpoint, before fine-tuning.

from transformers import pipeline

qa_orig_pipeline = pipeline(
    "question-answering",
    model=modelcheckpoint,
    tokenizer=modelcheckpoint
)

# Example context, written inline here.
# NOTE(review): the two adjacent literals are concatenated without a separating
# space, so the text reads "...since 1997.The city..."; character positions
# reported for the answer refer to this exact string.
context_text = (
    "Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997."
    "The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower."
)

question_text = "Near which river is the city located?"

# Run inference.
result = qa_orig_pipeline(
    question=question_text,
    context=context_text
)

# Print the result.
print(f"Контекст: {context_text}")
print(f"Вопрос: {question_text}")
print("---")
print(f"Ответ модели: {result['answer']}")
print(f"Оценка уверенности (Score): {result['score']:.4f}")
print(f"Позиция ответа (Start/End): {result['start']}-{result['end']}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/miniLM-L12-H384-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Контекст: Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997.The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower.
Вопрос: Near which river is the city located?
---
Ответ модели: formerly known as Nur-Sultan
Оценка уверенности (Score): 0.0007
Позиция ответа (Start/End): 8-36


In [34]:
# Fine-tuned model, loaded from the local directory it was saved to.

from transformers import pipeline

modelcheckpoint_new = "local_models/minilm_squad"
qa_pipeline = pipeline(
    "question-answering",
    model=modelcheckpoint_new,
    tokenizer=modelcheckpoint_new
)

# Example context, written inline here.
# NOTE(review): the two adjacent literals are concatenated without a separating
# space, so the text reads "...since 1997.The city..."; character positions
# reported for the answer refer to this exact string.
context_text = (
    "Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997."
    "The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower."
)

question_text = "Since what year has Astana been the capital of Kazakhstan?"

# Run inference.
result = qa_pipeline(
    question=question_text,
    context=context_text
)

# Print the result.
print(f"Контекст: {context_text}")
print(f"Вопрос: {question_text}")
print("---")
print(f"Ответ модели: {result['answer']}")
print(f"Оценка уверенности (Score): {result['score']:.4f}")
print(f"Позиция ответа (Start/End): {result['start']}-{result['end']}")

Device set to use cuda:0


Контекст: Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997.The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower.
Вопрос: Since what year has Astana been the capital of Kazakhstan?
---
Ответ модели: 1997
Оценка уверенности (Score): 0.9829
Позиция ответа (Start/End): 79-83


In [35]:
# Fine-tuned model again, asked the same question as the baseline pipeline.

from transformers import pipeline

modelcheckpoint_new = "local_models/minilm_squad"
qa_pipeline = pipeline(
    "question-answering",
    model=modelcheckpoint_new,
    tokenizer=modelcheckpoint_new
)

# Example context, written inline here.
# NOTE(review): the two adjacent literals are concatenated without a separating
# space, so the text reads "...since 1997.The city..."; character positions
# reported for the answer refer to this exact string.
context_text = (
    "Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997."
    "The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower."
)

question_text = "Near which river is the city located?"

# Run inference.
result = qa_pipeline(
    question=question_text,
    context=context_text
)

# Print the result.
print(f"Контекст: {context_text}")
print(f"Вопрос: {question_text}")
print("---")
print(f"Ответ модели: {result['answer']}")
print(f"Оценка уверенности (Score): {result['score']:.4f}")
print(f"Позиция ответа (Start/End): {result['start']}-{result['end']}")

Device set to use cuda:0


Контекст: Astana, formerly known as Nur-Sultan, has been the capital of Kazakhstan since 1997.The city is located on the banks of the Ishim River and is known for its futuristic architecture, including the Baiterek Tower.
Вопрос: Near which river is the city located?
---
Ответ модели: Ishim River
Оценка уверенности (Score): 0.7141
Позиция ответа (Start/End): 124-135
