# Stanford Question Answering Dataset (SQuAD)

In [None]:
# Загрузка датасета из файла JSON
with open('data/aqa_v1.0/1_dbidaf/train.json', 'r', encoding='utf-8') as file:
    data_t = json.load(file)

# Преобразование данных в плоский формат
records = []
for item in data_t['data']:
    # title = item['title']
    for paragraph in item['paragraphs']:
        # context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            for answer in qa['answers']:
                record = {
                    # 'title': title,
                    # 'context': context,
                    'question': question,
                    # 'answer_start': answer['answer_start'],
                    'answer_text': answer['text']
                }
                records.append(record)

# Создание DataFrame
df = pd.DataFrame(records)

# Вывод первых 10 строк для проверки
df.head(10)

# Cagle wiki questions dataset

In [None]:
# with open('data/qas/S08_question_answer_pairs.txt', 'r', encoding='utf-8') as file:
#     data_t = json.load(file)
file_path = 'data/qas/S08_question_answer_pairs.txt'
# Создание DataFrame
# Создание DataFrame
df = pd.read_csv(file_path, delimiter='\t')

# Выбор только необходимых столбцов
df_filtered = df[['Question', 'Answer']]

df_filtered = df_filtered.drop_duplicates(subset=['Question'])

# Вывод первых 10 строк для проверки
df_filtered.head(15)

In [None]:
df_filtered['Answer'] = df_filtered['Answer'].str.lower().str.strip('.')

# Подсчет количества строк с ответом "yes" и "no"
answer_counts = df_filtered['Answer'].value_counts()

# Вывод результатов
yes_count = answer_counts.get('yes', 0)
no_count = answer_counts.get('no', 0)
total_rows = df_filtered.shape[0]

# Вывод общего количества строк

total_rows, yes_count, no_count

In [None]:
df_filtered['AnswerLength'] = df_filtered['Answer'].str.len()

# Сортировка датафрейма по длине ответов и выбор 5 самых длинных
longest_answers = df_filtered.sort_values(by='AnswerLength', ascending=False).head(15)

# Вывод 5 самых длинных ответов
longest_answers[['Question', 'Answer']]

In [None]:

# Функция для загрузки и предварительной обработки данных из файла
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path, delimiter='\t', encoding='ISO-8859-1')
    df_filtered = df[['Question', 'Answer']]
    return df_filtered.drop_duplicates(subset=['Question'])

# Пути к файлам
file_paths = ['data/qas/S08_question_answer_pairs.txt', 'data/qas/S09_question_answer_pairs.txt', 'data/qas/S10_question_answer_pairs.txt']

# Загрузка и объединение данных из всех файлов
combined_df = pd.concat([load_and_preprocess(file_path) for file_path in file_paths], ignore_index=True)

# Удаление дубликатов после объединения
combined_df = combined_df.drop_duplicates(subset=['Question'])

# Вывод первых 10 строк для проверки
combined_df.head(10)

In [None]:
# Подсчет количества строк с ответом "yes" и "no"
answer_counts = df_filtered['Answer'].value_counts()

# Вывод результатов
yes_count = answer_counts.get('yes', 0)
no_count = answer_counts.get('no', 0)
total_rows = df_filtered.shape[0]

# Вывод общего количества строк

total_rows, yes_count, no_count

In [3]:
import pandas as pd
from openai import OpenAI

In [4]:
# Функция для загрузки и предварительной обработки данных из файла
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path, delimiter='\t', encoding='ISO-8859-1')
    df_filtered = df[['Question', 'Answer']]
    return df_filtered.drop_duplicates(subset=['Question'])

In [20]:
file_paths = ['data/qas/S08_quest000ion_answer_pairs.txt', 'data/qas/S09_question_answer_pairs.txt', 'data/qas/S10_question_answer_pairs.txt']

combined_df = pd.concat([load_and_preprocess(file_path) for file_path in file_paths], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset=['Question'])
combined_df = combined_df.loc[combined_df['Answer'].str.lower().isin(['yes', 'no'])]

combined_df.head(10)

Unnamed: 0,Question,Answer
0,Was Abraham Lincoln the sixteenth President of...,yes
1,Did Lincoln sign the National Banking Act of 1...,yes
2,Did his mother die of pneumonia?,no
8,Did Lincoln beat John C. Breckinridge in the 1...,yes
9,Was Abraham Lincoln the first President of the...,No
10,Did Lincoln start his political career in 1832?,Yes
11,Did Lincoln ever represent Alton & Sangamon Ra...,Yes
15,Did Lincoln win the election of 1860?,Yes
51,Is it true that he became a professor in 1820?,yes
52,Was Lorenzo Romano Amedeo Carlo Avogadro an It...,yes


In [24]:
print(f"Количество вопросов: {len(combined_df)}")

Количество вопросов: 787


In [None]:
# Настройка клиента OpenAI для работы с локальным сервером
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
output_path = 'data/qas/combined_dataset_with_responses_t1.json'

In [30]:
# Функция для получения ответа от модели
def get_model_response(question):
    try:
        completion = client.chat.completions.create(
            model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
            
            messages=[
                {"role": "system", "content": "You are a helpful assistant. You have to provide complete answer without explicitly saying 'yes' or 'no'."},
                {"role": "user", "content": question}
            ],
            temperature=1,
        )
        response = completion.choices[0].message.content
        return response
    except Exception as e:
        print(f"Error with question: {question}\n{e}\n")
        return "Error"

In [32]:
try:
    existing_df = pd.read_json(output_path, orient='records', lines=True)
    combined_df = combined_df.merge(existing_df, on=['Question', 'Answer'], how='left')
except FileNotFoundError:
    combined_df['ModelResponse'] = None

for index, row in combined_df.iterrows():
    if pd.isna(row['ModelResponse']):
        combined_df.at[index, 'ModelResponse'] = get_model_response(row['Question'])
        combined_df.to_json(output_path, orient='records', lines=True)
        print(f"Processed {index + 1}/{len(combined_df)}")

print(f"Итоговый датасет сохранен в {output_path}")

Processed 384/787
Processed 385/787
Processed 386/787
Processed 387/787
Processed 388/787
Processed 389/787
Processed 390/787
Processed 391/787
Processed 392/787
Processed 393/787
Processed 394/787
Processed 395/787
Processed 396/787
Processed 397/787
Processed 398/787
Processed 399/787
Processed 400/787
Processed 401/787
Processed 402/787
Processed 403/787
Processed 404/787
Processed 405/787
Processed 406/787
Processed 407/787
Processed 408/787
Processed 409/787
Processed 410/787
Processed 411/787
Processed 412/787
Processed 413/787
Processed 414/787
Processed 415/787
Processed 416/787
Processed 417/787
Processed 418/787
Processed 419/787
Processed 420/787
Processed 421/787
Processed 422/787
Processed 423/787
Processed 424/787
Processed 425/787
Processed 426/787
Processed 427/787
Processed 428/787
Processed 429/787
Processed 430/787
Processed 431/787
Processed 432/787
Processed 433/787
Processed 434/787
Processed 435/787
Processed 436/787
Processed 437/787
Processed 438/787
Processed 

In [34]:
combined_df

Unnamed: 0,Question,Answer,ModelResponse_x,ModelResponse_y,ModelResponse
0,Was Abraham Lincoln the sixteenth President of...,yes,"Yes, Abraham Lincoln was indeed the 16th Presi...","Yes, Abraham Lincoln was indeed the 16th Presi...","Yes, Abraham Lincoln was indeed the 16th Presi..."
1,Did Lincoln sign the National Banking Act of 1...,yes,"No, Abraham Lincoln did not sign the National ...","No, Abraham Lincoln did not sign the National ...","No, Abraham Lincoln did not sign the National ..."
2,Did his mother die of pneumonia?,no,"I apologize, but there is no information provi...","I apologize, but there is no information provi...","I apologize, but there is no information provi..."
3,Did Lincoln beat John C. Breckinridge in the 1...,yes,"Actually, Abraham Lincoln did not face John C....","Actually, Abraham Lincoln did not face John C....","Actually, Abraham Lincoln did not face John C...."
4,Was Abraham Lincoln the first President of the...,No,"No, Abraham Lincoln was not the first Presiden...","No, Abraham Lincoln was not the first Presiden...","No, Abraham Lincoln was not the first Presiden..."
...,...,...,...,...,...
782,Was it likely that the xylophone reached Europ...,yes,,,"The xylophone, an ancient instrument originati..."
783,Does kelon increase or decrease tone quality o...,no,,,"Kelon, which is a type of tuning material used..."
784,Do the different species of zebras interbreed?,no,,,"The three main species of zebras, the plains z..."
785,Do zebras sleep standing up?,yes,,,"Zebras, being equines, do not typically sleep ..."


In [57]:
# Загрузка двух файлов с разными температурами
df1 = pd.read_json('data/qas/combined_dataset_with_responses_t1.json', orient='records', lines=True)
df2 = pd.read_json('data/qas/combined_dataset_with_responses.json', orient='records', lines=True)

# Проверка существующих столбцов и удаление лишних
columns_to_drop = ['ModelResponse_x', 'ModelResponse_y']
for col in columns_to_drop:
    if col in df1.columns:
        df1 = df1.drop(columns=[col])
    if col in df2.columns:
        df2 = df2.drop(columns=[col])

# Объединение датафреймов
combined_df = pd.concat([df1, df2], ignore_index=True)


# Сохранение объединенного датафрейма
combined_output_path = 'data/qas/combined_dataset.json'
combined_df.to_json(combined_output_path, orient='records', lines=True)
print(f"Объединенный датасет сохранен в {combined_output_path}")

Объединенный датасет сохранен в data/qas/combined_dataset.json


In [58]:
len(df1), len(df2), len(combined_df)

(787, 787, 1574)

In [59]:
combined_df

Unnamed: 0,Question,Answer,ModelResponse
0,Was Abraham Lincoln the sixteenth President of...,yes,"Yes, Abraham Lincoln was indeed the 16th Presi..."
1,Did Lincoln sign the National Banking Act of 1...,yes,"No, Abraham Lincoln did not sign the National ..."
2,Did his mother die of pneumonia?,no,"I apologize, but there is no information provi..."
3,Did Lincoln beat John C. Breckinridge in the 1...,yes,"Actually, Abraham Lincoln did not face John C...."
4,Was Abraham Lincoln the first President of the...,No,"No, Abraham Lincoln was not the first Presiden..."
...,...,...,...
1569,Was it likely that the xylophone reached Europ...,yes,It's possible that the xylophone may have reac...
1570,Does kelon increase or decrease tone quality o...,no,Kelvon (not Kelon) is not a specific type of x...
1571,Do the different species of zebras interbreed?,no,"Yes, the three main species of zebras - plains..."
1572,Do zebras sleep standing up?,yes,"No, zebras do not sleep standing up. Like many..."


In [64]:
# Загрузка промежуточных данных, если файл существует
output_path = 'data/qas/combined_dataset_with_responses_and_classification.json'
try:
    existing_df = pd.read_json(output_path, orient='records', lines=True)
    # Приведение столбца ModelResponse к строковому типу данных
    existing_df['ModelResponse'] = existing_df['ModelResponse'].astype(str)
    combined_df = combined_df.merge(existing_df, on=['Question', 'Answer', 'ModelResponse'], how='left')
    combined_df['Classification'] = combined_df['Classification'].fillna(existing_df['Classification'])
except FileNotFoundError:
    combined_df['Classification'] = None

In [65]:
combined_df

Unnamed: 0,Question,Answer,ModelResponse,Classification
0,Was Abraham Lincoln the sixteenth President of...,yes,"Yes, Abraham Lincoln was indeed the 16th Presi...",
1,Did Lincoln sign the National Banking Act of 1...,yes,"No, Abraham Lincoln did not sign the National ...",
2,Did his mother die of pneumonia?,no,"I apologize, but there is no information provi...",
3,Did Lincoln beat John C. Breckinridge in the 1...,yes,"Actually, Abraham Lincoln did not face John C....",
4,Was Abraham Lincoln the first President of the...,No,"No, Abraham Lincoln was not the first Presiden...",
...,...,...,...,...
1579,Was it likely that the xylophone reached Europ...,yes,It's possible that the xylophone may have reac...,
1580,Does kelon increase or decrease tone quality o...,no,Kelvon (not Kelon) is not a specific type of x...,
1581,Do the different species of zebras interbreed?,no,"Yes, the three main species of zebras - plains...",
1582,Do zebras sleep standing up?,yes,"No, zebras do not sleep standing up. Like many...",


In [66]:
# Функция для классификации ответов
def classify_response(question, model_response):
    prompt = f"Question: {question}\nAnswer: {model_response}\nClassify the answer as 'yes', 'no', or 'neither'."
    try:
        completion = client.chat.completions.create(
            model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
            messages=[
                {"role": "system", "content": "You are a helpful assistant. You have to provide answer 'yes', 'no' or 'neither'."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
        )
        response = completion.choices[0].message.content.strip().lower()
        if 'yes' in response:
            return 'yes'
        elif 'no' in response:
            return 'no'
        else:
            return 'neither'
    except Exception as e:
        print(f"Error with question: {question}\n{e}\n")
        return "neither"

# Применение функции к каждому вопросу и ответу в датафрейме и сохранение результатов
for index, row in combined_df.iterrows():
    if pd.isna(row['Classification']):
        combined_df.at[index, 'Classification'] = classify_response(row['Question'], row['ModelResponse'])
        combined_df.to_json(output_path, orient='records', lines=True)
        print(f"Processed {index + 1}/{len(combined_df)}")

# Сохранение итогового датафрейма в JSON
classified_output_path = 'data/qas/combined_dataset_with_classification.json'
combined_df.to_json(classified_output_path, orient='records', lines=True)

print(f"Итоговый датасет с классификацией сохранен в {classified_output_path}")

Processed 1/1584
Processed 2/1584
Processed 3/1584
Processed 4/1584
Processed 5/1584
Processed 6/1584
Processed 7/1584
Processed 8/1584
Processed 9/1584
Processed 10/1584
Processed 11/1584
Processed 12/1584
Processed 13/1584
Processed 14/1584
Processed 15/1584
Processed 16/1584
Processed 17/1584
Processed 18/1584
Processed 19/1584
Processed 20/1584
Processed 21/1584
Processed 22/1584
Processed 23/1584
Processed 24/1584
Processed 25/1584
Processed 26/1584
Processed 27/1584
Processed 28/1584
Processed 29/1584
Processed 30/1584
Processed 31/1584
Processed 32/1584
Processed 33/1584
Processed 34/1584
Processed 35/1584
Processed 36/1584
Processed 37/1584
Processed 38/1584
Processed 39/1584
Processed 40/1584
Processed 41/1584
Processed 42/1584
Processed 43/1584
Processed 44/1584
Processed 45/1584
Processed 46/1584
Processed 47/1584
Processed 48/1584
Processed 49/1584
Processed 50/1584
Processed 51/1584
Processed 52/1584
Processed 53/1584
Processed 54/1584
Processed 55/1584
Processed 56/1584
P