# Create training dataset

To create a training dataset, we extract each question-and-answer pair from the raw LLM outputs.

## Import packages

In [1]:
import pandas as pd
import pickle
import re

## Load data

In [2]:
# Load the pickled raw LLM outputs produced by an earlier step.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# Presumably a mapping of concept code -> raw text; verify against the producer.
with open('../../data/intermediate/qa.pkl','rb') as f:
    raw_data = pickle.load(f)

In [3]:
# One row per key of raw_data, with its value in a single 'text' column;
# the index is labelled 'concept_code'.
data = pd.DataFrame.from_dict(
    raw_data, orient='index', columns=['text']
).rename_axis('concept_code')

## Define extraction function

In [4]:
def extract_qa(text):
    """Split one raw LLM output into question/answer pairs.

    A question is the text following a ``Q<number>:`` label, up to and
    including the first ``?``; the question text cannot span a line break.
    An answer starts on a line beginning with ``A:`` or ``A<number>:`` and
    runs until the next line that begins with a ``Q<number>:`` label, or
    until the end of the text.

    Returns a list of ``{'question': ..., 'answer': ...}`` dicts, paired in
    order of appearance with surrounding whitespace stripped. When the
    number of questions and answers differs, returns ``-1`` instead.
    """
    question_re = re.compile(r'Q[0-9]+:\s*(.*?\?)', re.MULTILINE)
    answer_re = re.compile(r'^\s*A\d*:\s*(.+?)(?=^\s*Q\d+:|\Z)', re.MULTILINE | re.DOTALL)

    found_questions = question_re.findall(text)
    found_answers = answer_re.findall(text)

    # Unequal counts mean the text cannot be paired reliably; signal with -1.
    if len(found_questions) != len(found_answers):
        return -1

    pairs = []
    for question, answer in zip(found_questions, found_answers):
        pairs.append({'question': question.strip(), 'answer': answer.strip()})
    return pairs

In [5]:
# Parse every raw text into its list of Q&A dicts (or -1 when the counts differ).
data['qa'] = data['text'].apply(extract_qa)

In [6]:
# Give each Q&A pair its own row; the other columns and the index label repeat.
exploded_df = data.explode('qa')

In [7]:
# Keep only rows that hold a real Q&A pair. Texts with no pairs explode to NaN,
# and texts whose question/answer counts differ carry extract_qa's -1 sentinel;
# neither can be indexed by key in the next step, so drop both.
# astype(bool) guarantees a boolean mask even when the frame is empty, and
# .copy() makes the result an independent frame so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
exploded_df = exploded_df[
    exploded_df['qa'].apply(lambda x: isinstance(x, dict)).astype(bool)
].copy()

In [8]:
# Split each Q&A dict into separate 'question' and 'answer' columns.
exploded_df['question'] = exploded_df['qa'].map(lambda pair: pair['question'])
exploded_df['answer'] = exploded_df['qa'].map(lambda pair: pair['answer'])

## Creating Training Data

In [9]:
# Replace the repeated index with a fresh 0..n-1 range, then keep just the
# two text columns.
training_data = exploded_df.reset_index(drop=True)[['question', 'answer']]

In [None]:
# Preview the first 10 rows of the training data
training_data.head(10)

## Save Training Data

In [13]:
# Persist the training data in two formats: a pickle, and a CSV written
# without the index column (index=False).
training_data.to_pickle('../../data/training_data.pkl')
training_data.to_csv('../../data/training_data.csv', index=False)