# Generate Evaluation Q&A Pairs

This notebook selects a random subset of the training Q&A pairs and, for each selected entry, generates a single Q&A pair to be used for training evaluation.

## Import Packages

In [5]:
import pandas as pd
import pickle
import random
import anthropic

## Define Functions

In [6]:
def save_to_pickle(data, file_path):
    """
    Serialize an object to disk with pickle and report where it was written.

    Parameters:
    - data: The object to serialize.
    - file_path: Destination of the pickle file (e.g., '*.pkl'). The file is
      opened in binary write mode, so existing content is replaced.
    """
    with open(file_path, mode='wb') as handle:
        pickle.Pickler(handle).dump(data)
    print(f"Data successfully saved to {file_path}")

def generate_eval(
        qa,
        anthropic_client,
        eval_generation_prompt,
        system_prompt="You are an expert AI trainer specialized in generating evaluation datasets based on training data.",
        save_progress=True,
        *,
        model="claude-3-haiku-20240307",
        max_tokens=2048,
        progress_path='../../data/intermediate/eval_qa_intermediate.pkl'
    ):
    """
    Generate evaluation Q&A from training Q&A pairs.

    Parameters:
    - qa: str, Q&A pairs for a given concept.
    - anthropic_client: object, anthropic client.
    - eval_generation_prompt: str, the prompt to guide the agent for evaluation Q&A generation.
    - system_prompt: str, the system prompt defining the agent role.
    - save_progress: bool, boolean to save progress after every model response.
    - model: str, keyword-only, name of the model to query. The default is the
      cheapest model; for this task we can probably also use Llama models.
    - max_tokens: int, keyword-only, upper bound on the tokens generated per response.
    - progress_path: str, keyword-only, pickle file that receives the latest
      response when save_progress is True. The same file is rewritten on every call.

    Returns:
    - eval_qa: str, Q&A to be used during model evaluation. If the request
      fails or the response cannot be read, the error is printed and the
      integer -1 is returned instead.
    """
    # Construct the full prompt for the LLM
    input_text = f"{eval_generation_prompt}\n---\nTraining Q&A pairs: {qa}"

    try:
        response = anthropic_client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=system_prompt,
            messages=[
                {"role": "user", "content": input_text}
            ]
        )
        eval_qa = response.content[0].text
    except Exception as e:
        # Deliberately broad: this is a best-effort call used over many rows,
        # so one failed request must not abort the whole run.
        print(f"Error processing: {e}")
        return -1

    if save_progress:
        # Saving is kept outside the request handling so that a failed write
        # does not throw away a valid model response.
        try:
            save_to_pickle(eval_qa, progress_path)
        except (OSError, pickle.PickleError) as e:
            print(f"Error saving progress: {e}")

    return eval_qa

## Load data

In [7]:
# Read the pickled Q&A mapping from disk
with open('../../data/intermediate/qa.pkl', mode='rb') as qa_file:
    qa = pickle.load(qa_file)

# Turn the mapping into a one-column dataframe: the keys become the index
# (named 'concept_code') and the values fill the 'qa' column
qa = (
    pd.DataFrame
    .from_dict(qa, orient='index', columns=['qa'])
    .rename_axis('concept_code')
)

## Generate Evaluation Q&A

In [8]:
# Define Anthropic client
# NOTE(review): no arguments are passed, so credentials are presumably picked
# up from the environment (e.g. ANTHROPIC_API_KEY) — confirm before running.
anthropic_client = anthropic.Anthropic()

In [9]:
# Define Q&A generation prompt
# generate_eval places this text at the start of the user message and appends
# the training Q&A pairs after a "---" separator. The wording is sent to the
# model as-is, so any edit to it changes what the model is asked to produce.
eval_generation_prompt = '''Task Description: You will be provided Q&A pairs which are part of a training dataset for LLM finetuning. Generate 1 specific question and its corresponding answer that can be used for training evaluation of the Q&A pairs provided.

You MUST generate the Q&A pair in the following format:
Q1: [Question]
A1: [Answer that can be directly verified from the text]

Important guidelines:
- If there are multiple training question and answers, try to create a evaluation question and answer pair that combines them as much as possible
- Keep the question specific and unambiguous
- Avoid yes/no questions
- Remain faithful to the original content while varying verbal form or structure
'''

In [10]:
# Evaluation set size: 5% of the available entries, truncated to an integer
n_eval = int(0.05 * len(qa))
# Pool of candidate indices to draw the evaluation entries from
all_idx = [*qa.index]

In [11]:
# Subset training data
# Entries whose Q&A text contains any of these words are rejected, because
# such wording makes the entry unacceptable for evaluation. The check is a
# case-sensitive substring match.
excluded_words = ('context', 'text', 'information')

# Shuffling once and popping from the end draws indices uniformly without
# replacement (like random.choice followed by list.remove), but each draw is
# O(1) instead of O(n).
random.shuffle(all_idx)

eval_idx = []
# Stop as soon as enough entries were accepted or the candidate pool is empty.
# The second condition prevents an IndexError when fewer than n_eval entries
# pass the filter.
while all_idx and len(eval_idx) < n_eval:
    # Take the next random index and remove it from the candidate pool
    idx = all_idx.pop()
    qa_text = qa.loc[idx, 'qa']
    if not any(word in qa_text for word in excluded_words):
        # Accept the entry for the evaluation set
        eval_idx.append(idx)

if len(eval_idx) < n_eval:
    print(f"Only {len(eval_idx)} of {n_eval} requested evaluation entries could be selected")

In [12]:
# Keep only the sampled rows and relabel their column as the training Q&A
is_selected = qa.index.isin(eval_idx)
eval_qa = qa.loc[is_selected].rename(columns={'qa': 'training_qa'})

In [None]:
# Generate evaluation Q&A pairs
def _evaluation_qa_for(training_qa):
    # One model request per entry; intermediate saving is switched off here
    return generate_eval(
        training_qa,
        anthropic_client,
        eval_generation_prompt,
        save_progress=False,
    )

eval_qa['evaluation_qa'] = eval_qa['training_qa'].apply(_evaluation_qa_for)

In [None]:
# The training Q&A text is no longer needed; keep only the generated column
eval_qa = eval_qa.drop(columns=['training_qa'])

## Save Results

In [None]:
# Persist the evaluation dataframe for later use
save_to_pickle(data=eval_qa, file_path='../../data/intermediate/eval_qa.pkl')