# Generate Q&A pairs from knowledge graph encodings

This notebook shows how to load summarized knowledge graph text encodings and generate Q&A pairs from them for LLM fine-tuning.

The generated Q&A pairs should be carefully evaluated for accuracy before they are used.

## Import Packages

In [1]:
import pickle
import anthropic

## Define Functions

In [3]:
def save_to_pickle(data, file_path):
    """
    Serialize an object to disk with pickle and print a confirmation.

    An existing file at the destination is overwritten, since the file is
    opened in 'wb' mode.

    Parameters:
    - data: The object to pickle.
    - file_path: Destination of the pickle file (e.g., '*.pkl').
    """
    with open(file_path, mode='wb') as pickle_file:
        pickle.dump(data, pickle_file)

    # Only reached when the dump above finished without raising.
    confirmation = f"Data successfully saved to {file_path}"
    print(confirmation)

def generate_qa(
        summarized_texts, 
        anthropic_client, 
        qa_generation_prompt,
        system_prompt="You are a medical oncology journal editor.",
        save_progress=True,
        model="claude-3-haiku-20240307",
        max_tokens=2048,
        progress_path='../../data/intermediate/qa_intermediate.pkl'
    ):
    """
    Generate Q&A from summarized GLAM encodings.

    One request is sent per node. A node whose request (or response parsing)
    fails is recorded as None and processing continues with the next node.

    Parameters:
    - summarized_texts: dict, mapping each node to its GLAM text encoding summarized.
    - anthropic_client: object, anthropic client (must expose `messages.create`).
    - qa_generation_prompt: str, the prompt to guide the agent for Q&A generation.
    - system_prompt: str, the system prompt defining the agent role.
    - save_progress: bool, boolean to save progress after every successful model response.
    - model: str, model identifier passed to the API. The default was chosen by the
      original author as the cheapest option; other models may work for this task too.
    - max_tokens: int, maximum number of tokens to generate per response.
    - progress_path: str, pickle file that receives the intermediate results
      when save_progress is True.

    Returns:
    - qa: dict, mapping each node to its Q&A generated (None for failed nodes).
    """
    qa = {}

    for node, text in summarized_texts.items():
        print(f"Processing node: {node}...")
        # Construct the full prompt for the LLM
        input_text = f"{qa_generation_prompt}\n---\nContext: {text}"

        try:
            response = anthropic_client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": input_text}
                ]
            )
            qa[node] = response.content[0].text
        except Exception as e:
            # Best-effort batch job: one failing node must not abort the run.
            print(f"Error processing node {node}: {e}")
            qa[node] = None  # Handle errors gracefully
            continue

        if save_progress:
            # Checkpointing is kept outside the API try-block on purpose: a failed
            # save (e.g. missing directory) must not discard the answer that was
            # just received for this node.
            try:
                save_to_pickle(qa, progress_path)
            except Exception as e:
                print(f"Warning: could not save progress after node {node}: {e}")

    return qa

## Load data

In [2]:
# Load the summarized texts that generate_qa iterates over
# (per its docstring: a dict mapping each node to its summarized text encoding).
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
with open('../../data/intermediate/summarized_texts.pkl','rb') as f:
    summarized_texts = pickle.load(f)

## Generate Q&A

In [6]:
# Define the Anthropic client that generate_qa uses for its messages.create calls.
# No api_key is passed here, so the SDK is presumably expected to read it from the
# environment (ANTHROPIC_API_KEY) — confirm for your setup.
anthropic_client = anthropic.Anthropic()

In [5]:
# Define Q&A generation prompt
# generate_qa places this text in front of each node's context, separated by
# "\n---\nContext: ", so it must read as complete, standalone instructions.
qa_generation_prompt = '''Task Description: The provided context contains information about oncology related drugs, procedures, etc. Your task is to thoroughly analyze the text and generate comprehensive Q&A pairs that capture ALL the key information provided. Consider the following aspects, but create questions for ANY important information present in the text:

- Drug classification and characteristics
- Mechanism of action and targets
- Regulatory approvals and timelines
- Clinical indications and approved uses
- Administration details
- Historical significance and development

Format Requirements:
Q1: [Comprehensive question that allows for detailed answer]
A1: [Complete answer synthesizing all relevant information from the text]

Important guidelines:
- Extract ALL relevant information from the text - don't miss any key details
- Create questions that allow for comprehensive answers rather than single-fact responses
- Combine related information into coherent Q&A pairs
- Answers should synthesize information while staying true to the source text
- If multiple related pieces of information exist, combine them into one Q&A pair'''

In [None]:
# Generate Q&A for every node in summarized_texts (one model request per node).
# system_prompt and save_progress keep their defaults, so intermediate results are
# pickled to '../../data/intermediate/qa_intermediate.pkl' as responses arrive.
# Nodes whose processing raised an error are mapped to None in the result.
qa = generate_qa(summarized_texts,anthropic_client,qa_generation_prompt)

## Save Results

In [None]:
# Persist the final node -> Q&A mapping returned by generate_qa.
save_to_pickle(qa,'../../data/intermediate/qa.pkl')