# Baseline

This is demo code for Baseline.

Input files include text, images, CSV tables and the MC questions. The output is the MC answer.

Input files include the response from model 1 (chatGPT_response.txt) and the MC questions. The output is the MC answer.

## Package installation

In [None]:
%pip install python-docx
%pip install anthropic
# !pip install mistralai
%pip install tiktoken

In [None]:
import os
import fnmatch
import pandas as pd
from google.colab import drive

from openai import OpenAI
# import google.generativeai as genai
import anthropic
# from mistralai import Mistral
from google import genai
from google.genai import types

import csv
import json
import docx
import openpyxl
import base64
import chardet
from typing import Union, List, Dict
import tiktoken
import time
import numpy as np


## File Reading Functions

In [None]:
def file_to_text(file_path: str) -> str:
    """
    Turn a supported file into a single text string suitable for LLM input.

    The reader is chosen from the file extension, compared case-insensitively.
    Handled extensions: .docx, .xlsx, .csv, .json, .rmd, .md, .py and .txt
    (.md files go through the same raw-text reader as .rmd files).

    :param file_path: The path to the file to be converted.
    :return: A single string containing the file's textual contents.
    :raises ValueError: If the extension is not one of the handled ones.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Pick the reader first, then call it once at the end.
    if suffix == ".docx":
        reader = _docx_to_text
    elif suffix == ".xlsx":
        reader = _xlsx_to_text
    elif suffix == ".csv":
        reader = _csv_to_text
    elif suffix == ".json":
        reader = _json_to_text
    elif suffix in (".rmd", ".md"):
        reader = _rmarkdown_to_text
    elif suffix == ".py":
        reader = _python_to_text
    elif suffix == ".txt":
        reader = _txt_to_text
    else:
        raise ValueError(f"Unsupported file extension: {suffix}")

    return reader(file_path)


def _docx_to_text(file_path: str) -> str:
    """
    Extract the paragraph text of a DOCX file, one paragraph per line.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with
    newlines. Requires 'python-docx' (pip install python-docx).
    """
    if docx is None:
        raise ImportError("Missing dependency 'python-docx'. Install via `pip install python-docx`.")

    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def _xlsx_to_text(file_path: str) -> str:
    """
    Dump every sheet of an XLSX workbook as tab-separated text.

    Each sheet starts with a ``--- Sheet: <name> ---`` line followed by one
    line per row; empty cells become empty strings. The workbook is opened
    with ``data_only=True``. Requires 'openpyxl' (pip install openpyxl).
    """
    if openpyxl is None:
        raise ImportError("Missing dependency 'openpyxl'. Install via `pip install openpyxl`.")

    workbook = openpyxl.load_workbook(file_path, data_only=True)
    lines = []
    for name in workbook.sheetnames:
        worksheet = workbook[name]
        lines.append(f"--- Sheet: {name} ---")
        lines.extend(
            "\t".join("" if value is None else str(value) for value in row)
            for row in worksheet.iter_rows(values_only=True)
        )

    return "\n".join(lines)


def _csv_to_text(file_path: str) -> str:
    """
    Read a CSV file and return it as tab-separated text, one line per record.

    The file is decoded as UTF-8. A leading byte-order mark (BOM), if present,
    is dropped so that it does not end up glued to the first cell of the
    text handed to the model.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but skips an optional leading BOM.
    with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
        return "\n".join("\t".join(row) for row in csv.reader(csvfile))


def _json_to_text(file_path: str) -> str:
    """
    Read a JSON file and return it re-serialised as pretty-printed JSON.

    The output uses a two-space indent and keeps non-ASCII characters as they
    are. The file is decoded as UTF-8; a leading byte-order mark is skipped,
    because ``json.load`` rejects text that starts with a BOM.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but skips an optional leading BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    return json.dumps(data, indent=2, ensure_ascii=False)


def _rmarkdown_to_text(file_path: str) -> str:
    """
    Return the raw contents of an R Markdown file.

    The file is treated as plain text: it is decoded as UTF-8 and returned
    unchanged, with no parsing of the embedded code chunks.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def _txt_to_text(file_path: str) -> str:
    """
    Read a text file and return its content as a string.

    The encoding is guessed with ``chardet`` from the raw bytes. If no
    encoding is reported, UTF-8 is used, so the result does not depend on the
    platform's default encoding.

    :raises RuntimeError: If the file cannot be read or decoded; the original
        exception is attached as ``__cause__``.
    """
    try:
        # Read the raw bytes once so the detector sees the whole file.
        with open(file_path, 'rb') as file:
            raw_data = file.read()

        # The reported encoding may be None (nothing to detect); in that case
        # fall back to UTF-8 explicitly instead of the locale default.
        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

        # Re-open in text mode so newline handling matches a normal text read.
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except Exception as e:
        raise RuntimeError(f"Error reading file {file_path}: {e}") from e

def _python_to_text(file_path: str) -> str:
    """
    Return the source of a Python (.py) file as one string.

    The file is decoded as UTF-8 and returned unchanged.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def encode_image(image_path: str) -> tuple[str, str]:
    """
    Load an image from disk and return ``(base64_text, mime_type)``.

    The MIME type is derived from the lower-cased file extension; .jpg, .jpeg,
    .png, .gif and .webp are recognised.

    :raises FileNotFoundError: If ``image_path`` does not exist.
    :raises ValueError: If the extension is not one of the recognised ones.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # The file is read and encoded before the extension is looked at.
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    suffix = os.path.splitext(image_path)[1].lower()
    known_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    if suffix not in known_types:
        raise ValueError(f"Unsupported image format: {suffix}")

    return encoded, known_types[suffix]


## Model functions

In [None]:
def generate_MC_answer_openai(api_key: str, dir: str, rfile_path: str, images_folder_path: str, dfile_path: str, mc_txt: str, system_prompt: str = None, model: str = "gpt-4o") -> tuple[str, str]:
    """
    Ask an OpenAI chat model a multiple-choice question about one sample's material.

    The user message is assembled from the report file, any data files and any
    plot images, followed by the question. After the model answers, a
    follow-up turn in the same conversation asks whether the supplied material
    contained the information needed to answer.

    Args:
        api_key (str): Your OpenAI API key.
        dir (str): Folder that the three path arguments below are relative to.
        rfile_path (str): Report file name inside ``dir``; skipped when NaN/None.
        images_folder_path (str): Folder inside ``dir`` whose
            ``figure-markdown_strict`` sub-folder holds the images; skipped when NaN/None.
        dfile_path (str): One or more data file names inside ``dir``, separated
            by newlines; skipped when NaN/None. Missing files are reported and skipped.
        mc_txt (str): The multiple-choice question text.
        system_prompt (str, optional): The system-level instruction for the AI.
            Defaults to a bioinformatics multiple-choice prompt.
        model (str, optional): The OpenAI model to use. Defaults to "gpt-4o".

    Returns:
        tuple[str, str]: ``(mcq_answer, yes_no_answer)``. Both are ``"error"``
        if either API call raises.
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)

    # Use a default system prompt if none is provided.
    # NOTE(review): the adjacent literals are joined without separating spaces;
    # left byte-identical so results stay comparable with earlier runs.
    if system_prompt is None:
        system_prompt = (
            "You are an expert scientist who has a strong background in both bioinformatics and biology."
            "I have provided you with a .txt file containing the outputs from my analysis, and in some cases, additional supporting materials such as plots."
            "These files are the core analytics you will need to understand to read the following multiple choice question."
            "When answering the multiple choice question, please select one option only."
            "Please be concise without including any additional information. For example if you think option A is the correct answer, then please just output A."
        )

    # The user message is a list of typed content parts. Passing the list itself
    # (not its string repr) is what makes the API treat images as images.
    content = [
        {
            "type": "text",
            "text": "Please read the following .txt file and any supporting materials and use the information to answer the multiple choice question below.\n Please provide only the letter of the correct option (A, B, C, D, or E). Do not include the answer text, explanations, or any other information.",
        }
    ]

    if pd.notna(rfile_path):
        report_txt = file_to_text(os.path.join(dir, rfile_path))
        content.append({"type": "text", "text": "Txt file:\n" + report_txt})

    if pd.notna(dfile_path):
        # One file name per line; blank lines are dropped because an empty name
        # would resolve to the folder itself and make file_to_text raise.
        file_names = [name.strip() for name in dfile_path.split("\n") if name.strip()]
        if file_names:
            header = "CSV files:\n" if len(file_names) > 1 else "CSV file:\n"
            content.append({"type": "text", "text": header})
        for file_name in file_names:
            full_path = os.path.join(dir, file_name)
            if os.path.exists(full_path):  # Ensure the file exists before reading
                content.append({"type": "text", "text": file_to_text(full_path) + "\n"})
            else:
                print(f"File not found: {full_path}")

    if pd.notna(images_folder_path):
        image_dir = os.path.join(dir, images_folder_path, "figure-markdown_strict")
        # Sorted so the prompt is the same on every run; the extension match is
        # case-insensitive, like encode_image's own extension lookup.
        image_names = sorted(
            name for name in os.listdir(image_dir)
            if name.lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp"))
        )
        for image_name in image_names:
            image_b64, mime_type = encode_image(os.path.join(image_dir, image_name))
            content.append({"type": "text", "text": "Image file:\n"})
            content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{image_b64}",
                    },
                }
            )

    content.append({"type": "text", "text": "Multiple choice question: \n" + mc_txt})

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]

    # Generate the completion, then ask the follow-up in the same conversation.
    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model=model,
        )
        mcq_answer = chat_completion.choices[0].message.content

        messages.append({"role": "assistant", "content": mcq_answer})
        messages.append({"role": "user", "content": "Please indicate whether the information needed to answer the multiple-choice question can be found in the document. Respond with 'Yes' or 'No' only. Do not provide explanations or additional details."})
        response_yes_no = client.chat.completions.create(
            model=model,
            messages=messages,  # Now includes Q1, A1, and Q2
        )
        yes_no_answer = response_yes_no.choices[0].message.content
        return mcq_answer, yes_no_answer
    except Exception as e:
        # Best effort: one failed request must not abort a long batch run.
        print(f"Exception occurred: {e}")
        return "error", "error"



In [None]:
def generate_MC_answer_google_gemini(api_key: str, dir: str, rfile_path: str, images_folder_path: str, dfile_path: str, mc_txt: str, system_prompt: str = None, model: str = "gemini-2.0-flash-exp") -> tuple[str, str]:
    """
    Ask a Google Gemini model a multiple-choice question about one sample's material.

    The prompt is assembled from the report file, any data files and any plot
    images, followed by the question. Both the question and a follow-up
    ("does the report contain the information?") are sent through one chat
    session, so the follow-up sees the first answer.

    Args:
        api_key (str): Your Google Generative AI API key.
        dir (str): Folder that the three path arguments below are relative to.
        rfile_path (str): Report file name inside ``dir``; skipped when NaN/None.
        images_folder_path (str): Folder inside ``dir`` whose
            ``figure-markdown_strict`` sub-folder holds the images; skipped when NaN/None.
        dfile_path (str): One or more data file names inside ``dir``, separated
            by newlines; skipped when NaN/None. Missing files are reported and skipped.
        mc_txt (str): The multiple-choice question text.
        system_prompt (str, optional): The system-level instruction for the AI.
            Defaults to a bioinformatics multiple-choice prompt.
        model (str, optional): The Gemini model to use. Defaults to "gemini-2.0-flash-exp".

    Returns:
        tuple[str, str]: ``(mcq_answer, yes_no_answer)``, both stripped. Both
        are ``"error"`` if either API call raises.
    """
    # Configure the API key
    client = genai.Client(api_key=api_key)

    # Use a default system prompt if none is provided.
    # NOTE(review): the adjacent literals are joined without separating spaces;
    # left byte-identical so results stay comparable with earlier runs.
    if system_prompt is None:
        system_prompt = (
            "You are an expert scientist who has a strong background in both bioinformatics and biology."
            "I have provided you with a .txt file containing the outputs from my analysis, and in some cases, additional supporting materials such as plots."
            "These files are the core analytics you will need to understand to read the following multiple choice question."
            "When answering the multiple choice question, please select one option only."
            "Please be concise without including any additional information. For example if you think option A is the correct answer, then please just output A."
        )

    # The message is a list mixing plain strings and image parts.
    user_prompt = [
        "Please read the following .txt file and any supporting materials and use the information to answer the multiple choice question below.\n Please provide only the letter of the correct option (A, B, C, D, or E). Do not include the answer text, explanations, or any other information."
    ]

    if pd.notna(rfile_path):
        report_txt = file_to_text(os.path.join(dir, rfile_path))
        user_prompt.append("Txt file:\n" + report_txt)

    if pd.notna(dfile_path):
        # One file name per line; blank lines are dropped because an empty name
        # would resolve to the folder itself and make file_to_text raise.
        file_names = [name.strip() for name in dfile_path.split("\n") if name.strip()]
        if file_names:
            user_prompt.append("CSV files:\n" if len(file_names) > 1 else "CSV file:\n")
        for file_name in file_names:
            full_path = os.path.join(dir, file_name)
            if os.path.exists(full_path):  # Ensure the file exists before reading
                user_prompt.append(file_to_text(full_path) + "\n")
            else:
                print(f"File not found: {full_path}")

    if pd.notna(images_folder_path):
        image_dir = os.path.join(dir, images_folder_path, "figure-markdown_strict")
        # Sorted so the prompt is the same on every run; the extension match is
        # case-insensitive, like encode_image's own extension lookup.
        image_names = sorted(
            name for name in os.listdir(image_dir)
            if name.lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp"))
        )
        for image_name in image_names:
            image_b64, mime_type = encode_image(os.path.join(image_dir, image_name))
            user_prompt.append("Image file:\n")
            # encode_image returns base64 *text*; Part.from_bytes wants the raw
            # image bytes, so decode before handing the image over.
            user_prompt.append(
                types.Part.from_bytes(data=base64.b64decode(image_b64), mime_type=mime_type)
            )

    user_prompt.append("Multiple choice question: \n" + mc_txt)

    try:
        chat = client.chats.create(
            model=model,
            config=types.GenerateContentConfig(
                response_mime_type="text/plain",
                system_instruction=system_prompt,
            ),
        )
        response_mcq = chat.send_message(user_prompt)

        response_yes_no = chat.send_message("Does the report contain the information necessary to answer the multiple-choice question? Please provide only 'Yes' or 'No'. Do not include explanations or additional details.")

        return response_mcq.text.strip(), response_yes_no.text.strip()

    except Exception as e:
        # Best effort: one failed request must not abort a long batch run.
        print(f"Exception occurred: {e}")
        return "error", "error"




In [None]:
def generate_MC_answer_claude(api_key: str, dir: str, rfile_path: str, images_folder_path: str, dfile_path: str, mc_txt: str, system_prompt: str = None, model: str = "claude-3-5-sonnet-20241022") -> tuple[str, str]:
    """
    Ask a Claude model a multiple-choice question about one sample's material.

    The user message is assembled from the report file, any data files and any
    plot images, followed by the question. After the model answers, a
    follow-up turn in the same conversation asks whether the report contained
    the information needed to answer.

    Args:
        api_key (str): Your Claude API key.
        dir (str): Folder that the three path arguments below are relative to.
        rfile_path (str): Report file name inside ``dir``; skipped when NaN/None.
        images_folder_path (str): Folder inside ``dir`` whose
            ``figure-markdown_strict`` sub-folder holds the images; skipped when NaN/None.
        dfile_path (str): One or more data file names inside ``dir``, separated
            by newlines; skipped when NaN/None. Missing files are reported and skipped.
        mc_txt (str): The multiple-choice question text.
        system_prompt (str, optional): The system-level instruction for the AI.
            Defaults to a bioinformatics multiple-choice prompt.
        model (str, optional): The Claude model to use. Defaults to "claude-3-5-sonnet-20241022".

    Returns:
        tuple[str, str]: ``(mcq_answer, yes_no_answer)``, both stripped. Both
        are ``"error"`` if either API call raises.
    """
    # Initialize the Claude API client
    client = anthropic.Anthropic(
        api_key=api_key
    )

    # Use a default system prompt if none is provided.
    # NOTE(review): the adjacent literals are joined without separating spaces;
    # left byte-identical so results stay comparable with earlier runs.
    if system_prompt is None:
        system_prompt = (
            "You are an expert scientist who has a strong background in both bioinformatics and biology."
            "I have provided you with a .txt file containing the outputs from my analysis, and in some cases, additional supporting materials such as plots."
            "These files are the core analytics you will need to understand to read the following multiple choice question."
            "When answering the multiple choice question, please select one option only."
            "Please be concise without including any additional information. For example if you think option A is the correct answer, then please just output A."
        )

    # The user message is a list of content blocks; every entry must be a dict
    # with a "type". The list itself is passed to the API (not its string repr),
    # which is what makes images arrive as images.
    user_prompt = [
        {
            "type": "text",
            "text": "Please read the following .txt file and any supporting materials and use the information to answer the multiple choice question below.\n Please provide only the letter of the correct option (A, B, C, D, or E). Do not include the answer text, explanations, or any other information.",
        }
    ]

    if pd.notna(rfile_path):
        report_txt = file_to_text(os.path.join(dir, rfile_path))
        user_prompt.append({"type": "text", "text": "Txt file:\n" + report_txt})

    if pd.notna(dfile_path):
        # One file name per line; blank lines are dropped because an empty name
        # would resolve to the folder itself and make file_to_text raise.
        file_names = [name.strip() for name in dfile_path.split("\n") if name.strip()]
        if file_names:
            header = "CSV files:\n" if len(file_names) > 1 else "CSV file:\n"
            user_prompt.append({"type": "text", "text": header})
        for file_name in file_names:
            full_path = os.path.join(dir, file_name)
            if os.path.exists(full_path):  # Ensure the file exists before reading
                user_prompt.append({"type": "text", "text": file_to_text(full_path) + "\n"})
            else:
                print(f"File not found: {full_path}")

    if pd.notna(images_folder_path):
        image_dir = os.path.join(dir, images_folder_path, "figure-markdown_strict")
        # Sorted so the prompt is the same on every run; the extension match is
        # case-insensitive, like encode_image's own extension lookup.
        image_names = sorted(
            name for name in os.listdir(image_dir)
            if name.lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp"))
        )
        for image_name in image_names:
            image_b64, mime_type = encode_image(os.path.join(image_dir, image_name))
            user_prompt.append({"type": "text", "text": "Image file:\n"})
            user_prompt.append(
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": mime_type,  # e.g., "image/jpeg"
                        "data": image_b64,
                    },
                }
            )

    user_prompt.append({"type": "text", "text": "Multiple choice question: \n" + mc_txt})

    messages = [
        {"role": "user", "content": user_prompt},
    ]

    # Call the Claude API, then ask the follow-up in the same conversation.
    try:
        response_mcq = client.messages.create(
            model=model,
            max_tokens=8192,
            system=system_prompt,
            messages=messages
        )

        mcq_answer = response_mcq.content[0].text.strip()

        # Append the response to maintain conversation history
        messages.append({"role": "assistant", "content": mcq_answer})

        messages.append({"role": "user", "content":"Does the report contain the information necessary to answer the multiple-choice question? Please provide only 'Yes' or 'No'. Do not include explanations or additional details."})

        response_yes_no = client.messages.create(
            model=model,
            max_tokens=1024,
            system=system_prompt,
            messages=messages  # Now includes Q1, A1, and Q2
        )
        yes_no_answer = response_yes_no.content[0].text.strip()

        return mcq_answer, yes_no_answer

    except Exception as e:
        # Best effort: one failed request must not abort a long batch run.
        print(f"Exception occurred: {e}")
        return "error", "error"



## Mount google drive

In [None]:
# Mount Google Drive at /content/drive (uses google.colab's `drive` helper, so
# this cell is Colab-specific). The paths used later in the notebook point into it.
drive.mount('/content/drive')

Mounted at /content/drive


## Read Prompt Sheet

In [None]:
## Don't change this url
# CSV export of the case sheet; its rows carry columns such as Authors,
# Google Folder, CaseStudy_ID and Sample_ID (see the printed preview).
url = 'https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXX/export?format=csv&gid=XXXXXXXXXXXXXXXXXXX'
case_df = pd.read_csv(url)
print(case_df.head(9))
# `df` is a second name for the same DataFrame object (not a copy).
df=case_df

     Authors          Google Folder              CaseStudy_ID  \
0  EXAMPLE 1  Bioinformatics_method  Dataset_method_increment   
1  EXAMPLE 2  Bioinformatics_method  Dataset_method_increment   
2  EXAMPLE 3  Bioinformatics_method  Dataset_method_increment   
3         DK                Pathway          Kidney_pathway_1   
4         DK                     DE               Kidney_DE_1   
5         DK                Pathway          Kidney_pathway_2   
6         YC                    CCI               Covid_CCI_1   
7         YC                    CCI               Covid_CCI_1   
8         YC                    CCI               Covid_CCI_1   

                   Sample_ID LLM task updated?    Input type  \
0  Dataset_method_incrementA                No         Graph   
1   Dataset_method_increment                No          Data   
2  Dataset_method_incrementB                No   Code + Data   
3          Kidney_pathway_1B                No  Code + Graph   
4                Kidney_DE_1 

In [None]:
## Don't change this url
# CSV export of the multiple-choice question sheet.
url = 'https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXX/export?format=csv&gid=XXXXXXXXXXXXXXXXXXX'
mc_df = pd.read_csv(url)
# print(mc_df.head(25))
# Quick look at the first question: its text, folder and case-study id.
print(mc_df['Specific Question'][0])
print(mc_df['Google Folder'][0])
print(mc_df['CaseStudy_ID'][0])

How many significant pathways are there after performing GSEA for both Biological Processes and Molecular Functions?
A. 3
B. 0
C. 5
D. 1
E. None of the above
Pathway
Kidney_pathway_1


### Parameters

In [None]:
# API keys for the three providers. The values below look like placeholders —
# fill in real keys before running, and do not share the notebook with them in.
OPENAI_api_key= "sk-"
GEMINI_api_key= "AI"
CLAUDE_api_key="sk-"
## root_dir is the only path that you need to modify by yourself.
## You may find the shared Proj-LLM-Bioinfo-Interpretation2024 folder in the /content/drive/MyDrive,
## so the dir path can be /content/drive/MyDrive/Proj-LLM-Bioinfo-Interpretation2024/Rmd_word_document/
root_dir = "/content/drive/MyDrive/Usyd/Proj-LLM-Bioinfo-Interpretation2024/Rmd_word_document/"
# NOTE(review): `pattern` is not referenced by any code visible in this
# notebook — confirm before removing.
pattern = "MC_*.docx"

In [None]:
#### test
# Sanity check: show the case-study folder path built from row 1 of the case sheet.
os.path.join(root_dir, df['Google Folder'][1], df['CaseStudy_ID'][1])

'/content/drive/MyDrive/Usyd/Proj-LLM-Bioinfo-Interpretation2024/Rmd_word_document/Bioinformatics_method/Dataset_method_increment'

### Multiple Choice questions


In [None]:
columns_to_check = ['TXT_input_ID', 'Graphics_input_Folder', 'Data_input_ID']
# index=np.where(mc_df["Authors"] == "YC")[0]

# Case-sheet columns copied verbatim into every result row ("Input type" is left out).
selected_columns = ["Authors","Google Folder", "Data", "Task Category"]

# model name -> (label used in the progress printout, generator function, API key).
# The model name is used both for the API call and as the "Model_name" value in
# the results, so the model that is called and the one recorded cannot drift apart.
model_runners = {
    "gpt-4o": ("GPT4o", generate_MC_answer_openai, OPENAI_api_key),
    "gemini-2.0-flash": ("Gemini", generate_MC_answer_google_gemini, GEMINI_api_key),
    "claude-3-7-sonnet-20250219": ("Claude", generate_MC_answer_claude, CLAUDE_api_key),
}

results = []
for idx in range(len(mc_df)):  # slice the range, e.g. [20:30], to run a subset
    # Questions without a folder or case-study id cannot be located on disk.
    if pd.isna(mc_df['Google Folder'][idx]) or pd.isna(mc_df['CaseStudy_ID'][idx]):
        continue

    target_folder = mc_df['Google Folder'][idx].strip()
    target_casestudy = mc_df['CaseStudy_ID'][idx]
    subfolder = os.path.join(root_dir, target_folder, target_casestudy)
    print(subfolder)
    if not os.path.exists(subfolder):
        continue

    mc_txt = mc_df['Specific Question'][idx]
    # f-strings are used for all labels so a non-string id or answer (NaN, None)
    # is printed instead of raising TypeError in the middle of a long run.
    question_label = f"{target_casestudy}_{mc_df['Question_ID'][idx]}"
    print("Index: ",idx)
    print(question_label)
    print(mc_txt)

    # Ask the same question once for every sample listed for this case study.
    filtered_df = case_df[case_df['CaseStudy_ID'] == target_casestudy]
    for i, (_, row_entry) in enumerate(filtered_df.iterrows()):
        print(i)
        sample_id = row_entry["Sample_ID"]
        print(sample_id)
        time.sleep(30)  # pause before each sample (presumably for API rate limits — confirm)

        # Query every model with identical inputs.
        responses = {}
        for model_name, (label, generate, api_key) in model_runners.items():
            responses[model_name] = generate(
                api_key=api_key, dir=subfolder,
                rfile_path=row_entry["TXT_input_ID"],
                images_folder_path=row_entry["Graphics_input_Folder"],
                dfile_path=row_entry["Data_input_ID"],
                mc_txt=mc_txt, model=model_name,
            )

        # Progress printout, one line per model, after all models have answered.
        for model_name, (label, generate, api_key) in model_runners.items():
            mcq_answer, info_check = responses[model_name]
            print(f"Sample ID: {sample_id}, Question ID: {question_label}, {label} answer:{mcq_answer}, {label} info check:{info_check}\n")

        # One result row per model.
        for model_name, (mcq_answer, info_check) in responses.items():
            new_row = {
                "CaseStudy_ID": mc_df['CaseStudy_ID'][idx],
                "Question_ID":  mc_df['Question_ID'][idx],
                "Answer": mc_df['Answer'][idx],
                "Sample_ID": sample_id,
                "Model_name": model_name,
                "Model_return": mcq_answer,
                "Info_check": info_check,
            }
            new_row.update(row_entry[selected_columns].to_dict())
            results.append(new_row)


result_df = pd.DataFrame(results)


In [None]:
# (rows, columns) of the collected results.
result_df.shape

(48, 11)

In [None]:
# Preview the first result rows.
result_df.head()

Unnamed: 0,CaseStudy_ID,Question_ID,Answer,Sample_ID,Model_name,Model_return,Info_check,Authors,Google Folder,Data,Task Category
0,Kidney_pathway_1,2,B,Kidney_pathway_1B,claude-3-7-sonnet-20250219,B,Yes,DK,Pathway,Kidney,Descriptive - information retrival
1,Kidney_pathway_1,3,A,Kidney_pathway_1B,claude-3-7-sonnet-20250219,A,Yes,DK,Pathway,Kidney,Descriptive - information retrival
2,Covid_CCI_6,1,C,Covid_CCI_6A,claude-3-7-sonnet-20250219,C,Yes,YC,CCI,COVID,Descriptive - information retrival
3,Covid_CCI_6,1,C,Covid_CCI_6C,claude-3-7-sonnet-20250219,C,Yes,YC,CCI,COVID,Descriptive - information retrival
4,Covid_CCI_6,2,D,Covid_CCI_6A,claude-3-7-sonnet-20250219,B,Yes,YC,CCI,COVID,Descriptive - information retrival


In [None]:
output_path=os.path.join("/content/drive/MyDrive/Usyd/Proj-LLM-Bioinfo-Interpretation2024/MCQ_output","APRIL15_MCQ_result_strategy2_cleaned_2.csv")
print(output_path)
# to_csv does not create missing folders; make sure the target folder exists so
# the results of a long run are not lost at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Treat an existing but empty file like a new one, so it still gets a header row.
file_exists = os.path.isfile(output_path) and os.path.getsize(output_path) > 0
# Normalise answers: trim whitespace and drop one trailing period ("B." -> "B").
result_df["Model_return"] = result_df["Model_return"].str.strip().str.replace(r"\.$", "", regex=True)
# Save data: Append if file exists, otherwise create a new one.
# NOTE: re-running this cell appends the same rows again.
result_df.to_csv(output_path, mode='a', index=False, header=not file_exists)

/content/drive/MyDrive/Usyd/Proj-LLM-Bioinfo-Interpretation2024/MCQ_output/APRIL15_MCQ_result_strategy2_cleaned_2.csv
