## Step 2 - Setup system

In [1]:
from dotenv import load_dotenv
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from openai import OpenAI

# Load environment variables from a .env file
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not defined; the value is reused below
# for the embedding client, the OpenAI client and the raw HTTP download header.
OPENAI_API = os.getenv('OPENAI_API_KEY')
# Embedding function handed to the Chroma store; presumably it must match the model the
# persisted collection was built with -- TODO confirm.
embedding = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API)
# Vector store used by get_llm_answer_openai for similarity search when use_RAG is set.
db_openai = Chroma(persist_directory="./vectordb/openai_vectorDB/", embedding_function=embedding) #for existing database

In [2]:
# OpenAI client shared by every request in this notebook.
llm = OpenAI(api_key=OPENAI_API)
# Each execution of this cell requests a new container named "test".
cont = llm.containers.create(name="test")

# The container id is passed to the code_interpreter tool in later cells and is also
# used to build the file-download URL.
container_id_manual = cont.id
print(f"Container created with ID: {container_id_manual}")

Container created with ID: cntr_6848012dada4819180050a42566649ab0b9192992cccfdb1


In [None]:
from openai.types.responses.response_output_message import ResponseOutputMessage
import requests
import numpy as np

def extract_llm_response_code_interpreter(response: list) -> str:
    """Return the text of the last assistant message in a Responses API output list.

    Args:
        response: the ``output`` list of a response; it can mix reasoning items,
            tool calls and messages.

    Returns:
        The ``text`` of the first content part of the last ``ResponseOutputMessage``
        in the list, or an empty string when the list holds no such message.
    """
    # Scan from the end: the final message is the answer, earlier items are
    # reasoning steps or tool calls.
    for item in reversed(response):
        if isinstance(item, ResponseOutputMessage):
            return item.content[0].text
    return ""

In [None]:
def extract_llm_response_code_interpreter_image(response) -> "str | None":
    """Download the image cited by the last output item of a code-interpreter response.

    The first content part of the last output item is searched for an annotation of
    type ``container_file_citation``; the cited file is fetched from the container
    and written to the current working directory.

    Args:
        response: a Responses API result object exposing ``.output``.

    Returns:
        The local filename the image was saved as, or ``None`` when the response
        cites no file or the download fails.

    Raises:
        ValueError: if the cited file is not a .png, .jpg or .jpeg.
    """
    # Walk the response defensively: an empty output or content list would
    # otherwise raise IndexError before we can report "no image".
    output_items = getattr(response, "output", None) or []
    annotations = []
    if output_items:
        content_parts = getattr(output_items[-1], "content", None) or []
        if content_parts:
            annotations = getattr(content_parts[0], "annotations", None) or []

    # Find the annotation with type 'container_file_citation'
    image_annotation = next(
        (ann for ann in annotations if getattr(ann, "type", "") == "container_file_citation"),
        None,
    )
    if image_annotation is None:
        print("No image annotation found in the response.")
        return None

    file_id = image_annotation.file_id
    filename = image_annotation.filename
    print(f"Image file ID: {file_id}")
    print(f"Image filename: {filename}")

    # Check file extension
    if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
        raise ValueError("Unsupported file extension. Only .png and .jpg are supported.")
    # The name comes from the API response; keep only its final component so the
    # file always lands in the working directory.
    save_as = os.path.basename(filename)

    # Prefer the container recorded on the annotation itself; fall back to the
    # notebook-level container id only when the annotation does not carry one.
    container_id = getattr(image_annotation, "container_id", None) or container_id_manual

    # Download the file from the container
    url = f"https://api.openai.com/v1/containers/{container_id}/files/{file_id}/content"
    headers = {"Authorization": f"Bearer {OPENAI_API}"}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response_file = requests.get(url, headers=headers, timeout=60)
    if response_file.status_code != 200:
        print(f"Failed to download file: {response_file.status_code} - {response_file.text}")
        return None

    with open(save_as, "wb") as f:
        f.write(response_file.content)
    print(f"File downloaded and saved as {save_as}")
    return save_as

In [6]:
# NOTE(review): `response` is first assigned in the following cell (executed as In [4]),
# so on a fresh top-to-bottom run this line raises NameError. That cell already performs
# the same extraction.
image = extract_llm_response_code_interpreter_image(response)

Image file ID: cfile_6848013f5a188191861a4bc60e077181
Image filename: cfile_6848013f5a188191861a4bc60e077181.png
File downloaded and saved as cfile_6848013f5a188191861a4bc60e077181.png


In [4]:
# Smoke test of the code-interpreter tool: ask for a histogram, then pull both the text
# answer and the generated image out of the response.
try:
    response = llm.responses.create(
        model="o4-mini",
        tools=[{"type": "code_interpreter", "container": container_id_manual}],
        #instructions=custom_prompt, #disabled the prompt for now, we could include it here later and directly feed the exam question
        #but this approach might not work with the Streamlit app
        input="Create a histogram of the following data: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]",
    )
    # Extract the answer from the response
    print(f"Response received: {response}")
    answer = extract_llm_response_code_interpreter(response.output)
    image = extract_llm_response_code_interpreter_image(response)
    # An empty string means no output message was found; it is routed through the
    # except branch below so it is reported like any other failure.
    if answer == "":
        raise ValueError("Empty answer received from LLM.")
except Exception as e:
    # Any error is only printed; `response`, `answer` and `image` may be left unset or stale.
    print(f"Error processing question in container\nError: {e}")

Response received: Response(id='resp_68480134339481a2a3f3341476f4d81f018cf5e364e5d963', created_at=1749549364.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='o4-mini-2025-04-16', object='response', output=[ResponseReasoningItem(id='rs_6848013536e481a28a3689a6bf081eeb018cf5e364e5d963', summary=[], type='reasoning', encrypted_content=None, status=None), ResponseCodeInterpreterToolCall(id='ci_6848013a3bfc81a2bbf52f08238aaa2f018cf5e364e5d963', code="import matplotlib.pyplot as plt\n\n# Data\ndata = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]\n\n# Create histogram\nplt.figure(figsize=(6, 4))\nplt.hist(data, bins=[0.5, 1.5, 2.5, 3.5, 4.5], edgecolor='black')\nplt.xticks([1, 2, 3, 4])\nplt.xlabel('Value')\nplt.ylabel('Frequency')\nplt.title('Histogram of Given Data')\nplt.grid(axis='y', alpha=0.75)\nplt.show()", results=None, status='completed', type='code_interpreter_call', container_id='cntr_6848012dada4819180050a42566649ab0b9192992cccfdb1', outputs=None), ResponseOutputMe

In [8]:
# Inspect the content parts of the last output item (text plus its annotations).
response.output[-1].content

[ResponseOutputText(annotations=[AnnotationContainerFileCitation(container_id='cntr_6847f97337cc8191ac0f3788d0cd05750a5731a572001f38', end_index=0, file_id='cfile_6847f9833d8081918800fef81d1581a2', start_index=0, type='container_file_citation', filename='cfile_6847f9833d8081918800fef81d1581a2.png')], text="Here's the histogram of your data:\n\n- Value 1 appears once.\n- Value 2 appears twice.\n- Value 3 appears three times.\n- Value 4 appears four times.\n\nThe x-axis shows the data values and the y-axis shows their frequencies. Let me know if you’d like any changes or further analysis!", type='output_text', logprobs=None)]

Image file ID: cfile_6847f9833d8081918800fef81d1581a2
File downloaded and saved as cfile_6847f9833d8081918800fef81d1581a2.png


## Step 3 - Evaluate with system

In [7]:
from tqdm import tqdm

def get_llm_answer_openai(exam_question, prompt, openai_model, use_RAG=False, k=4, container_id=None) -> str:
    """Ask the OpenAI model one exam question, optionally with retrieved context.

    Args:
        exam_question: the question text, sent as the user message.
        prompt: the system prompt.
        openai_model: model name passed to the API.
        use_RAG: when true, the top-``k`` documents from ``db_openai`` are appended
            to the system prompt as context.
        k: number of documents to retrieve and also the retry budget. Every failed
            attempt retries with ``k - 1``, so with RAG each retry sends a smaller
            context.
        container_id: when given, the request goes through the Responses API with
            the code_interpreter tool bound to this container; otherwise the chat
            completions endpoint is used.

    Returns:
        The model's answer, or the marker string ``"Error retrieving"`` once the
        retry budget is exhausted.
    """
    # Without RAG, k only serves as the retry budget, so always allow one attempt.
    # The budget is enforced on both paths; otherwise a persistent API failure
    # would retry without end.
    attempts_left = k if use_RAG else max(k, 1)

    while attempts_left > 0:
        # Rebuild the system prompt from the caller's prompt on every attempt so
        # retries do not pile up several context blocks.
        system_prompt = prompt
        if use_RAG:
            # Retrieve relevant context from the vector database
            retrieved_docs = db_openai.similarity_search(exam_question, k=attempts_left)
            context = "\n\n".join(doc.page_content for doc in retrieved_docs)
            rag_prompt = (
                "Use the following pieces of retrieved context to answer the question. "
                "\n\nContext:\n" + context
            )
            system_prompt = prompt + rag_prompt

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": exam_question},
        ]

        try:
            if container_id is not None:
                response = llm.responses.create(
                    model=openai_model,
                    tools=[{"type": "code_interpreter", "container": container_id}],
                    input=messages
                )
                # Extract the answer from the response
                answer = extract_llm_response_code_interpreter(response.output)
                if answer == "":
                    # Treat a response without a message as a failure to retry.
                    raise ValueError("Empty answer received from LLM.")
            else:
                response = llm.chat.completions.create(
                    model=openai_model,
                    messages=messages,
                )
                answer = response.choices[0].message.content
            return answer
        except Exception as e:
            if container_id is not None:
                print(f"Error processing question in container: {exam_question}\nError: {e}")
            else:
                print(f"Error processing question: {exam_question}\nError: {e}")
            attempts_left -= 1

    return "Error retrieving"


def get_llm_answers_openai(exam_questions_TF, prompt, openai_model, use_RAG=False, container_id=None) -> list[str]:
    """Answer every question in ``exam_questions_TF`` and return the answers in order.

    Each question is sent through ``get_llm_answer_openai`` with the same prompt,
    model, RAG switch and container; progress is shown with a tqdm bar.
    """
    return [
        get_llm_answer_openai(item, prompt, openai_model, use_RAG=use_RAG, container_id=container_id)
        for item in tqdm(exam_questions_TF)
    ]

In [8]:
def extract_answer_llm(llm_answers: list[str]) -> list[str]:
    """Reduce free-text LLM answers to "True", "False" or "CONFLICT".

    Matching is case-insensitive. When an answer mentions both words, a bold verdict
    (``\\textbf{...}``) or a verdict within the first five characters decides; if
    both verdicts are bold, or neither rule applies, the answer is a conflict.
    Answers mentioning neither word are conflicts too. Every conflict is printed
    with the answer and its position in ``llm_answers``.

    Args:
        llm_answers: raw model answers; a ``None`` entry is treated as an empty answer.

    Returns:
        One label per input answer, in the same order.
    """
    def _report_conflict(message: str, answer: str, index: int) -> None:
        # Print one conflict report in the notebook's usual layout.
        print(message)
        print("Answer:", answer)
        print("Index:", index)
        print(40* "-")

    cleaned_answers = []
    # enumerate gives the true position; list.index would report the first
    # occurrence whenever two answers are identical.
    for index, org_answer in enumerate(llm_answers):
        answer = (org_answer or "").lower()
        bold_true = "\\textbf{true}" in answer
        bold_false = "\\textbf{false}" in answer

        if "true" in answer and "false" in answer:
            # Check if one is bold that overrules the other
            # For example, an answer is bold, and there is "not true" in the answer too
            if bold_false and bold_true:
                _report_conflict("Warning: Both True and False found in the answer, CONFLICT.", answer, index)
                cleaned_answers.append("CONFLICT")
            # Give priority to bold answer or final conclusion in the beginning of the answer
            elif bold_false or "false" in answer[:5]:
                cleaned_answers.append("False")
            elif bold_true or "true" in answer[:5]:
                cleaned_answers.append("True")
            else:
                _report_conflict("Warning: unchecked case.", answer, index)
                cleaned_answers.append("CONFLICT")
        elif "true" in answer:
            # The bold forms contain the plain word, so no separate bold check is needed.
            cleaned_answers.append("True")
        elif "false" in answer:
            cleaned_answers.append("False")
        else:
            _report_conflict("Warning: Neither True nor False found in the answer, CONFLICT.", answer, index)
            cleaned_answers.append("CONFLICT")
    return cleaned_answers

In [9]:
def extract_answers_Martijn(true_answers: list[str]) -> list[str]:
    """Extract the bold True/False verdict from each reference answer.

    Args:
        true_answers: reference answers that should contain ``\\textbf{True}`` or
            ``\\textbf{False}`` (matched case-insensitively).

    Returns:
        One label per reference answer, in order: "True", "False", or the
        placeholder "CONFLICT MARTIJN" when no bold verdict is found. The placeholder
        keeps the list aligned with the questions and can never match an LLM label,
        so an unparseable reference counts as a mismatch instead of shifting every
        later answer.
    """
    cleaned_answers = []
    # enumerate reports the true position even when two answers are identical.
    for index, org_answer in enumerate(true_answers):
        answer = org_answer.lower()
        if "\\textbf{false}" in answer:
            cleaned_answers.append("False")
        elif "\\textbf{true}" in answer:
            cleaned_answers.append("True")
        else:
            print("Warning: Neither True nor False found in the Martijn's answer, CONFLICT.")
            print("Answer:", answer)
            print("Index:", index)
            print("CONFLICT MARTIJN")
            cleaned_answers.append("CONFLICT MARTIJN")
    return cleaned_answers

In [10]:
def compute_accuracy(Martijn_answers: list[str], llm_answers: list[str]) -> float:
    """Return the fraction of LLM labels that match the reference labels.

    Labels are compared position by position, ignoring case.

    Args:
        Martijn_answers: reference labels.
        llm_answers: LLM labels, in the same order.

    Returns:
        Share of matching positions in [0, 1]; 0.0 when both lists are empty.

    Raises:
        ValueError: if the lists differ in length. zip would silently drop the
            surplus entries and score misaligned pairs.
    """
    if len(Martijn_answers) != len(llm_answers):
        raise ValueError(
            f"Answer lists differ in length: {len(Martijn_answers)} reference vs {len(llm_answers)} LLM answers."
        )
    total = len(Martijn_answers)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(
        1
        for reference, predicted in zip(Martijn_answers, llm_answers)
        if reference.lower() == predicted.lower()
    )
    return correct / total

In [11]:
import pandas as pd

def convert_dataframe(questions: list[str], llm_answers: list[str], Martijn_answers: list[str]) -> pd.DataFrame:
    """Combine questions, LLM answers and reference answers into one table.

    Returns:
        A DataFrame with the columns 'Question', 'LLM Answer' and 'Martijn Answer',
        one row per question.
    """
    columns = {}
    columns['Question'] = questions
    columns['LLM Answer'] = llm_answers
    columns['Martijn Answer'] = Martijn_answers
    return pd.DataFrame(columns)

### Step 3.1 - Baseline LLM

In [12]:
# Model and system prompts for the baseline runs of this section.
# NOTE(review): in both prompts the adjacent literals ending in "dollar signs." and starting
# with "For example" are concatenated without a space, and "Your are given" is a typo. The
# text is left as-is here: it is sent to the model verbatim, so changing it presumably
# changes the experiment behind the recorded accuracies -- fix it in every section at once.
openai_model = "o4-mini" #o4-mini-2025-04-16
# System prompt for the True/False statements.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. " 
    "You can write and run code to answer the question. "
)
# System prompt for the construction questions.
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can write and run code to answer the question. "
)

True/False questions

In [None]:
# Quick check on the first question only before starting the full run.
get_llm_answers_openai([exam_questions_TF[0]], custom_prompt, openai_model, container_id=container_id_manual) #single step

In [14]:
# Full baseline run: every True/False question, code interpreter enabled, no retrieval.
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model, container_id=container_id_manual)

100%|██████████| 84/84 [09:28<00:00,  6.77s/it]


In [15]:
# Reduce both answer sets to True/False labels before scoring.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# compute_accuracy pairs the lists positionally, so they must have the same length.
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 92.86%


In [16]:
# Store the raw (uncleaned) answers next to the questions and reference answers.
df_baseline_o4mini_CODE_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
# Create the output folder if needed so the save cannot fail on a fresh checkout.
os.makedirs("results/o4-mini-code", exist_ok=True)
df_baseline_o4mini_CODE_TF.to_pickle("results/o4-mini-code/o4mini_TF_CODE_3.pkl")
#temp = pd.read_pickle("results/o4-mini-code/o4mini_TF_CODE_3.pkl")
#df_baseline_o4mini_CODE_TF.to_csv("results/o4-mini-code/o4mini_TF_CODE_3.csv", sep="\t", index=False)

Construction questions

In [17]:
# Disabled: flip the condition to run the construction questions.
if False:
    # Send the construction questions to the model; the reference answers are only
    # stored alongside for comparison.
    llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model, container_id=container_id_manual)

    df_baseline_o4mini_construction = convert_dataframe(
        exam_questions_construction,
        llm_answers_construction,
        exam_answers_construction
    )
    df_baseline_o4mini_construction.to_pickle("results/o4-mini-code/o4mini_CODE_Construction.pkl")
    #temp = pd.read_pickle("results/o4-mini-code/o4mini_CODE_Construction.pkl")
    #df_baseline_o4mini_construction.to_csv("results/o4-mini-code/o4mini_CODE_Construction.csv", sep="\t", index=False)

### Step 3.2 - Baseline LLM + RAG

In [18]:
# Model and system prompts for the RAG runs; the text is identical to the Step 3.1
# definitions, the retrieved context is appended inside get_llm_answer_openai.
# NOTE(review): "dollar signs." and "For example" are concatenated without a space and
# "Your are given" is a typo; left unchanged because the text is sent to the model verbatim.
openai_model = "o4-mini" #o4-mini-2025-04-16
# System prompt for the True/False statements.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. " 
    "You can write and run code to answer the question. "
)
# System prompt for the construction questions.
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can write and run code to answer the question. "
)

True/False questions

In [19]:
# Full run with retrieval: use_RAG=True makes each question fetch context from db_openai.
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model, use_RAG=True, container_id=container_id_manual)

100%|██████████| 84/84 [10:34<00:00,  7.56s/it]


In [20]:
# "Error retrieving" is the marker get_llm_answer_openai returns once its retries are
# used up; check that no question ended that way.
error_present = any("Error retrieving" in answer for answer in llm_answers_TF)
print(f'Contains "Error retrieving": {error_present}')

Contains "Error retrieving": False


In [21]:
# Reduce both answer sets to True/False labels before scoring.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# Same guard as in the baseline evaluation: the lists are compared positionally,
# so a length mismatch would silently score misaligned pairs.
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 96.43%


In [22]:
# Store the raw (uncleaned) answers next to the questions and reference answers.
df_baseline_o4mini_RAG_CODE_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
# Create the output folder if needed so the save cannot fail on a fresh checkout.
os.makedirs("results/o4-mini-code", exist_ok=True)
df_baseline_o4mini_RAG_CODE_TF.to_pickle("results/o4-mini-code/o4mini_RAG_CODE_TF_3.pkl")
#temp = pd.read_pickle("results/o4-mini-code/o4mini_RAG_CODE_TF_3.pkl")
#df_baseline_o4mini_RAG_CODE_TF.to_csv("results/o4-mini-code/o4mini_RAG_CODE_TF_3.csv", sep="\t", index=False)

Construction questions

In [23]:
# Disabled: flip the condition to run the construction questions with retrieval.
if False:
    # Send the construction questions to the model; the reference answers are only
    # stored alongside for comparison.
    llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model, use_RAG=True, container_id=container_id_manual)

    df_baseline_o4mini_RAG_construction = convert_dataframe(
        exam_questions_construction,
        llm_answers_construction,
        exam_answers_construction
    )
    df_baseline_o4mini_RAG_construction.to_pickle("results/o4-mini-code/o4mini_RAG_CODE_Construction.pkl")
    #temp = pd.read_pickle("results/o4-mini-code/o4mini_RAG_CODE_Construction.pkl")
    #df_baseline_o4mini_RAG_construction.to_csv("results/o4-mini-code/o4mini_RAG_CODE_Construction.csv", sep="\t", index=False)

### Step 4.1 - Baseline LLM + all-in-prompt

In [24]:
import json

def load_json(filename: str) -> dict:
    """
    Load the JSON file.

    Args:
        filename (str): name of the file to load

    Returns:
        dict: json file content as a dictionary
    """
    # Read as UTF-8 explicitly: the default encoding depends on the platform and
    # can fail or garble non-ASCII text.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Only the 'Topics' entry of topics.json is used below.
topics = load_json('topics.json')['Topics']

In [25]:
# Gather every entry tagged as a theorem, across all topics and their sections.
theorems = []
for topic in topics.values():
    for section, items in topic.items():
        # Keep only dict entries whose metadata marks them as a theorem.
        theorems.extend(
            item
            for item in items
            if isinstance(item, dict)
            and "metadata" in item
            and item["metadata"].get("type") == "theorem"
        )
print(f"There are {len(theorems)} theorems.")
theorems_text = [entry['text'] for entry in theorems] #only get the text of the theorems

There are 65 theorems.


In [26]:
# Model and system prompts for the all-in-prompt runs: the same instructions as before,
# with the text of every collected theorem appended to the end of each prompt.
# NOTE(review): "dollar signs." and "For example" are concatenated without a space and
# "Your are given" is a typo; left unchanged because the text is sent to the model verbatim.
openai_model = "o4-mini" #o4-mini-2025-04-16
# System prompt for the True/False statements.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can write and run code to answer the question. "
    "You can use the following theorems to answer the question. " + "\n\n".join(theorems_text)
)
# System prompt for the construction questions.
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can write and run code to answer the question. "
    "You can use the following theorems to answer the question. " + "\n\n".join(theorems_text)
)

True/False questions

In [27]:
# Full run with the theorem-augmented prompt; retrieval stays off (use_RAG keeps its default).
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model, container_id=container_id_manual)

100%|██████████| 84/84 [11:56<00:00,  8.53s/it]


In [28]:
# Reduce both answer sets to True/False labels before scoring.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# Same guard as in the baseline evaluation: the lists are compared positionally,
# so a length mismatch would silently score misaligned pairs.
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 94.05%


In [29]:
# Store the raw (uncleaned) answers next to the questions and reference answers.
df_baseline_o4mini_PROMPT_CODE_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
# Create the output folder if needed so the save cannot fail on a fresh checkout.
os.makedirs("results/o4-mini-code", exist_ok=True)
df_baseline_o4mini_PROMPT_CODE_TF.to_pickle("results/o4-mini-code/o4mini_PROMPT_CODE_TF_3.pkl")
#temp = pd.read_pickle("results/o4-mini-code/o4mini_PROMPT_CODE_TF_3.pkl")
#df_baseline_o4mini_PROMPT_CODE_TF.to_csv("results/o4-mini-code/o4mini_PROMPT_CODE_TF_3.csv", sep="\t", index=False)

Construction questions

In [30]:
# Disabled: flip the condition to run the construction questions with the theorem prompt.
if False:
    # Send the construction questions to the model; the reference answers are only
    # stored alongside for comparison.
    llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model, container_id=container_id_manual)

    df_baseline_o4mini_PROMPT_construction = convert_dataframe(
        exam_questions_construction,
        llm_answers_construction,
        exam_answers_construction
    )
    df_baseline_o4mini_PROMPT_construction.to_pickle("results/o4-mini-code/o4mini_PROMPT_CODE_Construction.pkl")
    #temp = pd.read_pickle("results/o4-mini-code/o4mini_PROMPT_CODE_Construction.pkl")
    #df_baseline_o4mini_PROMPT_construction.to_csv("results/o4-mini-code/o4mini_PROMPT_CODE_Construction.csv", sep="\t", index=False)