# Intrinsic evaluation - Mock exams
Dr. ir. M. Boussé provided mock exams, which are used here to evaluate the system.

## Step 1 - Load data
Load the `.tex` exam files and extract each question together with its answer.

In [1]:
import re

def extract_question_answer(tex_content):
    """Extract question/answer pairs from the LaTeX source of an exam.

    Only the first ``enumerate`` environment is inspected, and it is taken to
    end at the first ``\\end{enumerate}`` that follows it. Inside it, every
    ``\\item ... \\begin{solutionorbox}[opt] ... \\end{solutionorbox}`` sequence
    yields one pair: the text between ``\\item`` and the solution box is the
    question, the content of the solution box is the answer.

    Question and answer are matched together in a single pass, so an
    ``\\item`` that appears inside a solution box (e.g. an ``itemize`` list in
    the answer) is not mistaken for the start of the next question, and a
    solution box without the optional ``[...]`` argument is still paired with
    its question.

    Args:
        tex_content (str): full LaTeX source of the exam.

    Returns:
        tuple[list[str], list[str]]: the stripped questions and the stripped
        answers, in document order; both lists always have the same length.
        Two empty lists are returned when no ``enumerate`` environment exists.
    """
    # Extract content within the enumerate environment
    enum_match = re.search(r'\\begin{enumerate}(.*?)\\end{enumerate}', tex_content, re.DOTALL)
    if not enum_match:
        return [], []
    enum_content = enum_match.group(1)

    # One match per exam item: group 1 is the question, group 2 the answer.
    # The optional argument of solutionorbox (e.g. a box height) is skipped.
    pair_pattern = re.compile(
        r'\\item(.*?)'
        r'\\begin{solutionorbox}(?:\[[^\]]*\])?\s*'
        r'(.*?)\\end{solutionorbox}',
        re.DOTALL,
    )

    questions = []
    answers = []
    for pair in pair_pattern.finditer(enum_content):
        questions.append(pair.group(1).strip())
        answers.append(pair.group(2).strip())
    return questions, answers

In [None]:
# Parse one exam file, print every question/answer pair and collect all of
# them in the True/False lists.
exam_questions_TF, exam_answers_TF = [], []

with open('exams/together.tex', 'r', encoding='utf-8') as file:
    tex_content = file.read()

# Parsing and printing only need the text that was read.
questions, answers = extract_question_answer(tex_content)
print(f"Total Questions: {len(questions)}")
print(f"Total Answers: {len(answers)}")
if len(questions) != len(answers):
    print("Warning: The number of questions and answers do not match!")

print()
for idx, (q, a) in enumerate(zip(questions, answers), 1):
    print(f"Question {idx}:\n{q}\n")
    print(f"Answer {idx}:\n{a}\n")
    exam_questions_TF.append(q)
    exam_answers_TF.append(a)

Total Questions: 8
Total Answers: 8

Question 1:
% new 2 \ linear independence
	If $\{\textbf{u},\textbf{v}\}$ is linearly independent and $\{\textbf{v},\textbf{w}\}$ is linearly independent, then so is $\{\textbf{u},\textbf{v},\textbf{w}\}$.
	
	\begin{itemize}
		\item[$\square$] True
		\item[$\square$] False
	\end{itemize}

Answer 1:
\textbf{False}.
		
		Counter-example: The set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 0 \\ 1\end{bmatrix}\right\}$ is linearly dependent and the set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 1 \\ 1\end{bmatrix}\right\}$ is linearly dependent, but the set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 0 \\ 1\end{bmatrix},\begin{bmatrix} 1 \\ 1\end{bmatrix}\right\}$ is \textbf{not} linearly dependent.

Question 2:
% variation of lecture
	Let $\textbf{A}$ and $\textbf{B}$ be two orthogonal matrices, then the product $\textbf{A}\textbf{B}$ is also \textbf{always} orthogonal.
	
	\begin{itemize}
		\item[$\squar

In [2]:
# All questions/answers over every exam file.
exam_questions, exam_answers = [], []

# True/False questions with their reference answers.
exam_questions_TF, exam_answers_TF = [], []

# Construction questions with their reference answers.
exam_questions_construction, exam_answers_construction = [], []

exam_files = [
    'exams/together.tex',
]
for exam_file in exam_files:
    with open(exam_file, 'r', encoding='utf-8') as file:
        tex_content = file.read()

    questions, answers = extract_question_answer(tex_content)
    print(f"Processing file: {exam_file}")
    print(f"Total Questions: {len(questions)}")
    print(f"Total Answers: {len(answers)}")
    if len(questions) != len(answers):
        print("Warning: The number of questions and answers do not match!")
    exam_questions.extend(questions)
    exam_answers.extend(answers)

    print()
    for idx, (q, a) in enumerate(zip(questions, answers), 1):
        print(f"Question {idx}:\n{q}\n")
        print(f"Answer {idx}:\n{a}\n")
        print("-" * 40)
        # A question counts as a construction exercise when its text contains
        # the literal word "construction"; everything else is True/False.
        if "construction" not in q:
            exam_questions_TF.append(q)
            exam_answers_TF.append(a)
        else:
            exam_questions_construction.append(q)
            exam_answers_construction.append(a)
    print("=" * 80)

Processing file: exams/together.tex
Total Questions: 8
Total Answers: 8

Question 1:
% new 2 \ linear independence
	If $\{\textbf{u},\textbf{v}\}$ is linearly independent and $\{\textbf{v},\textbf{w}\}$ is linearly independent, then so is $\{\textbf{u},\textbf{v},\textbf{w}\}$.
	
	\begin{itemize}
		\item[$\square$] True
		\item[$\square$] False
	\end{itemize}

Answer 1:
\textbf{False}.
		
		Counter-example: The set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 0 \\ 1\end{bmatrix}\right\}$ is linearly dependent and the set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 1 \\ 1\end{bmatrix}\right\}$ is linearly dependent, but the set $\left\{\begin{bmatrix} 1 \\ 0\end{bmatrix},\begin{bmatrix} 0 \\ 1\end{bmatrix},\begin{bmatrix} 1 \\ 1\end{bmatrix}\right\}$ is \textbf{not} linearly dependent.

----------------------------------------
Question 2:
% variation of lecture
	Let $\textbf{A}$ and $\textbf{B}$ be two orthogonal matrices, then the product $\textbf{A}\text

In [5]:
# Print total counts
print(f"Total Questions across all exams: {len(exam_questions)}")
print(f"Total Answers across all exams: {len(exam_answers)}")

# Sanity check: every extracted question needs a matching extracted answer.
assert len(exam_questions) == len(exam_answers), "Mismatch between total questions and answers across all exams."
# Disabled checks against manually counted totals
# (108 questions/answers; 6 construction exercises per exam file).
#assert len(exam_questions) == 108, "Manually calculated number of questions/answers does not match."

#assert len(exam_answers_construction) == 6*len(exam_files), "Manually calculated number of construction exercises does not match."

Total Questions across all exams: 8
Total Answers across all exams: 8


## Step 2 - Setup system

In [4]:
from dotenv import load_dotenv
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from openai import OpenAI

# Load environment variables from a .env file
load_dotenv()
# API key read from the environment; os.getenv returns None when it is not set.
OPENAI_API = os.getenv('OPENAI_API_KEY')
# Embedding model handed to the vector store below.
embedding = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API)
# Vector store used for retrieval in get_llm_answer_openai (use_RAG=True).
db_openai = Chroma(persist_directory="./vectordb/openai_vectorDB/", embedding_function=embedding) #for existing database
# OpenAI client used for the chat-completion requests.
llm = OpenAI(api_key=OPENAI_API)

## Step 3 - Evaluate with system

In [5]:
from tqdm import tqdm

def get_llm_answer_openai(exam_question, prompt, openai_model, use_RAG=False, k=4) -> str:
    """Ask the OpenAI chat model a single exam question.

    The system message is ``prompt``; with ``use_RAG`` the ``k`` most similar
    documents from ``db_openai`` are appended to it as context. When the
    request raises, it is retried with one retrieved document less each time.
    Every attempt rebuilds the system message from the original ``prompt``,
    so the context of a failed attempt is not carried into the retry.

    Args:
        exam_question (str): the question, sent as the user message.
        prompt (str): base system prompt (never modified).
        openai_model (str): model name passed to the chat-completions API.
        use_RAG (bool): whether to append retrieved context to the prompt.
        k (int): number of documents to retrieve on the first attempt; also
            bounds the number of attempts (at least one without RAG).

    Returns:
        str: the model's answer, or the sentinel ``"Error retrieving"`` when
        all attempts failed (or when ``use_RAG`` is set and ``k`` is not
        positive).
    """
    # Without retrieval k only limits the retries, so allow at least one try;
    # previously a persistent error without RAG recursed without bound.
    first_k = k if use_RAG else max(k, 1)

    for current_k in range(first_k, 0, -1):
        system_prompt = prompt
        if use_RAG:
            # Retrieve relevant context from the vector database
            retrieved_docs = db_openai.similarity_search(exam_question, k=current_k)
            context = "\n\n".join(doc.page_content for doc in retrieved_docs)
            rag_prompt = (
                "Use the following pieces of retrieved context to answer the question. "
                "\n\nContext:\n" + context
            )
            system_prompt = prompt + rag_prompt

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": exam_question},
        ]

        try:
            response = llm.chat.completions.create(
                model=openai_model,
                messages=messages,
            )
            return response.choices[0].message.content
        except Exception as e:
            # Best-effort evaluation: report the failure and retry with less context.
            print(f"Error processing question: {exam_question}\nError: {e}")

    return "Error retrieving"


def get_llm_answers_openai(exam_questions_TF, prompt, openai_model, use_RAG=False) -> list[str]:
    """Answer a list of questions one by one, showing a progress bar.

    Each question is forwarded to ``get_llm_answer_openai`` together with
    ``prompt``, ``openai_model`` and ``use_RAG``; the answers are returned in
    the order of the questions.
    """
    return [
        get_llm_answer_openai(single_question, prompt, openai_model, use_RAG=use_RAG)
        for single_question in tqdm(exam_questions_TF)
    ]

In [6]:
# Disabled cell: because of the `if False:` guard nothing below is executed.
# It contains a batch variant of get_llm_answer_openai (it takes the whole
# question list and returns a list of answers) - presumably an earlier draft.
# NOTE(review): if the guard were removed, this definition would replace the
# single-question function of the same name defined in the previous cell.
if False:
    from tqdm import tqdm

    def get_llm_answer_openai(exam_questions_TF, prompt, openai_model, use_RAG=False, k=4) -> list[str]:
        # Returns None (not a list) when k is 0.
        if k == 0:
            return 
        llm_answers_TF = []

        for question in tqdm(exam_questions_TF):
            if use_RAG == True:
                # Retrieve relevant context from the vector database
                retrieved_docs = db_openai.similarity_search(question, k=k)
                context = "\n\n".join(doc.page_content for doc in retrieved_docs)
                rag_prompt = (
                    "Use the following pieces of retrieved context to answer the question. "
                    "\n\nContext:\n" + context
                )
                # `prompt` is reassigned inside the loop, so the context of
                # earlier questions stays in the prompt of later questions.
                prompt = prompt + rag_prompt

            messages = []
            messages.append({"role": "system", "content": prompt})
            messages.append({"role": "user", "content": question})
            
            try:
                response = llm.chat.completions.create(
                    model=openai_model,
                    messages=messages,
                )
                answer = response.choices[0].message.content
            except Exception as e:
                print(f"Error processing question: {question}\nError: {e}")
                # Single retry; with RAG it retrieves a fixed 3 documents and
                # appends them to the already extended prompt.
                try:
                    if use_RAG == True:
                        # Retrieve relevant context from the vector database
                        retrieved_docs = db_openai.similarity_search(question, k=3)
                        context = "\n\n".join(doc.page_content for doc in retrieved_docs)
                        rag_prompt = (
                            "Use the following pieces of retrieved context to answer the question. "
                            "\n\nContext:\n" + context
                        )
                        prompt = prompt + rag_prompt

                    messages = []
                    messages.append({"role": "system", "content": prompt})
                    messages.append({"role": "user", "content": question})
                    response = llm.chat.completions.create(
                        model=openai_model,
                        messages=messages,
                    )
                    answer = response.choices[0].message.content
                except Exception as e:
                    print(f"Retry failed for question: {question}\nError: {e}")
                    # Placeholder stored when the retry fails as well.
                    answer = "Error retrieving answer"
            llm_answers_TF.append(answer)
        return llm_answers_TF

In [7]:
def extract_answer_llm(llm_answers: list[str]) -> list[str]:
    """Reduce free-text LLM answers to "True", "False" or "CONFLICT".

    The decision is made on the lower-cased answer with substring checks:

    * only "true" or only "false" occurs -> that verdict;
    * both occur -> a bold verdict (``\\textbf{...}``) or a verdict within the
      first five characters decides, "false" being checked first; if both
      verdicts are bold, or neither rule applies, the result is "CONFLICT";
    * neither occurs -> "CONFLICT".

    Every "CONFLICT" is reported on stdout together with the answer and its
    position in ``llm_answers``.

    Args:
        llm_answers (list[str]): raw answers of the model.

    Returns:
        list[str]: one verdict per input answer, in the same order.
    """
    cleaned_answers = []
    # enumerate gives the real position; list.index() would report the first
    # occurrence for answers that appear more than once.
    for idx, org_answer in enumerate(llm_answers):
        answer = org_answer.lower()
        has_true = "true" in answer
        has_false = "false" in answer
        bold_true = "\\textbf{true}" in answer
        bold_false = "\\textbf{false}" in answer

        warning = None
        if has_true and has_false:
            # Check if one is bold that overrules the other
            # For example, an answer is bold, and there is "not true" in the answer too
            if bold_false and bold_true:
                warning = "Warning: Both True and False found in the answer, CONFLICT."
                verdict = "CONFLICT"
            # Give priority to bold answer or final conclusion in the beginning of the answer
            elif bold_false or "false" in answer[:5]:
                verdict = "False"
            elif bold_true or "true" in answer[:5]:
                verdict = "True"
            else:
                warning = "Warning: unchecked case."
                verdict = "CONFLICT"
        elif has_true:
            verdict = "True"
        elif has_false:
            verdict = "False"
        else:
            warning = "Warning: Neither True nor False found in the answer, CONFLICT."
            verdict = "CONFLICT"

        if warning is not None:
            print(warning)
            print("Answer:", answer)
            print("Index:", idx)
            print(40* "-")
        cleaned_answers.append(verdict)
    return cleaned_answers

In [8]:
def extract_answers_Martijn(true_answers: list[str]) -> list[str]:
    """Reduce the reference solutions to "True" or "False".

    A solution counts as "False" when its lower-cased text contains
    ``\\textbf{false}`` and as "True" when it contains ``\\textbf{true}``
    ("false" is checked first). A solution with neither marker is reported on
    stdout and represented by the placeholder "CONFLICT MARTIJN", so that the
    result stays aligned, position by position, with the input list. The
    placeholder differs from every verdict produced for LLM answers and is
    therefore never counted as a match.

    Args:
        true_answers (list[str]): reference solutions as LaTeX text.

    Returns:
        list[str]: one entry per input solution, in the same order.
    """
    cleaned_answers = []
    for idx, org_answer in enumerate(true_answers):
        answer = org_answer.lower()
        if "\\textbf{false}" in answer:
            cleaned_answers.append("False")
        elif "\\textbf{true}" in answer:
            cleaned_answers.append("True")
        else:
            print("Warning: Neither True nor False found in the Martijn's answer, CONFLICT.")
            print("Answer:", answer)
            print("Index:", idx)
            print("CONFLICT MARTIJN")
            # Keep the position occupied; dropping the entry would shift all
            # following reference answers against the LLM answers.
            cleaned_answers.append("CONFLICT MARTIJN")
    return cleaned_answers

In [9]:
def compute_accuracy(Martijn_answers: list[str], llm_answers: list[str]) -> float:
    """Fraction of positions where reference and LLM verdict agree.

    The comparison is case-insensitive and position by position.

    Args:
        Martijn_answers (list[str]): reference verdicts.
        llm_answers (list[str]): verdicts extracted from the LLM answers.

    Returns:
        float: number of matches divided by the number of answers; ``0.0``
        when both lists are empty.

    Raises:
        ValueError: if the lists differ in length, because a pairwise
            comparison would silently ignore the surplus entries.
    """
    if len(Martijn_answers) != len(llm_answers):
        raise ValueError(
            f"Cannot compare {len(Martijn_answers)} reference answers with {len(llm_answers)} LLM answers."
        )
    total = len(Martijn_answers)
    if total == 0:
        # Nothing to score; avoids a division by zero.
        return 0.0
    correct = sum(
        Martijn_answer.lower() == llm_answer.lower()
        for Martijn_answer, llm_answer in zip(Martijn_answers, llm_answers)
    )
    return correct / total

In [10]:
import pandas as pd

def convert_dataframe(questions: list[str], llm_answers: list[str], Martijn_answers: list[str]) -> pd.DataFrame:
    """Put questions, LLM answers and reference answers side by side.

    Returns a DataFrame with the columns 'Question', 'LLM Answer' and
    'Martijn Answer', filled from the three lists in that order.
    """
    column_names = ('Question', 'LLM Answer', 'Martijn Answer')
    column_values = (questions, llm_answers, Martijn_answers)
    return pd.DataFrame(dict(zip(column_names, column_values)))

### Step 3.1 - Baseline LLM

In [None]:
# Model and system prompts for the baseline run (no retrieval, no extra material).
openai_model = "gpt-3.5-turbo" #gpt-3.5-turbo-0125
# System prompt for True/False statements.
# NOTE(review): the text contains "Your are" and no space between
# "dollar signs." and "For example"; it is left verbatim because editing the
# prompt changes what the model receives.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
)
# System prompt for construction exercises (same wording issues as above).
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
)

True/False questions

In [12]:
# One request per True/False question; use_RAG keeps its default (False), so no context is retrieved.
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model)

100%|██████████| 84/84 [03:04<00:00,  2.20s/it]


In [13]:
# Reduce both the model answers and the reference solutions to bare verdicts.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# compute_accuracy pairs the two lists by position, so they must be equally long.
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 46.43%


In [None]:
# Store the questions, the unprocessed model answers and the reference
# solutions (not the extracted verdicts) as a pickled DataFrame.
df_baseline_GPT3_5_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
df_baseline_GPT3_5_TF.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_TF_3.pkl")
# Disabled alternatives: read a stored pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_TF.pkl")
#df_baseline_GPT3_5_TF.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_TF.csv", sep="\t", index=False)

Construction questions

In [17]:
# The model must be asked the construction *questions*; the reference answers
# (exam_answers_construction) are only stored next to the model output.
llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model)

100%|██████████| 24/24 [00:42<00:00,  1.75s/it]


In [None]:
# Store the construction questions, the model answers and the reference
# solutions as a pickled DataFrame.
df_baseline_GPT3_5_construction = convert_dataframe(
    exam_questions_construction,
    llm_answers_construction,
    exam_answers_construction
)
df_baseline_GPT3_5_construction.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_Construction.pkl")
# Disabled alternatives: read the pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_Construction.pkl")
#df_baseline_GPT3_5_construction.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_Construction.csv", sep="\t", index=False)

### Step 3.2 - Baseline LLM + RAG

In [None]:
# Model and base system prompts for the RAG run; the retrieved context is
# appended to these prompts inside get_llm_answer_openai when use_RAG=True.
openai_model = "gpt-3.5-turbo" #gpt-3.5-turbo-0125
# System prompt for True/False statements.
# NOTE(review): the text contains "Your are" and no space between
# "dollar signs." and "For example"; it is left verbatim because editing the
# prompt changes what the model receives.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
)
# System prompt for construction exercises (same wording issues as above).
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
)

True/False questions

In [12]:
# One request per True/False question, with retrieved context appended to the prompt.
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model, use_RAG=True)

100%|██████████| 84/84 [03:33<00:00,  2.55s/it]


In [13]:
# "Error retrieving" is the placeholder get_llm_answer_openai returns once its
# retries are used up (k reached 0); check that no answer carries it.
error_present = any("Error retrieving" in answer for answer in llm_answers_TF)
print(f'Contains "Error retrieving": {error_present}')

Contains "Error retrieving": False


In [14]:
# Reduce both the model answers and the reference solutions to bare verdicts.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# The accuracy pairs the two lists by position, so they must be equally long
# (same guard as in the baseline evaluation).
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 44.05%


In [None]:
# Store the questions, the unprocessed model answers of the RAG run and the
# reference solutions as a pickled DataFrame.
df_baseline_GPT3_5_RAG_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
df_baseline_GPT3_5_RAG_TF.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_TF_3.pkl")
# Disabled alternatives: read a stored pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_TF.pkl")
#df_baseline_GPT3_5_RAG_TF.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_TF.csv", sep="\t", index=False)

Construction questions

In [24]:
# The model must be asked the construction *questions*; the reference answers
# (exam_answers_construction) are only stored next to the model output.
llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model, use_RAG=True)

100%|██████████| 24/24 [00:57<00:00,  2.39s/it]


In [None]:
# Store the construction questions, the model answers of the RAG run and the
# reference solutions as a pickled DataFrame.
df_baseline_GPT3_5_RAG_construction = convert_dataframe(
    exam_questions_construction,
    llm_answers_construction,
    exam_answers_construction
)
df_baseline_GPT3_5_RAG_construction.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_Construction.pkl")
# Disabled alternatives: read the pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_Construction.pkl")
#df_baseline_GPT3_5_RAG_construction.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_RAG_Construction.csv", sep="\t", index=False)

### Step 3.3 - Baseline LLM + all-in-prompt

In [11]:
import json

def load_json(filename: str) -> dict:
    """
    Load the JSON file.

    The file is read as UTF-8, independent of the platform's default
    encoding, matching how the exam files are opened in this notebook.

    Args:
        filename (str): name of the file to load

    Returns:
        dict: json file content as a dictionary
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Only the content stored under the 'Topics' key of topics.json is kept.
topics = load_json('topics.json')['Topics']

In [12]:
# Walk topics -> sections -> items and keep every dict item whose metadata
# declares the type "theorem".
theorems = [
    item
    for topic in topics.values()
    for items in topic.values()
    for item in items
    if isinstance(item, dict) and "metadata" in item and item["metadata"].get("type") == "theorem"
]
print(f"There are {len(theorems)} theorems.")

# Keep only the text of each theorem.
theorems_text = []
for theorem in theorems:
    theorems_text.append(theorem['text'])

There are 65 theorems.


In [13]:
# Model and system prompts for the all-in-prompt run: the texts in
# theorems_text are joined with blank lines and appended to each prompt.
openai_model = "gpt-3.5-turbo" #gpt-3.5-turbo-0125
# System prompt for True/False statements.
# NOTE(review): the text contains "Your are" and no space between
# "dollar signs." and "For example"; it is left verbatim because editing the
# prompt changes what the model receives.
custom_prompt = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a True/False statement. You must include 'True', 'False' or 'I don't know' in your answer. "
    "If the statement is 'False', a counter-example is sufficient. "
    "If the statement is 'True', you briefly outline a proof and/or mention relevant theorems. "
    "If you are not sure, you say 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can use the following theorems to answer the question. " + "\n\n".join(theorems_text)
)
# System prompt for construction exercises (same wording issues as above).
custom_prompt_construction = (
    "You are an assistant for question-answering tasks in linear algebra. "
    "Your are given a question to construct. "
    "If you do not know the answer, respond with 'I don't know'. "
    "Please use LaTeX formatting for mathematical expressions by writing them between dollar signs."
    "For example, to write a matrix, use $\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$. "
    "You can use the following theorems to answer the question. " + "\n\n".join(theorems_text)
)

True/False questions

In [14]:
# One request per True/False question; no retrieval (use_RAG keeps its default False),
# the theorems are already part of custom_prompt.
llm_answers_TF = get_llm_answers_openai(exam_questions_TF, custom_prompt, openai_model)

100%|██████████| 84/84 [02:55<00:00,  2.09s/it]


In [15]:
# Reduce both the model answers and the reference solutions to bare verdicts.
llm_answers_cleaned = extract_answer_llm(llm_answers_TF)
Martijn_answers = extract_answers_Martijn(exam_answers_TF)

# The accuracy pairs the two lists by position, so they must be equally long
# (same guard as in the baseline evaluation).
assert len(llm_answers_cleaned) == len(Martijn_answers), "Mismatch between LLM answers and Martijn's answers."

accuracy = compute_accuracy(Martijn_answers, llm_answers_cleaned)
print(f"Accuracy of LLM answers compared to Martijn's answers: {accuracy:.2%}")

Accuracy of LLM answers compared to Martijn's answers: 63.10%


In [None]:
# Store the questions, the unprocessed model answers of the all-in-prompt run
# and the reference solutions as a pickled DataFrame.
df_baseline_GPT3_5_PROMPT_TF = convert_dataframe(
    exam_questions_TF,
    llm_answers_TF,
    exam_answers_TF
)
df_baseline_GPT3_5_PROMPT_TF.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_TF_3.pkl")
# Disabled alternatives: read a stored pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_TF.pkl")
#df_baseline_GPT3_5_PROMPT_TF.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_TF.csv", sep="\t", index=False)

Construction questions

In [14]:
# The model must be asked the construction *questions*; the reference answers
# (exam_answers_construction) are only stored next to the model output.
llm_answers_construction = get_llm_answers_openai(exam_questions_construction, custom_prompt_construction, openai_model)

100%|██████████| 24/24 [00:48<00:00,  2.01s/it]


In [15]:
# Store the construction questions, the model answers of the all-in-prompt run
# and the reference solutions as a pickled DataFrame.
df_baseline_GPT3_5_PROMPT_construction = convert_dataframe(
    exam_questions_construction,
    llm_answers_construction,
    exam_answers_construction
)
df_baseline_GPT3_5_PROMPT_construction.to_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_Construction.pkl")
# Disabled alternatives: read the pickle back / export as tab-separated CSV.
#temp = pd.read_pickle("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_Construction.pkl")
#df_baseline_GPT3_5_PROMPT_construction.to_csv("results/GPT-3_5-Turbo/baseline_GPT3_5_PROMPT_Construction.csv", sep="\t", index=False)