# Intrinsic evaluation - Mock exams - Vector similarity
Dr. ir. M. Boussé provided mock exams on which the system is evaluated.

## Step 1 - Load, calculate, return

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector; any sequence ``np.asarray`` can convert to floats.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when the
        product of the norms is zero.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # A zero vector has no direction. Dividing here would produce NaN
        # (plus a RuntimeWarning) and silently poison any mean taken over
        # the resulting scores, so report "no similarity" instead.
        return 0.0
    return np.dot(vec1, vec2) / denominator

def evaluate_vector_similarity(file_name: str, embedding_model):
    """Return the mean cosine similarity between LLM and reference answers.

    Loads a pickled DataFrame, embeds the "LLM Answer" and "Martijn Answer"
    of each row with ``embedding_model.embed_query``, and averages the
    per-row cosine similarities.

    Args:
        file_name: Path to a pickled pandas DataFrame that contains the
            columns "LLM Answer" and "Martijn Answer".
        embedding_model: Object exposing ``embed_query(text)`` that returns
            an embedding vector for ``text``.

    Returns:
        The mean of the per-row similarities, or NaN when the DataFrame has
        no rows.

    Raises:
        KeyError: If either answer column is missing from the DataFrame.
    """
    # NOTE: unpickling can execute arbitrary code; only load result files
    # that were produced by this project.
    df = pd.read_pickle(file_name)
    llm_answers = df["LLM Answer"].tolist()
    martijn_answers = df["Martijn Answer"].tolist()
    # Both lists are columns of the same DataFrame, so their lengths always
    # match and no explicit length check is needed.

    if not llm_answers:
        # np.mean([]) would also give NaN, but only after a RuntimeWarning.
        return float("nan")

    similarities = []
    answer_pairs = zip(llm_answers, martijn_answers)
    # zip() has no length, so pass the total to keep the progress bar exact.
    for llm_answer, martijn_answer in tqdm(answer_pairs, total=len(llm_answers)):
        llm_embedding = embedding_model.embed_query(llm_answer)
        reference_embedding = embedding_model.embed_query(martijn_answer)
        similarities.append(cosine_similarity(llm_embedding, reference_embedding))
    return np.mean(similarities)

## Step 2 - Setup system

In [None]:
from dotenv import load_dotenv
import os
from langchain_openai import OpenAIEmbeddings

# Pull the variables defined in a local .env file into the process environment.
load_dotenv()
OPENAI_API = os.environ.get("OPENAI_API_KEY")
# Embedding client passed to evaluate_vector_similarity in the cells below.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=OPENAI_API,
)

## Step 3 - Evaluate with system

In [None]:
#evaluate_vector_similarity("results/GPT-3_5-Turbo/baseline_GPT3_5_TF_1.pkl", embedding)

In [4]:
import os

# Score every result pickle stored under results/GPT-3_5-Turbo.
directory = "results/GPT-3_5-Turbo"
# os.listdir() returns entries in arbitrary order, so sort for reproducible
# output, and skip anything that is not a result pickle (e.g. .DS_Store),
# which would otherwise crash pd.read_pickle.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pkl"):
        continue
    score = evaluate_vector_similarity(f"{directory}/{filename}", embedding)
    print(f"{filename}: {score}")
    print()

100%|██████████| 24/24 [00:19<00:00,  1.24it/s]


baseline_GPT3_5_Construction.pkl: 0.5995982943090642



100%|██████████| 24/24 [00:17<00:00,  1.40it/s]


baseline_GPT3_5_PROMPT_Construction.pkl: 0.6802290016670476



100%|██████████| 84/84 [01:12<00:00,  1.16it/s]


baseline_GPT3_5_PROMPT_TF_1.pkl: 0.7264868759092055



100%|██████████| 84/84 [01:13<00:00,  1.15it/s]


baseline_GPT3_5_PROMPT_TF_2.pkl: 0.7404606349926406



100%|██████████| 84/84 [01:05<00:00,  1.29it/s]


baseline_GPT3_5_PROMPT_TF_3.pkl: 0.7267169690045898



100%|██████████| 24/24 [00:28<00:00,  1.18s/it]


baseline_GPT3_5_RAG_Construction.pkl: 0.6484688620869817



100%|██████████| 84/84 [01:14<00:00,  1.13it/s]


baseline_GPT3_5_RAG_TF_1.pkl: 0.7266822512554272



100%|██████████| 84/84 [00:57<00:00,  1.45it/s]


baseline_GPT3_5_RAG_TF_2.pkl: 0.728373680373423



100%|██████████| 84/84 [01:08<00:00,  1.22it/s]


baseline_GPT3_5_RAG_TF_3.pkl: 0.7222415618019398



100%|██████████| 84/84 [01:02<00:00,  1.35it/s]


baseline_GPT3_5_TF_1.pkl: 0.7158339720340555



100%|██████████| 84/84 [01:08<00:00,  1.23it/s]


baseline_GPT3_5_TF_2.pkl: 0.715062429514593



100%|██████████| 84/84 [01:23<00:00,  1.01it/s]

baseline_GPT3_5_TF_3.pkl: 0.7152455341035397






In [6]:
# Score every result pickle stored under results/LearnLM.
directory = "results/LearnLM"
# os.listdir() returns entries in arbitrary order, so sort for reproducible
# output, and skip anything that is not a result pickle (e.g. .DS_Store),
# which would otherwise crash pd.read_pickle.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pkl"):
        continue
    score = evaluate_vector_similarity(f"{directory}/{filename}", embedding)
    print(f"{filename}: {score}")
    print()

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:18<00:00,  1.29it/s]


LearnLM_Construction.pkl: 0.6307196873615832



100%|██████████| 24/24 [00:16<00:00,  1.47it/s]


LearnLM_PROMPT_Construction.pkl: 0.5535162491702663



100%|██████████| 84/84 [00:59<00:00,  1.40it/s]


LearnLM_PROMPT_TF_1.pkl: 0.7531173311655561



100%|██████████| 84/84 [01:02<00:00,  1.34it/s]


LearnLM_PROMPT_TF_2.pkl: 0.753585791170458



100%|██████████| 84/84 [01:10<00:00,  1.19it/s]


LearnLM_PROMPT_TF_3.pkl: 0.7535598328498402



100%|██████████| 24/24 [00:13<00:00,  1.74it/s]


LearnLM_RAG_Construction.pkl: 0.47075392716945913



100%|██████████| 84/84 [01:12<00:00,  1.16it/s]


LearnLM_RAG_TF_1.pkl: 0.7571904698903734



100%|██████████| 84/84 [01:09<00:00,  1.21it/s]


LearnLM_RAG_TF_2.pkl: 0.755472006455475



100%|██████████| 84/84 [01:07<00:00,  1.25it/s]


LearnLM_RAG_TF_3.pkl: 0.7596482059966818



100%|██████████| 84/84 [00:56<00:00,  1.48it/s]


LearnLM_TF_1.pkl: 0.7584957546534591



100%|██████████| 84/84 [03:09<00:00,  2.26s/it]


LearnLM_TF_2.pkl: 0.7558459985758241



100%|██████████| 84/84 [00:58<00:00,  1.44it/s]

LearnLM_TF_3.pkl: 0.7526197097598564






In [7]:
# Score every result pickle stored under results/LearnLM-code.
directory = "results/LearnLM-code"
# os.listdir() returns entries in arbitrary order, so sort for reproducible
# output, and skip anything that is not a result pickle (e.g. .DS_Store),
# which would otherwise crash pd.read_pickle.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pkl"):
        continue
    score = evaluate_vector_similarity(f"{directory}/{filename}", embedding)
    print(f"{filename}: {score}")
    print()

100%|██████████| 24/24 [02:26<00:00,  6.12s/it]


LearnLM_CODE_Construction.pkl: 0.5697890959810658



100%|██████████| 84/84 [01:26<00:00,  1.02s/it]


LearnLM_CODE_TF_1.pkl: 0.7346707689661456



100%|██████████| 84/84 [01:03<00:00,  1.32it/s]


LearnLM_CODE_TF_2.pkl: 0.7361402342534616



100%|██████████| 84/84 [00:57<00:00,  1.47it/s]


LearnLM_CODE_TF_3.pkl: 0.737995694236698



100%|██████████| 24/24 [00:15<00:00,  1.57it/s]


LearnLM_PROMPT_CODE_Construction.pkl: 0.5851901264990311



100%|██████████| 84/84 [01:12<00:00,  1.15it/s]


LearnLM_PROMPT_CODE_TF_1.pkl: 0.7209737969385371



100%|██████████| 84/84 [01:02<00:00,  1.35it/s]


LearnLM_PROMPT_CODE_TF_2.pkl: 0.7187773560179216



100%|██████████| 84/84 [02:01<00:00,  1.45s/it]


LearnLM_PROMPT_CODE_TF_3.pkl: 0.7165470939428866



100%|██████████| 24/24 [00:15<00:00,  1.55it/s]


LearnLM_RAG_CODE_Construction.pkl: 0.5854900776201216



100%|██████████| 84/84 [01:23<00:00,  1.01it/s]


LearnLM_RAG_CODE_TF_1.pkl: 0.7376574860195296



100%|██████████| 84/84 [01:02<00:00,  1.34it/s]


LearnLM_RAG_CODE_TF_2.pkl: 0.7345034277960428



100%|██████████| 84/84 [01:01<00:00,  1.36it/s]

LearnLM_RAG_CODE_TF_3.pkl: 0.7337120724717872






In [None]:
# Score every result pickle stored under results/o4-mini.
directory = "results/o4-mini"
# os.listdir() returns entries in arbitrary order, so sort for reproducible
# output, and skip anything that is not a result pickle (e.g. .DS_Store),
# which would otherwise crash pd.read_pickle.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pkl"):
        continue
    score = evaluate_vector_similarity(f"{directory}/{filename}", embedding)
    # ".4f" gives four decimals; the previous "4f" only set a minimum field
    # width of 4 and still printed the default six decimals.
    print(f"{filename}: {score:.4f}")
    print()

100%|██████████| 24/24 [00:26<00:00,  1.09s/it]


o4mini_Construction.pkl: 0.650098



100%|██████████| 24/24 [02:38<00:00,  6.61s/it]


o4mini_PROMPT_Construction.pkl: 0.621577



100%|██████████| 84/84 [00:55<00:00,  1.53it/s]


o4mini_PROMPT_TF_1.pkl: 0.739318



100%|██████████| 84/84 [00:59<00:00,  1.42it/s]


o4mini_PROMPT_TF_2.pkl: 0.738640



100%|██████████| 84/84 [01:02<00:00,  1.35it/s]


o4mini_PROMPT_TF_3.pkl: 0.740549



100%|██████████| 24/24 [00:24<00:00,  1.03s/it]


o4mini_RAG_Construction.pkl: 0.681001



100%|██████████| 84/84 [01:07<00:00,  1.25it/s]


o4mini_RAG_TF_1.pkl: 0.742882



100%|██████████| 84/84 [01:47<00:00,  1.28s/it]


o4mini_RAG_TF_2.pkl: 0.740757



100%|██████████| 84/84 [01:03<00:00,  1.32it/s]


o4mini_RAG_TF_3.pkl: 0.737057



100%|██████████| 84/84 [01:07<00:00,  1.25it/s]


o4mini_TF_1.pkl: 0.733498



100%|██████████| 84/84 [01:01<00:00,  1.36it/s]


o4mini_TF_2.pkl: 0.736885



100%|██████████| 84/84 [00:53<00:00,  1.56it/s]

o4mini_TF_3.pkl: 0.733012






In [10]:
# Score every result pickle stored under results/o4-mini-code.
directory = "results/o4-mini-code"
# os.listdir() returns entries in arbitrary order, so sort for reproducible
# output, and skip anything that is not a result pickle (e.g. .DS_Store),
# which would otherwise crash pd.read_pickle.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pkl"):
        continue
    score = evaluate_vector_similarity(f"{directory}/{filename}", embedding)
    print(f"{filename}: {score:.4f}")
    print()

100%|██████████| 24/24 [00:16<00:00,  1.47it/s]


o4mini_CODE_Construction.pkl: 0.6593



100%|██████████| 84/84 [00:58<00:00,  1.42it/s]


o4mini_CODE_TF_1.pkl: 0.7321



100%|██████████| 84/84 [01:01<00:00,  1.36it/s]


o4mini_CODE_TF_2.pkl: 0.7402



100%|██████████| 84/84 [01:14<00:00,  1.13it/s]


o4mini_CODE_TF_3.pkl: 0.7361



100%|██████████| 24/24 [00:17<00:00,  1.40it/s]


o4mini_PROMPT_CODE_Construction.pkl: 0.6636



100%|██████████| 84/84 [01:07<00:00,  1.25it/s]


o4mini_PROMPT_CODE_TF_1.pkl: 0.7428



100%|██████████| 84/84 [01:08<00:00,  1.22it/s]


o4mini_PROMPT_CODE_TF_2.pkl: 0.7446



100%|██████████| 84/84 [01:13<00:00,  1.14it/s]


o4mini_PROMPT_CODE_TF_3.pkl: 0.7398



100%|██████████| 24/24 [00:15<00:00,  1.52it/s]


o4mini_RAG_CODE_Construction.pkl: 0.6521



100%|██████████| 84/84 [00:54<00:00,  1.54it/s]


o4mini_RAG_CODE_TF_1.pkl: 0.7413



100%|██████████| 84/84 [00:57<00:00,  1.46it/s]


o4mini_RAG_CODE_TF_2.pkl: 0.7368



100%|██████████| 84/84 [01:00<00:00,  1.40it/s]

o4mini_RAG_CODE_TF_3.pkl: 0.7441




