# Getting the data

In [1]:
# GitHub blob-page URL of the results CSV (DataTalksClub/llm-zoomcamp, 04-monitoring/data).
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

In [2]:
import pandas as pd

# The blob-page URL is suffixed with ?raw=1 before being handed to pandas,
# which downloads and parses the CSV in one call.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [5]:
# Keep only the first 300 rows; every later cell works on this subset.
df = df.iloc[:300]

In [18]:
# Preview the first two rows to check the available columns.
df.head(2)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp


# Q1. Getting the embeddings model

In [7]:
# Name of the model handed to SentenceTransformer when the embedding model is created.
model_name = 'multi-qa-mpnet-base-dot-v1'

In [8]:
from sentence_transformers import SentenceTransformer
# Instantiate the embedding model identified by `model_name`; it is reused by every encode() call below.
embedding_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# LLM-generated answer stored in the first row of the dataframe.
answer_llm = df['answer_llm'].iloc[0]
print(answer_llm)

You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).


In [16]:
# encode() receives a one-element list, so [0] selects the embedding of that single answer
# and the second [0] selects its first component.
embedded_answer = embedding_model.encode([answer_llm])
print(embedded_answer[0][0].round(2))

-0.42


# Q2. Computing the dot product

In [26]:
import tqdm
import numpy as np
evaluations = []

# Dot product between the embedding of each original answer and the embedding
# of the matching LLM answer, one row at a time.
# Loop-local names are distinct from the Q1 variable `answer_llm`, so that
# variable still holds the first row's answer after this cell has run, and
# `total` lets tqdm draw a real progress bar instead of a bare counter.
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    llm_vector = embedding_model.encode([row.answer_llm])[0]
    orig_vector = embedding_model.encode([row.answer_orig])[0]

    # After the [0] indexing both operands are single vectors, so `@` is the
    # plain dot product and no transpose is needed.
    score = (orig_vector @ llm_vector).item()
    evaluations.append(score)


300it [00:34,  8.74it/s]


In [30]:
# Show the first five dot-product scores, then their 75th percentile.
print(evaluations[:5])
percentiles = np.percentile(evaluations, q=[75])
print("75th percentile:", np.round(percentiles[0], 2))

[17.515993118286133, 13.418405532836914, 25.313255310058594, 12.147417068481445, 18.747726440429688]
75th percentile: 31.67


# Q3. Computing the cosine similarity

In [31]:
import tqdm
import numpy as np
evaluations_norm = []

def normalize(v):
    """Divide ``v`` by the square root of the sum of its squared elements.

    The sum runs over every element of ``v``, so a single scalar divisor is
    applied to the whole array whatever its shape.
    """
    magnitude = np.sqrt(np.square(v).sum())
    return v / magnitude

# Cosine similarity between each original answer and the matching LLM answer:
# both embeddings are normalized first, then multiplied.
# Loop-local names are distinct from the Q1 variable `answer_llm`, so that
# variable still holds the first row's answer after this cell has run, and
# `total` lets tqdm draw a real progress bar instead of a bare counter.
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    llm_unit = normalize(embedding_model.encode([row.answer_llm]))
    orig_unit = normalize(embedding_model.encode([row.answer_orig]))

    # encode() was given one text per call, so the product holds exactly one
    # value, which .item() extracts as a Python scalar.
    cosine = (orig_unit @ llm_unit.T).item()
    evaluations_norm.append(cosine)

300it [00:34,  8.78it/s]


In [33]:
# Show the first five cosine scores, then their 75th percentile.
print(evaluations_norm[:5])
percentiles = np.percentile(evaluations_norm, q=[75])
print("75th percentile:", np.round(percentiles[0], 3))


[0.5067542195320129, 0.3885490298271179, 0.7185991406440735, 0.33726635575294495, 0.5217921733856201]
75th percentile: 0.836


# Q4. Rouge

In [40]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score the LLM answer of the row at position 10 against its original answer;
# get_scores() returns a list, of which only the first entry is kept.
r = df.iloc[10]

scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
print(scores['rouge-1']['f'])

0.45454544954545456


# Q5. Average rouge score

In [43]:
# Pull the F-score of each ROUGE variant out of `scores`, then take their plain mean.
rouge_1, rouge_2, rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3
print(rouge_avg)

0.35490034990035496


# Q6. Average rouge score for all the data points

In [46]:
import tqdm
import numpy as np
evaluations_rouge_avg = []

# Collect the ROUGE-1 / ROUGE-2 / ROUGE-L F-scores and their mean for every row.
# Loop-local names carry a `row_` prefix so the single-row results held in
# `scores`, `rouge_1`, `rouge_2`, `rouge_l` and `rouge_avg` are not overwritten.
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    row_rouge_1 = row_scores['rouge-1']['f']
    row_rouge_2 = row_scores['rouge-2']['f']
    row_rouge_l = row_scores['rouge-l']['f']
    row_rouge_avg = (row_rouge_1 + row_rouge_2 + row_rouge_l) / 3
    evaluations_rouge_avg.append([row_rouge_1, row_rouge_2, row_rouge_l, row_rouge_avg])

# Build the dataframe once, after the loop: rebuilding it on every iteration
# repeats the same work for each row and leaves `eval_df` undefined when `df` is empty.
eval_df = pd.DataFrame(evaluations_rouge_avg, columns=['rouge_1', 'rouge_2', 'rouge_l', 'rouge_avg'])

eval_df['rouge_2'].mean()

300it [00:00, 711.79it/s]


0.20696501983423318