In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests 
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


# Getting the data
Let's start by getting the dataset. We will use the data we generated in the module.

In particular, we'll evaluate the quality of our RAG system with gpt-4o-mini

Read it:

In [2]:
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = f'{github_url}?raw=1'
df = pd.read_csv(url)

We will use only the first 300 documents:

In [3]:
df = df.iloc[:300]
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


# Q1. Getting the embeddings model
Now, get the embeddings model multi-qa-mpnet-base-dot-v1 from the Sentence Transformer library

Note: this is not the same model as in HW3

In [5]:
from sentence_transformers import SentenceTransformer
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Create the embeddings for the first LLM answer:

In [6]:
answer_llm = df.iloc[0].answer_llm

What's the first value of the resulting vector?

In [7]:
embedding_model.encode(answer_llm)[0]

np.float32(-0.42244682)

### The answer is -0.42

# Q2. Computing the dot product
Now for each answer pair, let's create embeddings and compute dot product between them

We will put the results (scores) into the evaluations list

What's the 75% percentile of the score?

In [9]:
# Create embeddings for each pair
evaluations = [] # initialize empty evaluations list

for index, row in tqdm(df.iterrows()):
    evaluations.append(np.dot(embedding_model.encode(row.answer_llm),embedding_model.encode(row.answer_orig)))

evaluations

300it [01:38,  3.05it/s]


[np.float32(17.515991),
 np.float32(13.4184065),
 np.float32(25.313257),
 np.float32(12.147416),
 np.float32(18.74773),
 np.float32(33.970394),
 np.float32(30.251696),
 np.float32(29.52159),
 np.float32(35.27221),
 np.float32(27.751755),
 np.float32(32.344707),
 np.float32(31.441845),
 np.float32(36.38073),
 np.float32(33.340508),
 np.float32(30.606163),
 np.float32(32.503044),
 np.float32(29.674438),
 np.float32(24.353466),
 np.float32(20.132465),
 np.float32(23.99546),
 np.float32(30.880272),
 np.float32(32.692436),
 np.float32(30.049168),
 np.float32(16.07816),
 np.float32(31.796417),
 np.float32(37.98001),
 np.float32(20.839039),
 np.float32(32.61286),
 np.float32(38.894203),
 np.float32(34.051826),
 np.float32(28.26388),
 np.float32(27.12483),
 np.float32(23.975267),
 np.float32(26.34014),
 np.float32(18.658121),
 np.float32(25.016405),
 np.float32(21.101137),
 np.float32(33.72679),
 np.float32(29.340351),
 np.float32(28.654493),
 np.float32(29.608587),
 np.float32(30.810726),
 np

In [10]:
np.percentile(evaluations, 75)

np.float32(31.674307)

### The answer is 31.67

# From Q2, we can see that the results are not within the [0, 1] range. It's because the vectors coming from this model are not normalized.

So we need to normalize them.

To do it, we

Compute the norm of a vector
Divide each element by this norm
So, for vector v, it'll be v / ||v||

In numpy, this is how you do it:

norm = np.sqrt((v * v).sum())

v_norm = v / norm

Let's put it into a function and then compute dot product between normalized vectors. This will give us cosine similarity

What's the 75% cosine in the scores?

In [11]:
# First define a function for normalization

def normalize(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

# Now repeat the comutation from Q2, but with normalized vectors
evaluations_normalized = [] # initialize empty evaluations list

for index, row in tqdm(df.iterrows()):
    evaluations_normalized.append(np.dot(normalize(embedding_model.encode(row.answer_llm)),normalize(embedding_model.encode(row.answer_orig))))

evaluations_normalized

300it [01:22,  3.64it/s]


[np.float32(0.5067541),
 np.float32(0.38854894),
 np.float32(0.7185991),
 np.float32(0.33726627),
 np.float32(0.5217923),
 np.float32(0.83053184),
 np.float32(0.7462832),
 np.float32(0.69440633),
 np.float32(0.8468865),
 np.float32(0.65590715),
 np.float32(0.77795565),
 np.float32(0.78356636),
 np.float32(0.9046881),
 np.float32(0.8063029),
 np.float32(0.7275961),
 np.float32(0.7751893),
 np.float32(0.7151661),
 np.float32(0.58905566),
 np.float32(0.53322935),
 np.float32(0.5857588),
 np.float32(0.812327),
 np.float32(0.83714414),
 np.float32(0.7661154),
 np.float32(0.4333397),
 np.float32(0.81558573),
 np.float32(0.92667866),
 np.float32(0.55261576),
 np.float32(0.7622106),
 np.float32(0.9452982),
 np.float32(0.847837),
 np.float32(0.7192839),
 np.float32(0.6864791),
 np.float32(0.6100941),
 np.float32(0.6491078),
 np.float32(0.48555008),
 np.float32(0.6549568),
 np.float32(0.52971894),
 np.float32(0.84890294),
 np.float32(0.7395624),
 np.float32(0.76096797),
 np.float32(0.70153177),


In [12]:
np.percentile(evaluations_normalized, 75)

np.float32(0.8362348)

### The answer is 0.83

# Q4. Rouge
Now we will explore an alternative metric - the ROUGE score.

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:

pip install rouge
(The latest version at the moment of writing is 1.0.1)

Let's compute the ROUGE score between the answers at the index 10 of our dataframe (doc_id=5170565b)

from rouge import Rouge

rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

There are three scores: rouge-1, rouge-2 and rouge-l, and precision, recall and F1 score for each.

rouge-1 - the overlap of unigrams,
rouge-2 - bigrams,
rouge-l - the longest common subsequence

What's the F score for rouge-1?

In [15]:
from rouge import Rouge
rouge_scorer = Rouge()

# Assign the 10th record to r
r = df.iloc[10].to_dict()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

### The answer is 0.45

# Q5. Average rouge score
Let's compute the average F-score between rouge-1, rouge-2 and rouge-l for the same record from Q4

In [16]:
np.mean([scores['rouge-1']['f'],scores['rouge-2']['f'],scores['rouge-l']['f']])

np.float64(0.35490034990035496)

### The answer is 0.35

# Q6. Average rouge score for all the data points
Now let's compute the score for all the records and create a dataframe from them.

What's the average rouge_2 across all the records?

In [17]:
rouge_scores=[]

for index,row in df.iterrows():
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    rouge_scores.append(scores['rouge-2'])

df_rouge_scores = pd.DataFrame(rouge_scores)
df_rouge_scores

Unnamed: 0,r,p,f
0,0.017544,0.071429,0.028169
1,0.035088,0.133333,0.055556
2,0.140351,0.242424,0.177778
3,0.035088,0.071429,0.047059
4,0.070175,0.022346,0.033898
...,...,...,...
295,0.559322,0.523810,0.540984
296,0.542373,0.400000,0.460432
297,0.593220,0.538462,0.564516
298,0.135593,0.129032,0.132231


In [18]:
np.mean(df_rouge_scores['f'])

np.float64(0.20696501983423318)

### The answer is 0.20