In [26]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
# NOTE(review): the next import rebinds `tqdm`, so the class from the plain
# `tqdm` package (not `tqdm.auto`) is the one in effect from here on.
from tqdm import tqdm, tqdm_notebook
# Enables the `progress_apply` calls used on the DataFrame in later cells.
tqdm.pandas()

In [4]:
# CSV `results-gpt4o-mini.csv` from the DataTalksClub/llm-zoomcamp repository.
# `?raw=1` asks GitHub for the file contents instead of the HTML blob page.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv?raw=1'
df = pd.read_csv(url)

In [15]:
# Restrict the frame to its first 300 rows, then preview the first two.
df = df.head(300)
df.iloc[:2]

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp


## Q1. Getting the embeddings model

In [9]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained model handed to SentenceTransformer.
model_name = "multi-qa-mpnet-base-dot-v1"

# Encoder used by every later cell that calls `embedding_model.encode(...)`.
embedding_model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [11]:
# LLM-generated answer of the first row; echoed as the cell output.
answer_llm = df['answer_llm'].iloc[0]
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [14]:
# Embed the answer and display the first component of the resulting vector.
answer_llm_vector = embedding_model.encode(answer_llm)
answer_llm_vector[0]

-0.42244655

The first value of the resulting vector = -0.42

## Q2. Computing the dot product

In [18]:
# Take both answers from the first record.
first_row = df.iloc[0]
answer_llm = first_row['answer_llm']
answer_orig = first_row['answer_orig']

# Dot product of the two answer embeddings (LLM answer encoded first).
llm_vector = embedding_model.encode(answer_llm)
orig_vector = embedding_model.encode(answer_orig)
llm_vector.dot(orig_vector)

17.515987

In [20]:
def _dot_similarity(row):
    """Return the dot product of the embeddings of a row's two answers."""
    llm_vector = embedding_model.encode(row['answer_llm'])
    orig_vector = embedding_model.encode(row['answer_orig'])
    return llm_vector.dot(orig_vector)


# One score per row, computed row by row with a progress bar.
df['evaluations'] = df.progress_apply(_dot_similarity, axis=1)
df['evaluations'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: evaluations, dtype: float64

The 75th percentile of the dot-product score = 31.67

## Q3. Computing the cosine

In [27]:
def norma(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        Numeric values; converted with ``np.array`` before use.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by the square root of the sum of its squared elements.
        When that norm is zero (e.g. an all-zero or empty input) the values
        are returned undivided instead of producing NaN through ``0 / 0``.
    """
    v = np.array(v)
    norm = np.sqrt((v * v).sum())
    # Guard the degenerate case: dividing by 1.0 keeps the zeros (and the
    # dtype a true division would produce) rather than yielding NaNs that
    # would silently propagate into the similarity statistics.
    divisor = 1.0 if norm == 0 else norm
    v_norm = v / divisor
    return v_norm

def _cosine_similarity(row):
    """Return the dot product of the `norma`-scaled embeddings of a row's two answers."""
    llm_unit = norma(embedding_model.encode(row['answer_llm']))
    orig_unit = norma(embedding_model.encode(row['answer_orig']))
    return llm_unit.dot(orig_unit)


# One cosine score per row, computed row by row with a progress bar.
df['evaluations_norm'] = df.progress_apply(_cosine_similarity, axis=1)
df['evaluations_norm'].describe()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:20<00:00,  3.71it/s]


count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: evaluations_norm, dtype: float64

The 75th percentile of the cosine similarity scores ≈ 0.836 (0.83 when truncated to two decimals)

## Q4. Rouge

In [28]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [30]:
from rouge import Rouge

rouge_scorer = Rouge()


def _rouge_for_row(row):
    """Score a row's LLM answer against its original answer.

    `get_scores` returns a list; its first element is kept.
    """
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])
    return scores[0]


df['rouge_scores'] = df.progress_apply(_rouge_for_row, axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 355.14it/s]


In [34]:
# ROUGE scores of the record with index label 10 among the rows of document '5170565b'.
document_mask = df['document'] == '5170565b'
df[document_mask]['rouge_scores'][10]

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

The F score for rouge-1 = 0.45

## Q5. Average rouge score

In [42]:
# Same record as in Q4: index label 10 among the rows of document '5170565b'.
res = df[df['document'] == '5170565b']['rouge_scores'][10]

# Mean of the three F-scores, added in the order rouge-1, rouge-2, rouge-l.
f_rouge_1, f_rouge_2, f_rouge_l = (
    res[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)
average = (f_rouge_1 + f_rouge_2 + f_rouge_l) / 3
average

0.35490034990035496

The average between rouge-1, rouge-2 and rouge-l for the same record from Q4 = 0.35

## Q6. Average rouge score for all the data points

In [44]:
# Pull the F-score of each ROUGE variant out of the per-row score dicts.
for metric, column in (
    ('rouge-1', 'rouge_1'),
    ('rouge-2', 'rouge_2'),
    ('rouge-l', 'rouge_l'),
):
    # `key=metric` binds the current metric name to this lambda.
    df[column] = df['rouge_scores'].apply(lambda scores, key=metric: scores[key]['f'])

# Row-wise mean of the three F-scores.
df['rouge_avg'] = (df['rouge_1'] + df['rouge_2'] + df['rouge_l']) / 3

In [45]:
# Summary statistics of the ROUGE-2 F-score column.
df['rouge_2'].describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge_2, dtype: float64

The average rouge_2 across all the records ≈ 0.207 (0.20 when truncated to two decimals)