In [1]:
import pandas as pd

In [2]:
# GitHub page of the results CSV that is evaluated in this notebook.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
# NOTE(review): the `raw=1` query parameter presumably makes GitHub serve the
# file contents instead of the HTML page — confirm.
url = f'{github_url}?raw=1'
# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url)

In [3]:
# Keep only the first 300 rows; `df` is rebound, so the full frame is no longer
# available after this cell.
df = df.iloc[:300]

In [4]:
from sentence_transformers import SentenceTransformer

In [5]:
# Name of the pretrained model handed to SentenceTransformer.
model_name = 'multi-qa-mpnet-base-dot-v1'

# Instantiate the embedding model once; it is reused for every encode() call.
embedding_model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [6]:
# Take the LLM-generated answer of the first row as a single example ...
answer_llm = df.iloc[0].answer_llm
# ... and turn it into an embedding vector with the sentence-transformers model.
answer_llm_embeddings = embedding_model.encode(answer_llm)

In [7]:
# Inspect the embedding of the first LLM answer (displays the whole array).
answer_llm_embeddings

array([-4.22446549e-01, -2.24856257e-01, -3.24058414e-01, -2.84758478e-01,
        7.25642918e-03,  1.01186566e-01,  1.03716910e-01, -1.89983174e-01,
       -2.80599259e-02,  2.71588802e-01, -1.15337655e-01,  1.14666030e-01,
       -8.49586725e-02,  3.32365334e-01,  5.52720726e-02, -2.22195774e-01,
       -1.42540857e-01,  1.02519155e-01, -1.52333647e-01, -2.02912465e-01,
        1.98422875e-02,  8.38149190e-02, -5.68632066e-01,  2.32844148e-02,
       -1.67292684e-01, -2.39256918e-01, -8.05464387e-02,  2.57084146e-02,
       -8.15464780e-02, -7.39290118e-02, -2.61550009e-01,  1.92575473e-02,
        3.22909206e-01,  1.90357104e-01, -9.34726413e-05, -2.13165611e-01,
        2.88943425e-02, -1.79530401e-02, -5.92756271e-02,  1.99918285e-01,
       -4.75170948e-02,  1.71634093e-01, -2.45917086e-02, -9.38061550e-02,
       -3.57002735e-01,  1.33263692e-01,  1.94045901e-01, -1.18530318e-01,
        4.56915230e-01,  1.47728190e-01,  3.35945129e-01, -1.86959356e-01,
        2.45954901e-01, -

In [8]:
# Show the first record to see which columns the results file provides.
df.iloc[0]

answer_llm     You can sign up for the course by visiting the...
answer_orig    Machine Learning Zoomcamp FAQ\nThe purpose of ...
document                                                0227b872
question                     Where can I sign up for the course?
course                                 machine-learning-zoomcamp
Name: 0, dtype: object

In [9]:
from tqdm.auto import tqdm

In [10]:
# Raw (unnormalized) dot product between the embedding of the LLM answer and
# the embedding of the original answer, one score per row of `df`.
evaluations = []

# iterrows() is a generator without a length, so tqdm cannot work out the number
# of iterations on its own; passing total= gives a real progress bar and ETA.
for i, res in tqdm(df.iterrows(), total=len(df)):
    answer_llm_embeddings = embedding_model.encode(res['answer_llm'])
    answer_orig_embeddings = embedding_model.encode(res['answer_orig'])
    evaluations.append(answer_llm_embeddings.dot(answer_orig_embeddings))

0it [00:00, ?it/s]

In [11]:
# Summary statistics of the raw dot-product scores. The embeddings were not
# normalized, so these values are not bounded to [-1, 1].
pd.DataFrame(evaluations).describe()

Unnamed: 0,0
count,300.0
mean,27.495996
std,6.384742
min,4.547923
25%,24.307844
50%,28.33687
75%,31.674309
max,39.476013


In [12]:
import numpy as np

In [13]:
def vector_normalized(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Numeric vector. NumPy arrays are used as-is; plain lists and tuples
        are converted to an array first.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. An all-zero vector has norm 0, so the
        division yields NaNs (NumPy emits a RuntimeWarning in that case).
    """
    # asarray returns ndarrays unchanged and lets plain Python sequences work,
    # for which `v * v` would otherwise raise a TypeError.
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    return v / norm

In [14]:
# Cosine similarity: both embeddings are scaled to unit length before the dot
# product, so every score lies in [-1, 1].
evaluations1 = []

# iterrows() is a generator without a length, so tqdm cannot work out the number
# of iterations on its own; passing total= gives a real progress bar and ETA.
for i, res in tqdm(df.iterrows(), total=len(df)):
    answer_llm_embeddings = embedding_model.encode(res['answer_llm'])
    answer_orig_embeddings = embedding_model.encode(res['answer_orig'])
    answer_llm_embeddings_norm = vector_normalized(answer_llm_embeddings)
    answer_orig_embeddings_norm = vector_normalized(answer_orig_embeddings)
    evaluations1.append(answer_llm_embeddings_norm.dot(answer_orig_embeddings_norm))

0it [00:00, ?it/s]

In [15]:
# Summary statistics of the cosine-similarity scores (dot product of the
# unit-length embeddings).
pd.DataFrame(evaluations1).describe()

Unnamed: 0,0
count,300.0
mean,0.728393
std,0.157755
min,0.125357
25%,0.651273
50%,0.763761
75%,0.836235
max,0.958796


In [16]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [17]:
from rouge import Rouge

# Single scorer instance, reused for every get_scores() call in this notebook.
rouge_scorer = Rouge()

In [18]:
# Pick the row at position 10 as a single worked example for ROUGE scoring.
r = df.iloc[10]

In [19]:
# The LLM answer is passed first and the original answer second; get_scores()
# returns a list, and [0] takes its first entry (a dict keyed by 'rouge-1',
# 'rouge-2' and 'rouge-l').
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [20]:
# Show the nested result: for each ROUGE variant the 'r', 'p' and 'f' values.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [21]:
# Pull one value out of the nested dict: the 'f' score of rouge-1.
scores['rouge-1']['f']

0.45454544954545456

In [22]:
# Unweighted mean of the three 'f' scores (rouge-1, rouge-2, rouge-l) for the
# example row.
np.average([scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']])

0.35490034990035496

In [23]:
# ROUGE scores for every row of `df`, flattened to one record per row.
scores_list = []

# iterrows() is a generator without a length, so tqdm cannot work out the number
# of iterations on its own; passing total= gives a real progress bar and ETA.
for i, row in tqdm(df.iterrows(), total=len(df)):
    scores_row = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]

    # Flatten {'rouge-1': {'r': .., 'p': .., 'f': ..}, ...} into
    # {'rouge-1_r': .., 'rouge-1_p': .., 'rouge-1_f': .., ...}.
    # update() extends the dict in place instead of rebuilding it on each pass.
    scores_dict = {}
    for score in scores_row:
        scores_dict.update({score + '_' + k: val for k, val in scores_row[score].items()})

    scores_list.append(scores_dict)

# Build the frame once from the list of records.
scores_df = pd.DataFrame(scores_list)

0it [00:00, ?it/s]

In [24]:
# Display the per-row ROUGE table (one column per variant/metric pair).
scores_df

Unnamed: 0,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.061224,0.214286,0.095238,0.017544,0.071429,0.028169,0.061224,0.214286,0.095238
1,0.081633,0.266667,0.125000,0.035088,0.133333,0.055556,0.061224,0.200000,0.093750
2,0.326531,0.571429,0.415584,0.140351,0.242424,0.177778,0.306122,0.535714,0.389610
3,0.163265,0.320000,0.216216,0.035088,0.071429,0.047059,0.142857,0.280000,0.189189
4,0.265306,0.097015,0.142076,0.070175,0.022346,0.033898,0.224490,0.082090,0.120219
...,...,...,...,...,...,...,...,...,...
295,0.642857,0.666667,0.654545,0.559322,0.523810,0.540984,0.607143,0.629630,0.618182
296,0.642857,0.545455,0.590164,0.542373,0.400000,0.460432,0.607143,0.515152,0.557377
297,0.660714,0.649123,0.654867,0.593220,0.538462,0.564516,0.642857,0.631579,0.637168
298,0.285714,0.326531,0.304762,0.135593,0.129032,0.132231,0.285714,0.326531,0.304762


In [25]:
# Mean rouge-2 'f' score over all rows in scores_df.
np.average(scores_df.loc[:, 'rouge-2_f'])

0.20696501983423318