Homework for https://github.com/DataTalksClub/llm-zoomcamp/blob/main/cohorts/2024/04-monitoring/homework.md

In [1]:
import pandas as pd

In [2]:
# Location of the results CSV in the course repository; the `?raw=1` query
# string is appended before the file is downloaded.
github_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'

df = pd.read_csv(url)

In [3]:
# Keep only the first 300 records. The explicit .copy() detaches the subset
# from the full frame, so the columns added to `df` later in the notebook are
# written to an independent frame instead of a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.iloc[:300].copy()

In [4]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


## Q1

In [5]:
from sentence_transformers import SentenceTransformer

# Sentence-transformers checkpoint that backs every `embedding_model.encode`
# call in this notebook.
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name_or_path=model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [6]:
# LLM-generated answer of the first record (position 0).
answer_llm = df['answer_llm'].iloc[0]

In [7]:
# Encode the selected LLM answer and display the first component of the
# resulting embedding.
embedding_model.encode(answer_llm)[0]

-0.42244658

## Q2

In [8]:
from tqdm.auto import tqdm

In [9]:
def compute_dot_product(record):
    """Return the dot product between the embeddings of a record's two answers.

    Parameters
    ----------
    record : mapping-like
        Must support ``record['answer_llm']`` and ``record['answer_orig']``;
        both values are passed to ``embedding_model.encode``.

    Returns
    -------
    The result of ``<llm embedding>.dot(<original embedding>)``. The
    embeddings are used as returned by the model (no normalisation here).
    """
    # Encode the LLM answer first and the original answer second.
    llm_embedding, orig_embedding = (
        embedding_model.encode(record[field])
        for field in ('answer_llm', 'answer_orig')
    )
    return llm_embedding.dot(orig_embedding)

In [10]:
# One dot-product score per row of `df`, in row order; tqdm reports progress
# because each row requires two model encodes.
evaluations = [
    compute_dot_product(record)
    for _, record in tqdm(df.iterrows(), total=len(df))
]

100%|██████████| 300/300 [00:59<00:00,  5.07it/s]


In [11]:
# Store the per-row scores as a new column on `df`, then display summary
# statistics for that column.
df['dot_product'] = evaluations
df['dot_product'].describe()

count    300.000000
mean      27.495996
std        6.384743
min        4.547925
25%       24.307841
50%       28.336864
75%       31.674304
max       39.476013
Name: dot_product, dtype: float64

## Q3

In [12]:
import numpy as np

In [13]:
def normalize_vector(v: np.ndarray) -> np.ndarray:
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : np.ndarray
        Vector to normalise. Any array-like is accepted and converted with
        ``np.asarray``.

    Returns
    -------
    np.ndarray
        ``v`` divided by its L2 norm. If the norm is exactly zero (all-zero or
        empty input) an all-zero float array of the same shape is returned
        instead of dividing 0/0, which would produce NaNs and a RuntimeWarning.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero vector has no direction; keep it as zeros so that downstream
        # dot products evaluate to 0 rather than NaN.
        return v.astype(norm.dtype)
    return v / norm

In [14]:
def compute_similarity(record):
    """Return the dot product of the normalised embeddings of a record's answers.

    Parameters
    ----------
    record : mapping-like
        Must support ``record['answer_llm']`` and ``record['answer_orig']``.

    Returns
    -------
    The dot product of the two embeddings after each has been passed through
    ``normalize_vector``.
    """
    # Encode and normalise the LLM answer first, then the original answer.
    unit_vectors = [
        normalize_vector(embedding_model.encode(record[key]))
        for key in ('answer_llm', 'answer_orig')
    ]
    llm_unit, orig_unit = unit_vectors
    return llm_unit.dot(orig_unit)

In [15]:
# One normalised-embedding similarity per row of `df`, in row order.
similarities = [
    compute_similarity(record)
    for _, record in tqdm(df.iterrows(), total=len(df))
]

100%|██████████| 300/300 [00:15<00:00, 19.00it/s]


In [16]:
# Store the per-row similarities as a new column on `df`, then display
# summary statistics for that column.
df['cosine_sim'] = similarities
df['cosine_sim'].describe()

count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_sim, dtype: float64

## Q4

In [17]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [18]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score the LLM answer of the record at position 10 against its original
# answer; get_scores returns a list, of which the first entry is kept.
sample_record = df.iloc[10]
scores = rouge_scorer.get_scores(
    sample_record['answer_llm'],
    sample_record['answer_orig'],
)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5

In [19]:
# Collect the 'f' entry of every metric in `scores`, in the dict's own order.
f_scores = [metric['f'] for metric in scores.values()]
f_scores

[0.45454544954545456, 0.21621621121621637, 0.393939388939394]

In [20]:
# Average of the F-scores collected in `f_scores`.
np.mean(f_scores)

0.35490034990035496

## Q6

In [21]:
# Score every LLM answer against its original answer. Each entry is the first
# element returned by get_scores for that row: a dict keyed by metric
# ('rouge-1', 'rouge-2', 'rouge-l') holding 'r', 'p' and 'f' values.
rouge_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    for _, record in tqdm(df.iterrows(), total=len(df))
]
# Display only a small sample: echoing one dict per row floods the notebook
# output and hides the results that follow.
rouge_scores[:3]

100%|██████████| 300/300 [00:00<00:00, 481.02it/s]


[{'rouge-1': {'r': 0.061224489795918366,
   'p': 0.21428571428571427,
   'f': 0.09523809178130524},
  'rouge-2': {'r': 0.017543859649122806,
   'p': 0.07142857142857142,
   'f': 0.028169010918468917},
  'rouge-l': {'r': 0.061224489795918366,
   'p': 0.21428571428571427,
   'f': 0.09523809178130524}},
 {'rouge-1': {'r': 0.08163265306122448,
   'p': 0.26666666666666666,
   'f': 0.12499999641113292},
  'rouge-2': {'r': 0.03508771929824561,
   'p': 0.13333333333333333,
   'f': 0.05555555225694465},
  'rouge-l': {'r': 0.061224489795918366, 'p': 0.2, 'f': 0.09374999641113295}},
 {'rouge-1': {'r': 0.32653061224489793,
   'p': 0.5714285714285714,
   'f': 0.41558441095631643},
  'rouge-2': {'r': 0.14035087719298245,
   'p': 0.24242424242424243,
   'f': 0.17777777313333343},
  'rouge-l': {'r': 0.30612244897959184,
   'p': 0.5357142857142857,
   'f': 0.3896103849822905}},
 {'rouge-1': {'r': 0.16326530612244897, 'p': 0.32, 'f': 0.2162162117421476},
  'rouge-2': {'r': 0.03508771929824561,
   'p': 0

In [22]:
# Flatten the nested score dicts into one row per record, with one column per
# metric/statistic pair (e.g. 'rouge-1.r', 'rouge-2.f').
rouge_df = pd.json_normalize(rouge_scores)
rouge_df

Unnamed: 0,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,0.061224,0.214286,0.095238,0.017544,0.071429,0.028169,0.061224,0.214286,0.095238
1,0.081633,0.266667,0.125000,0.035088,0.133333,0.055556,0.061224,0.200000,0.093750
2,0.326531,0.571429,0.415584,0.140351,0.242424,0.177778,0.306122,0.535714,0.389610
3,0.163265,0.320000,0.216216,0.035088,0.071429,0.047059,0.142857,0.280000,0.189189
4,0.265306,0.097015,0.142076,0.070175,0.022346,0.033898,0.224490,0.082090,0.120219
...,...,...,...,...,...,...,...,...,...
295,0.642857,0.666667,0.654545,0.559322,0.523810,0.540984,0.607143,0.629630,0.618182
296,0.642857,0.545455,0.590164,0.542373,0.400000,0.460432,0.607143,0.515152,0.557377
297,0.660714,0.649123,0.654867,0.593220,0.538462,0.564516,0.642857,0.631579,0.637168
298,0.285714,0.326531,0.304762,0.135593,0.129032,0.132231,0.285714,0.326531,0.304762


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for every ROUGE column.
rouge_df.describe()

Unnamed: 0,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,0.357817,0.472618,0.378844,0.198613,0.258626,0.206965,0.334597,0.440623,0.353807
std,0.18252,0.194116,0.165977,0.164964,0.174559,0.15355,0.177885,0.189329,0.162965
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.220668,0.357143,0.261625,0.074632,0.138093,0.097809,0.19918,0.32642,0.228032
50%,0.341784,0.485281,0.378762,0.159075,0.230769,0.178671,0.317073,0.44,0.337792
75%,0.470588,0.6,0.479281,0.260995,0.335366,0.286181,0.440656,0.567045,0.451613
max,0.896552,1.0,0.85,0.805556,1.0,0.73913,0.896552,1.0,0.85


In [24]:
# Mean of the 'rouge-2.f' column across all scored records.
rouge_df['rouge-2.f'].mean()

0.20696501983423318