In [22]:
from tqdm import tqdm
import pandas as pd

In [3]:
# Build the download URL by appending the `raw=1` query parameter to the
# GitHub blob link, then load the CSV straight into a DataFrame.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [4]:
# Keep only the first 300 rows for the evaluation below.
df = df.head(300)

# Q1. Getting the embeddings model

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
# Name of the sentence-transformers model used for every embedding in this notebook.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name_or_path=model_name)

  return torch._C._cuda_getDeviceCount() > 0


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# LLM-generated answer of the first record.
answer_llm = df.iloc[0]['answer_llm']

In [10]:
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [11]:
embedding = embedding_model.encode(answer_llm)

In [13]:
embedding[0]

-0.42244655

The first value of the resulting vector is -0.42.

# Q2. Computing the dot product

In [14]:
evaluations = []

In [15]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [17]:
# One dict per row, keyed by column name.
results_gpt4o_mini = df.to_dict('records')

In [18]:
results_gpt4o_mini[0]

{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
 'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [19]:
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    `record` must provide the keys 'answer_orig' and 'answer_llm'. Both values
    are encoded with the notebook-level `embedding_model`, and the LLM vector
    is multiplied with the original-answer vector via `.dot`.
    """
    answer_orig, answer_llm = record['answer_orig'], record['answer_llm']
    return embedding_model.encode(answer_llm).dot(embedding_model.encode(answer_orig))

In [24]:
# Rebuild the list in a single expression so that re-running this cell
# replaces the scores instead of appending a second copy to an
# already-filled `evaluations` (which would skew the percentile below).
evaluations = [compute_similarity(record) for record in tqdm(results_gpt4o_mini)]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:59<00:00,  5.05it/s]


In [25]:
import numpy as np

In [26]:
np.percentile(evaluations, 75)

31.67430877685547

The 75th percentile of the dot-product similarity is 31.67.

# Q3. Computing the cosine

In [27]:
def calc_norm(v):
    """Scale `v` to unit Euclidean length.

    Despite the name, this returns the normalised vector, not the norm itself.

    Args:
        v: NumPy array to normalise.

    Returns:
        `v` divided by its L2 norm. If the norm is zero, an all-zero array of
        the same shape is returned instead of the NaN values (and
        RuntimeWarning) that dividing by zero would produce.
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero vector has no direction; return zeros in the dtype that
        # `v / norm` would have had so callers see a consistent result type.
        return np.zeros_like(v, dtype=np.result_type(v, norm))
    return v / norm

In [29]:
def compute_cosine_similarity(record):
    """Return the cosine similarity between a record's two answer embeddings.

    `record` must provide the keys 'answer_orig' and 'answer_llm'. Each answer
    is encoded with the notebook-level `embedding_model`, passed through
    `calc_norm`, and the two resulting vectors are combined with `.dot`.
    """
    answer_orig, answer_llm = record['answer_orig'], record['answer_llm']

    unit_llm = calc_norm(embedding_model.encode(answer_llm))
    unit_orig = calc_norm(embedding_model.encode(answer_orig))
    return unit_llm.dot(unit_orig)

In [30]:
cosine_similarity = []

In [31]:
# Rebuild the list in a single expression so that re-running this cell
# replaces the scores instead of appending a second copy to an
# already-filled `cosine_similarity` (which would skew the statistics below).
cosine_similarity = [compute_cosine_similarity(record) for record in tqdm(results_gpt4o_mini)]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:50<00:00,  5.96it/s]


In [32]:
# Smallest and largest cosine similarity, one per line.
print(min(cosine_similarity), max(cosine_similarity), sep='\n')

0.12535673
0.9587959


In [34]:
np.percentile(cosine_similarity, 75)

0.8362348973751068

The 75th percentile of the cosine similarity is 0.836 (0.83 when truncated to two decimals).

# Q4. Rouge

In [35]:
from rouge import Rouge

In [36]:
rouge_scorer = Rouge()

In [39]:
r = results_gpt4o_mini[10]

In [40]:
r['document'] == '5170565b'

True

In [41]:
# ROUGE scores of the LLM answer (hypothesis) against the original answer (reference).
scores = rouge_scorer.get_scores(hyps=r['answer_llm'], refs=r['answer_orig'])[0]

In [42]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

The F score for rouge-1 is 0.45.

# Q5. Average rouge score

In [47]:
# F-score of each ROUGE variant, in the order rouge-1, rouge-2, rouge-l.
list_of_scores = [scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')]

In [48]:
sum(list_of_scores)/len(list_of_scores)

0.35490034990035496

The average F-score is 0.35.

# Q6. Average rouge score for all the data points

In [50]:
# ROUGE scores for every record: LLM answer as hypothesis, original answer as reference.
rouge_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    for record in tqdm(results_gpt4o_mini)
]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 364.75it/s]


In [54]:
# Flatten each nested score dict {metric: {stat: value}} into a single-level
# dict keyed as "<metric>_<stat>", e.g. "rouge-1_f".
flatt_rouge = [
    {
        f"{metric}_{stat}": value
        for metric, stats in data.items()
        for stat, value in stats.items()
    }
    for data in rouge_scores
]

In [56]:
df_rouge = pd.DataFrame(flatt_rouge)

In [59]:
df_rouge.mean()

rouge-1_r    0.357817
rouge-1_p    0.472618
rouge-1_f    0.378844
rouge-2_r    0.198613
rouge-2_p    0.258626
rouge-2_f    0.206965
rouge-l_r    0.334597
rouge-l_p    0.440623
rouge-l_f    0.353807
dtype: float64

The average ROUGE-2 F-score is 0.207 (0.20 when truncated to two decimals).