In [1]:
import pandas as pd

# Evaluation results published in the course repository. The link points at the
# GitHub "blob" page, so ?raw=1 is appended to fetch the CSV contents themselves.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f"{github_url}?raw=1"
df = pd.read_csv(url)

In [3]:
# Keep only the first 300 rows, as an independent copy. Without .copy() pandas
# tracks the result as a slice of the original frame and emits
# SettingWithCopyWarning when new columns are assigned to it.
df = df.iloc[:300].copy()

In [17]:
# One plain dict per row (column name -> value) for record-by-record scoring.
records = df.to_dict("records")
len(records)

300

### 1

In [4]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding model used for all encodings.
model_name = "multi-qa-mpnet-base-dot-v1"
# Building the model downloads its files when they are not available locally.
model = SentenceTransformer(model_name_or_path=model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Embed the LLM-generated answer of the first row.
answer_llm = df["answer_llm"].iloc[0]
vector = model.encode(answer_llm)

In [7]:
# Show the first component of the embedding together with its shape.
vector[0], vector.shape

(-0.42244658, (768,))

### 2

In [8]:
def compute_similarity(record):
    """Return the dot product between the embeddings of a record's two answers.

    Args:
        record: Mapping with the keys "answer_orig" and "answer_llm".

    Returns:
        The dot product of the two encoded answers. The embeddings are used
        as produced by ``model.encode``; no normalization is applied here.
    """
    reference_text, generated_text = record["answer_orig"], record["answer_llm"]

    generated_vec = model.encode(generated_text)
    reference_vec = model.encode(reference_text)

    return generated_vec.dot(reference_vec)

In [13]:
# Raw dot-product similarity for every record, in record order.
evaluations = [compute_similarity(rec) for rec in records]

In [None]:
# Store the raw dot-product scores next to the answers they were computed from.
df["similarity"] = evaluations

In [52]:
# 75th percentile of the raw dot-product similarity scores.
df["similarity"].describe()["75%"]

31.67430353164673

### 3

In [19]:
import numpy as np


def norm(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: NumPy array or any array-like of numbers (list, tuple, ...).

    Returns:
        A NumPy array with the same shape as ``v``, divided by its L2 norm.
        An all-zero input has no direction, so the division yields NaNs
        (standard NumPy behavior) rather than raising.
    """
    # Coerce so plain sequences work too; an ndarray passes through unchanged,
    # keeping its dtype.
    arr = np.asarray(v)
    # Named `length` rather than `norm` so the local does not shadow this function.
    length = np.sqrt((arr * arr).sum())
    return arr / length

In [23]:
def compute_similarity_norm(record):
    """Return the cosine similarity between a record's two answers.

    Both answers are encoded with ``model`` and each embedding is rescaled
    with ``norm`` before the dot product is taken.

    Args:
        record: Mapping with the keys "answer_orig" and "answer_llm".

    Returns:
        The dot product of the two normalized embeddings.
    """
    reference_text, generated_text = record["answer_orig"], record["answer_llm"]

    generated_unit = norm(model.encode(generated_text))
    reference_unit = norm(model.encode(reference_text))

    return generated_unit.dot(reference_unit)

In [25]:
from tqdm import tqdm

# Normalized (cosine) similarity for every record, with a progress bar.
evaluations_normalized = [compute_similarity_norm(rec) for rec in tqdm(records)]

100%|██████████| 300/300 [00:12<00:00, 23.78it/s]


In [26]:
# Store the normalized similarity scores as their own column.
df["similarity_norm"] = evaluations_normalized

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["similarity_norm"] = evaluations_normalized


In [51]:
# 75th percentile of the normalized similarity scores.
df["similarity_norm"].describe()["75%"]

0.8362347185611725

### 4

In [29]:
# Requires the third-party `rouge` package; install it once with: %pip install rouge

In [33]:
# Single example (the record at index 10) used to try out ROUGE scoring.
test_record = records[10]

In [35]:
from rouge import Rouge

rouge_scorer = Rouge()

# Score the generated answer (hypothesis) against the original answer (reference).
hypothesis, reference = test_record["answer_llm"], test_record["answer_orig"]
# get_scores returns a list; the first entry is the result for this pair.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [37]:
# F-score of the rouge-1 entry for the example record.
scores["rouge-1"]["f"]

0.45454544954545456

### 5

In [42]:
# Average of the F-scores over every ROUGE variant present in `scores`.
f_score_list = [metric["f"] for metric in scores.values()]
f_score_average = sum(f_score_list) / len(f_score_list)
f_score_average

0.35490034990035496

### 6

In [45]:
from tqdm import tqdm

# F-scores of every record, one dict per record keyed by ROUGE variant.
# The per-record result gets its own name (`rec_scores`) so that `scores`,
# which holds the single-example result, is not overwritten by this loop.
f_list = []

for rec in tqdm(records):
    rec_scores = rouge_scorer.get_scores(rec["answer_llm"], rec["answer_orig"])[0]
    f_list.append({key: value["f"] for key, value in rec_scores.items()})

100%|██████████| 300/300 [00:00<00:00, 693.53it/s]


In [47]:
# One row per record, one column per ROUGE variant (the keys of each dict in f_list).
df_f_score = pd.DataFrame(f_list)

In [50]:
# Mean rouge-2 F-score across all records.
df_f_score["rouge-2"].mean()

0.20696501983423318