In [1]:
from openai import OpenAI
import os
import requests
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from rouge import Rouge

  from tqdm.autonotebook import tqdm, trange


In [3]:
# CSV of gpt-4o-mini results; the cells below read its `answer_llm` and
# `answer_orig` columns.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
# The `?raw=1` query string is appended so the link serves the file content.
url = github_url + '?raw=1'
# Only the first 300 rows are kept for the evaluation.
df = pd.read_csv(url).iloc[:300]

In [5]:
# Name of the sentence-embedding model used by every similarity cell below.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

In [11]:
# Embed the LLM answer of the first row and display the first component
# of the resulting vector.
answer_llm = df.iloc[0]['answer_llm']
first_answer_vector = embedding_model.encode(answer_llm)
first_answer_vector[0]

np.float32(-0.4224466)

In [13]:
def get_cosine_sim(llm_ans, ground_truth):
    """Return the dot product of the embeddings of two texts.

    Despite the name, the result is NOT divided by the vector norms, so it
    is an unnormalized similarity score rather than a cosine.

    Args:
        llm_ans: Text encoded with ``embedding_model`` as the first vector.
        ground_truth: Text encoded with ``embedding_model`` as the second
            vector.

    Returns:
        The dot product of the two encoded vectors.
    """
    llm_vec = embedding_model.encode(llm_ans)
    truth_vec = embedding_model.encode(ground_truth)
    return np.dot(llm_vec, truth_vec)


In [15]:
# Score every (LLM answer, original answer) pair with the raw dot-product
# similarity; tqdm needs `total` because zip() has no length.
answer_pairs = zip(df["answer_llm"], df["answer_orig"])
evaluations = [
    get_cosine_sim(llm_ans, orig_ans)
    for llm_ans, orig_ans in tqdm(answer_pairs, total=len(df))
]

100%|██████████| 300/300 [02:03<00:00,  2.43it/s]


In [16]:
# 75th percentile of the dot-product similarity scores.
np.percentile(evaluations, q=75)

np.float32(31.674303)

In [17]:
def get_cosine_sim_norm(answer_llm, answer_orig):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with ``embedding_model``; the dot product of the
    two vectors is then divided by the product of their Euclidean norms.

    Args:
        answer_llm: Text encoded as the first vector.
        answer_orig: Text encoded as the second vector.

    Returns:
        The dot product of the two vectors divided by the product of their
        norms.
    """
    llm_vec = embedding_model.encode(answer_llm)
    orig_vec = embedding_model.encode(answer_orig)
    norm_product = np.linalg.norm(llm_vec) * np.linalg.norm(orig_vec)
    return np.dot(llm_vec, orig_vec) / norm_product

In [18]:
# Re-score every (LLM answer, original answer) pair, this time with the
# norm-divided similarity; tqdm needs `total` because zip() has no length.
answer_pairs = zip(df["answer_llm"], df["answer_orig"])
evaluations = [
    get_cosine_sim_norm(llm_ans, orig_ans)
    for llm_ans, orig_ans in tqdm(answer_pairs, total=len(df))
]

100%|██████████| 300/300 [00:54<00:00,  5.50it/s]


In [19]:
# 75th percentile of the norm-divided similarity scores.
np.percentile(evaluations, q=75)

np.float32(0.8362348)

In [22]:
# Compute ROUGE for a single row (position 10): the LLM answer is the
# hypothesis and the original answer is the reference.
rouge_scorer = Rouge()
r = df.iloc[10]
hypothesis, reference = r['answer_llm'], r['answer_orig']
# get_scores returns a list; only its first element is kept.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]

In [25]:
# Show the ROUGE-1 entry of the scores computed for the selected row.
scores['rouge-1']

{'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}

In [27]:
# Take the `f` value of each ROUGE variant for this row, then display the
# plain average of the three.
rouge_1, rouge_2, rouge_l = (
    scores[variant]['f'] for variant in ('rouge-1', 'rouge-2', 'rouge-l')
)
(rouge_1 + rouge_2 + rouge_l) / 3

0.35490034990035496

In [28]:
# Collect the per-row `f` value of each ROUGE variant across the whole frame.
rouge_1_scores, rouge_2_scores, rouge_l_scores = [], [], []

answer_pairs = zip(df['answer_llm'], df['answer_orig'])
for hypothesis, reference in tqdm(answer_pairs, total=len(df)):
    # get_scores returns a list; only its first element is used.
    scores = rouge_scorer.get_scores(hypothesis, reference)[0]
    for variant, bucket in (
        ('rouge-1', rouge_1_scores),
        ('rouge-2', rouge_2_scores),
        ('rouge-l', rouge_l_scores),
    ):
        bucket.append(scores[variant]['f'])

100%|██████████| 300/300 [00:00<00:00, 323.64it/s]


In [29]:
def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


# Average each ROUGE variant over all rows, then average the three averages.
rouge_1_avg = _mean(rouge_1_scores)
rouge_2_avg = _mean(rouge_2_scores)
rouge_l_avg = _mean(rouge_l_scores)
rouge_avg = (rouge_1_avg + rouge_2_avg + rouge_l_avg) / 3

In [30]:
# Display the mean of the three per-variant ROUGE averages.
rouge_avg

0.3132053673398381