In [None]:
import pandas as pd

In [None]:
# GitHub page of the results CSV. The "?raw=1" query string is appended so the
# request is expected to resolve to the file contents rather than the HTML page.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f'{github_url}?raw=1'

# Download and parse the CSV directly from the URL.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Work on the first 300 records only. Take an explicit copy so that columns
# added to df_new later are written to an independent frame rather than to a
# slice of df (which is what triggers pandas' SettingWithCopyWarning).
df_new = df.iloc[:300].copy()

In [None]:
# Confirm the size of the subset as (rows, columns).
df_new.shape

In [None]:
# Preview the first rows to see which columns the dataset contains.
df_new.head()

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding model to load.
model_name = "multi-qa-mpnet-base-dot-v1"

# Model instance used for every encode() call in this notebook.
embedding_model = SentenceTransformer(model_name)

In [None]:
# LLM-generated answer of the first record, used as a sample input below.
answer_llm = df_new["answer_llm"].iloc[0]

In [None]:
# Encode the sample answer and display the first element of the encoded output.
embedding_model.encode(answer_llm)[0]

In [None]:
# tqdm displays a progress bar for loops and iterators
from tqdm.auto import tqdm

In [None]:
# Dot-product similarity between the embedding of each record's LLM answer
# and the embedding of its original answer, one value per row of df_new.
evaluation = []

# iterrows() is a generator with no length, so pass total= explicitly to let
# tqdm show a percentage and ETA.
for index, record in tqdm(df_new.iterrows(), total=len(df_new)):
    answer_org = record['answer_orig']
    answer_llm = record['answer_llm']

    # llm holds the LLM answer's vector, orig the original answer's vector.
    llm = embedding_model.encode(answer_llm)
    orig = embedding_model.encode(answer_org)

    dot_product = llm.dot(orig)
    evaluation.append(dot_product)

In [None]:
# Attach the dot-product scores as a "score" column. assign() builds a new
# frame instead of writing a column into what may be a slice of df, which
# avoids pandas' SettingWithCopyWarning; re-running the cell is idempotent.
df_new = df_new.assign(score=evaluation)

In [None]:
# Summary statistics of the dot-product scores across the evaluated records.
df_new["score"].describe()

In [None]:
# NumPy provides the vector norm and dot-product operations used below
import numpy as np

In [None]:
def cosine_similarity(df, embedding_model):
    """Compute the cosine similarity between the two answers of every record.

    Args:
        df: DataFrame with an 'answer_orig' and an 'answer_llm' column.
        embedding_model: Object whose ``encode(text)`` returns the embedding
            of ``text`` as a NumPy vector (assumed 1-D; confirm for models
            other than the one used in this notebook).

    Returns:
        A list with one similarity value per row of ``df``, in row order.
        When either embedding has zero norm the cosine is undefined; 0 is
        reported for that row instead of dividing by zero and producing NaN.
    """
    similarity = []

    for index, record in tqdm(df.iterrows(), total=df.shape[0]):
        answer_org = record['answer_orig']
        answer_llm = record['answer_llm']

        # Encode the answers to get the vectors
        llm = embedding_model.encode(answer_llm)
        orig = embedding_model.encode(answer_org)

        # cos(a, b) = a.b / (|a| * |b|)
        dot_product = np.dot(llm, orig)
        denominator = np.linalg.norm(llm) * np.linalg.norm(orig)

        if denominator == 0:
            # Keep the same scalar type as the regular results so the list
            # stays homogeneous.
            similarity.append(type(dot_product)(0))
        else:
            similarity.append(dot_product / denominator)

    return similarity

In [None]:
# Attach the cosine similarities as a "cosine" column. assign() builds a new
# frame instead of writing a column into what may be a slice of df, which
# avoids pandas' SettingWithCopyWarning; re-running the cell is idempotent.
df_new = df_new.assign(cosine=cosine_similarity(df_new, embedding_model))

In [None]:
# Summary statistics of the cosine similarities across the evaluated records.
df_new["cosine"].describe()

In [None]:
# ROUGE scorer used below to compare each LLM answer with the original answer.
from rouge import Rouge
rouge_scorer = Rouge()

In [None]:
# Record at position 10, used as a single worked example for ROUGE scoring.
r = df_new.iloc[10]

In [None]:

# Score the example: the LLM answer is passed first, the original answer second.
# get_scores returns a sequence; [0] takes its first entry, which is indexed
# below by metric name ('rouge-1', 'rouge-2', 'rouge-l').
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [None]:
# Display the 'f' entry (F-score) of the ROUGE-1 result for the example record.
scores['rouge-1']['f']

In [None]:
# Pull the F-score out of each ROUGE variant computed for the example record.
rouge_1_f1, rouge_2_f1, rouge_l_f1 = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

In [None]:
# Mean of the three ROUGE F-scores of the example record.
f1_values = (rouge_1_f1, rouge_2_f1, rouge_l_f1)
average_f1 = (f1_values[0] + f1_values[1] + f1_values[2]) / len(f1_values)
average_f1

In [None]:
# Compute ROUGE-2 F-scores for all records.
# - total= is taken from df_new, the frame actually being iterated, so the
#   progress bar's total matches the number of rows processed.
# - The comprehension keeps its loop variables local, so the example `scores`
#   and `rouge_2_f1` values computed in earlier cells are not overwritten.
rouge_2_f1_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]['rouge-2']['f']
    for _, record in tqdm(df_new.iterrows(), total=df_new.shape[0])
]

# Compute the average ROUGE-2 F-score
average_rouge_2_f1 = sum(rouge_2_f1_scores) / len(rouge_2_f1_scores)

In [None]:
# Report the mean ROUGE-2 F1 over the scored records.
print(f"Average ROUGE-2 F1 Score across all records: {average_rouge_2_f1}")