In [61]:
from llmFunctions import *
from pandarallel import pandarallel
import pandas as pd

# Initialize pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
import pandas as pd
# It has been done with the other methods too (RL, Random, BM25 ,Faiss, etc.)
df = pd.read_parquet("df_with_predicted_responses_faiss.parquet")

In [None]:

# Use parallel_apply instead of progress_apply
df["llm_response_faiss"] = df.parallel_apply(
    lambda row: clean_response(row["llm_response_faiss"]),
    axis=1
)

# Ensure all data is string-typed
df = df.astype(str)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24), Label(value='0 / 24'))), HBox…

In [None]:
# ROUGE BLEU BERT COSINE

In [31]:
from llmFunctions import *

import nltk
import pandas as pd
import spacy
import bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from scipy.spatial.distance import cosine

# Download required NLTK resources
nltk.download('punkt')

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# ------------------ Helper Functions ------------------ #

def preprocess(df, generated_response_column, gold_column):
    """Lowercase and extract text from response."""
    df['llm_response_text'] = df[generated_response_column]
    df['clean_answer'] = df[gold_column].str.lower()
    df['llm_response_text'] = df['llm_response_text'].str.lower()
    return df

def compute_bleu(reference, hypothesis):
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

def compute_rouge_l(reference, hypothesis):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return scores['rougeL'].fmeasure

def compute_spacy_cosine_similarity(reference, hypothesis):
    doc1 = nlp(reference)
    doc2 = nlp(hypothesis)
    if doc1.vector_norm and doc2.vector_norm:
        return 1 - cosine(doc1.vector, doc2.vector)
    return 0.0

def compute_metrics(df):
    df['BLEU-4'] = df.apply(lambda row: compute_bleu(row['clean_answer'], row['llm_response_text']), axis=1)
    df['ROUGE-L F1'] = df.apply(lambda row: compute_rouge_l(row['clean_answer'], row['llm_response_text']), axis=1)
    df['Cosine Sim'] = df.apply(lambda row: compute_spacy_cosine_similarity(row['clean_answer'], row['llm_response_text']), axis=1)
    
    # Compute BERTScore
    P, R, F1 = bert_score.score(df['llm_response_text'].tolist(), df['clean_answer'].tolist(), lang="en", verbose=True)
    df['BERTScore F1'] = F1.numpy()
    
    return df

def display_final_table(df):
    print("\nFinal Table with Metrics:")
    print(df[['answer', 'llm_response_text','ROUGE-L F1', 'BLEU-4',  'BERTScore F1', 'Cosine Sim']].round(4))

def group_by_domain(df):
    domain_means = df.groupby("domain")[['ROUGE-L F1', 'BLEU-4',  'BERTScore F1', 'Cosine Sim']].mean()
    print("\nMédia das métricas por domínio:")
    print(domain_means.round(4))

def group_by_domain_and_algo(df):
    grouped = df.groupby(["domain", "algo"])[['ROUGE-L F1', 'BLEU-4',  'BERTScore F1', 'Cosine Sim']].mean()
    print("\nMédia das métricas por domínio e algoritmo:")
    print(grouped.round(4))

# ------------------ Main Execution ------------------ #

def evaluate_llm_responses(df, generated_response_column='llm_response', gold_column='answer'):
    df = preprocess(df, generated_response_column, gold_column)
    df = compute_metrics(df)
    display_final_table(df)
    group_by_domain(df)
    group_by_domain_and_algo(df)
    return df

# ------------------ Usage ------------------ #

# Example:
# df = pd.read_csv("your_file.csv")
# df = evaluate_llm_responses(df, generated_response_column='llm_response', gold_column='answer')


[nltk_data] Downloading package punkt to /home/rafael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
df.columns

Index(['row_idx', 'algo', 'actions', 'chunks_selected', 'total_reward',
       'steps', 'interaction_id', 'domain', 'question_type',
       'static_or_dynamic', 'query', 'answer', 'page_results_text',
       'llm_response', 'llm_response_text', 'BLEU-4', 'ROUGE-L F1',
       'Cosine Sim', 'BERTScore F1', 'faiss_chunks_selected',
       'llm_response_faiss'],
      dtype='object')

In [None]:
eval_df = evaluate_llm_responses(df, "llm_response_faiss", "clean_answer")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/9 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/12 [00:00<?, ?it/s]

done in 11.44 seconds, 65.75 sentences/sec

Final Table with Metrics:
                                                answer  \
0    the total value of all etfs in the global mark...   
1    the total value of all etfs in the global mark...   
2    the total value of all etfs in the global mark...   
3    the total value of all etfs in the global mark...   
4    the company with the highest ratio of insider ...   
..                                                 ...   
322  alex essoe, amanda fuller, fabianne therese, n...   
328                                                 44   
329                                                 44   
330                                                 44   
331                                                 44   

                                     llm_response_text  ROUGE-L F1  BLEU-4  \
0    the provided text appears to be instructions f...      0.0000  0.0000   
1    based on the provided information, there is no...      0.0000  0.0000   



In [1]:
# Ragas Metrics

In [None]:
import os
import ast
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from typing import Union, List
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_precision,
    context_recall,
)

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "api_key_here"

# Initialize LLM and Embeddings wrappers
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

def safe_eval(x):
    """Safely parse stringified lists."""
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except Exception:
        return [str(x)]
    
    

def evaluate_ragas(
    df: pd.DataFrame,
    answer_col: str,
    generated_answer_col: str,
    context_col: str,
    query_col: str,
    groupby_col: Union[str, List[str]] = None,  # compatible with Python <3.10
) -> dict:
    def _prepare_and_evaluate(group_df):
        group_df = group_df[[answer_col, generated_answer_col, context_col, query_col]].copy()
        group_df[context_col] = group_df[context_col].apply(safe_eval)

        df_ragas = group_df.rename(columns={
            answer_col: "answer",
            generated_answer_col: "generated_answer",
            context_col: "retrieved_contexts",
            query_col: "user_input"
        })
        df_ragas["reference"] = df_ragas["answer"]

        dataset = Dataset.from_pandas(df_ragas)

        return evaluate(
            dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                answer_correctness,
                context_precision,
                context_recall,
            ],
            llm=evaluator_llm,
            embeddings=evaluator_embeddings,
        )

    results = {}

    if groupby_col:
        if isinstance(groupby_col, str):
            groupby_col = [groupby_col]
        for group_keys, group_df in df.groupby(groupby_col):
            group_name = "__".join(str(k) for k in group_keys) if isinstance(group_keys, tuple) else str(group_keys)
            try:
                results[group_name] = _prepare_and_evaluate(group_df)
            except Exception as e:
                results[group_name] = {"error": str(e)}
    else:
        results["overall"] = _prepare_and_evaluate(df)

    return results


In [None]:

results = evaluate_ragas(
    df,
    answer_col="clean_answer",
    generated_answer_col="llm_response_faiss",
    context_col="chunks_selected_faiss",
    query_col="query",
    groupby_col=["algo", "domain"]
)


In [6]:
results

{'ddpg__finance': {'faithfulness': 0.2656, 'answer_relevancy': 0.7935, 'answer_correctness': 0.9406, 'context_precision': 0.1547, 'context_recall': 0.5114},
 'ddpg__movie': {'faithfulness': 0.3077, 'answer_relevancy': 0.7815, 'answer_correctness': 0.9101, 'context_precision': 0.1459, 'context_recall': 0.4460},
 'ddpg__music': {'faithfulness': 0.4775, 'answer_relevancy': 0.8324, 'answer_correctness': 0.9284, 'context_precision': 0.2112, 'context_recall': 0.5673},
 'ddpg__open': {'faithfulness': 0.6423, 'answer_relevancy': 0.8142, 'answer_correctness': 0.9787, 'context_precision': 0.3391, 'context_recall': 0.7092},
 'ddpg__sports': {'faithfulness': 0.3611, 'answer_relevancy': 0.7699, 'answer_correctness': 0.9687, 'context_precision': 0.2143, 'context_recall': 0.6445},
 'ppo__finance': {'faithfulness': 0.3083, 'answer_relevancy': 0.7919, 'answer_correctness': 0.8970, 'context_precision': 0.2378, 'context_recall': 0.4523},
 'ppo__movie': {'faithfulness': 0.3171, 'answer_relevancy': 0.7832,