In [29]:
# !pip install evaluate bert-score rouge-score mlflow pandas google-generativeai python-dotenv

In [30]:
import os
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

import google.generativeai as genai
genai.configure(api_key=GEMINI_API_KEY)

import evaluate
import bert_score
import mlflow


In [31]:
GT_DIR = Path("../data/ground_truth")
GEN_DIR = Path("../data/generated_data")

files = [f.stem for f in GT_DIR.glob("*.json") if (GEN_DIR / f"{f.stem}.json").exists()]

references = []
predictions = []
file_ids = []

for file_id in files:
    with open(GT_DIR / f"{file_id}.json") as f:
        references.append(json.load(f)["abstract"])

    with open(GEN_DIR / f"{file_id}.json") as f:
        predictions.append(json.load(f)["summary"])

    file_ids.append(file_id)


In [32]:
rouge = evaluate.load("rouge")
bert = evaluate.load("bertscore")

rouge_results = rouge.compute(predictions=predictions, references=references)
bert_results = bert.compute(predictions=predictions, references=references, lang="en")

print("ROUGE:", rouge_results)
print("BERTScore:", bert_results["f1"][:5])  


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE: {'rouge1': np.float64(0.24064511890684534), 'rouge2': np.float64(0.051802790301772865), 'rougeL': np.float64(0.14304296413086323), 'rougeLsum': np.float64(0.14304296413086323)}
BERTScore: [0.8158199787139893, 0.8339284062385559]


In [38]:
import json

def gemini_llm_score(reference, prediction):
    prompt = f"""
You are an evaluation assistant. Compare the generated summary to the reference abstract.

Return a JSON object with:
- similarity_score: an integer from 1 to 10. 1 being the lowest similarity and 10 being the highest between the generated summary and the reference abstract.
- correct: true or false
- reason: a short explanation on why you decided on the similarity score and how you determined if the summary is correct or not.

Important: respond in valid JSON only. Do not include markdown or explanation outside the JSON.

Abstract:
{reference}

Generated Summary:
{prediction}
"""

    model = genai.GenerativeModel("models/gemini-1.5-flash-latest")
    
    try:
        response = model.generate_content(prompt)
        reply = response.text.strip()
        json_start = reply.find('{')
        json_end = reply.rfind('}') + 1
        json_str = reply[json_start:json_end]

        parsed = json.loads(json_str)
        return parsed.get("similarity_score"), parsed.get("correct"), parsed.get("reason")
    
    except Exception as e:
        print(f"LLM eval error: {e}")
        return None, None, f"LLM error: {e}"


In [42]:
results = []

for i in tqdm(range(len(predictions))):
    pred = predictions[i]
    ref = references[i]
    file_id = file_ids[i]

    sim_score, correct, reason = gemini_llm_score(ref, pred)

    result = {
        "file": file_id,
        "rouge1": rouge_results["rouge1"],
        "rougeL": rouge_results["rougeL"],
        "bert_f1": bert_results["f1"][i],
        "llm_similarity": sim_score,
        "llm_correct": correct,
        "llm_reason": reason
    }
    results.append(result)

df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
df.head()

# Daten im csv speichern - dont know if this is needed
# output_csv_path = Path("../data/evaluation_results.csv") 
# df.to_csv(output_csv_path, index=False)

100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


Unnamed: 0,file,rouge1,rougeL,bert_f1,llm_similarity,llm_correct,llm_reason
0,Does_the_categorization_difficulty_elicit_the_uncanny-valley-like_phenomenon_without_animacy,0.240645,0.143043,0.81582,9,True,"The generated summary accurately captures the main points of the abstract. It correctly identifies the study's aim to investigate the uncanny valley effect using non-animate objects, the methodology involving morphing shapes and measuring categorization time and likeability, and the key finding that categorization difficulty, not processing fluency, is the primary driver of the effect. The summary concisely summarizes the experiments and their outcomes, aligning well with the abstract's core message. The slight difference in the level of detail doesn't detract significantly from the overall accuracy and similarity."
1,VerilogReader_LLM-Aided_Hardware_Test_Generation,0.240645,0.143043,0.833928,7,True,"The generated summary accurately captures the main points of the abstract: using LLMs for coverage-directed test generation, the improvement over random testing, and limitations with complex designs. However, it introduces additional details (e.g., 'Coverage Explainer', 'DUT Explainer', JSON format) not explicitly mentioned in the abstract, and it omits the mention of prompt engineering optimization. The score reflects the overall accuracy and completeness but considers the lack of perfect alignment with the source abstract."


In [43]:
import matplotlib.pyplot as plt

with mlflow.start_run():
    for row in results:
        file = row["file"]
        mlflow.log_metric(f"{file}_rouge1", row["rouge1"])
        mlflow.log_metric(f"{file}_rougeL", row["rougeL"])
        mlflow.log_metric(f"{file}_bert_f1", row["bert_f1"])

        if row["llm_similarity"] is not None:
            mlflow.log_metric(f"{file}_llm_similarity", row["llm_similarity"])

        if row["llm_correct"] is not None:
            mlflow.set_tag(f"{file}_llm_correct", row["llm_correct"])

        mlflow.set_tag(f"{file}_llm_reason", row["llm_reason"])

mlflow.end_run()

In [None]:
# Jetzt zum directory /notebooks wechseln
# und dann  "mlflow ui" im Terminal ausführen (it only runs locally)
# und im Browser auf http://localhost:5000 gehen, um die Metriken zu sehen. 