In [12]:
from collections import defaultdict

import evaluate
import numpy as np
import pandas as pd
import wandb
from tqdm import tqdm

In [6]:
# Prediction CSV (file name inside the output/ directory) -> label used in the W&B run name.
_label_by_csv = {
    "Phi-3-mini-4k-instruct_val.csv": "Phi-3",
    "loraphi_val.csv": "Lora-Phi-3",
    "llama-3-8b-Instruct-bnb-4bit_val.csv": "Llama-3-8b",
    "PRGen-llama-3-8b-Instruct-bnb-4bit-4bit-LoRA_out.csv": "Lora-Llama-3-8b",
}

# Kept as an ordered list of (file name, label) tuples, the shape the evaluation loop unpacks.
paths = list(_label_by_csv.items())

In [11]:
# Load the three metric implementations once, up front; compute_metrics reads
# them as module-level globals. Load order: bertscore, rouge, chrf.
bertscore, rouge, chrf = (evaluate.load(metric_id) for metric_id in ("bertscore", "rouge", "chrf"))


def compute_metrics(predictions, labels):
    """Score generated texts against their references with ROUGE, BERTScore and chrF.

    Args:
        predictions: Generated texts, one per example.
        labels: Reference texts, aligned index-by-index with ``predictions``.

    Returns:
        A single flat dict merging, in this order:
        the ROUGE result, the mean of every per-example BERTScore list
        (keys prefixed with ``bert_``), and the chrF result. On a key
        collision the later source wins.
    """
    rouge_score = rouge.compute(predictions=predictions, references=labels)
    # BERTScore is given one single-element reference list per prediction.
    bert_score = bertscore.compute(
        predictions=predictions,
        references=[[reference] for reference in labels],
        lang="en",
    )
    # word_order=2 adds word bigrams to the character n-grams (the chrF++ variant).
    chrf_score = chrf.compute(predictions=predictions, references=labels, word_order=2)

    bert_score_result = {}
    for field, per_example in bert_score.items():
        # "hashcode" is a configuration string, not a list of scores.
        if field == "hashcode":
            continue
        scores = list(per_example)
        # A field without any scores contributes no entry.
        if scores:
            bert_score_result["bert_" + field.split("/")[-1]] = np.mean(scores)

    return {**rouge_score, **bert_score_result, **chrf_score}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [None]:
# Evaluate every prediction file and log its metrics as a separate W&B run.
for path, exp_name in tqdm(paths):
    df = pd.read_csv("output/" + path)
    # pandas reads an empty CSV cell as NaN (a float); the metrics expect
    # strings, so an empty generation is scored as an empty string.
    predictions = df["generated_val"].fillna("").astype(str).tolist()
    labels = df["target"].tolist()

    # Using the run as a context manager guarantees it is finished even when
    # metric computation raises, so a failed model does not leave a dangling run.
    with wandb.init(
        project="PRGen",
        name=f"Evaluate {exp_name}",
        # Run hyperparameters belong in `config`; `params` is not an accepted
        # keyword of wandb.init and raised a TypeError.
        # NOTE(review): presumably the generation settings used to produce the
        # CSVs — confirm they match the inference runs.
        config={
            "max_new_tokens": 1024,
            "temperature": 0.1,
            "top_p": 0.9,
        },
    ) as run:
        metrics = compute_metrics(predictions, labels)
        run.log(metrics)