"""
This script evaluates the output of a finetuned Large Language Model (LLM)
using standard classification metrics: Accuracy, F1 Score, Precision, and Recall.

It reads from a `.jsonl` file containing prediction-label pairs in the following format:

{"predict": "label1", "label": "label1"}

Metrics are computed using scikit-learn and support weighted averaging for multi-class outputs.

Typical use case:
- After LLM inference/generation is saved to `generated_predictions.jsonl`
- This script will summarize performance metrics for analysis and reporting
"""


# In [None]:
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load prediction/label pairs from the inference output. Each non-blank line
# is expected to be a JSON object with a "predict" and a "label" field.
predictions = []
labels = []
# JSON text is UTF-8 by specification; be explicit so decoding does not
# depend on the platform's locale default.
with open('generated_predictions.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines (e.g. a stray trailing newline) are not valid JSON
        # documents and would make json.loads raise; skip them.
        if not line.strip():
            continue
        entry = json.loads(line)
        predictions.append(entry['predict'])
        labels.append(entry['label'])

# Fail with a clear message instead of handing zero samples to scikit-learn.
if not labels:
    raise ValueError(
        'generated_predictions.jsonl contains no prediction-label pairs'
    )

# Fraction of predictions that exactly match their label.
accuracy = accuracy_score(labels, predictions)
print(f'Accuracy: {accuracy:.4f}')

# average='weighted' computes each metric per class and averages the results
# weighted by the number of true instances of each class.
f1 = f1_score(labels, predictions, average='weighted')
print(f'F1 Score: {f1:.4f}')

precision = precision_score(labels, predictions, average='weighted')
print(f'precision Score: {precision:.4f}')

recall = recall_score(labels, predictions, average='weighted')
print(f'recall Score: {recall:.4f}')
