In [None]:
# !pip install sentence-transformers evaluate scikit-learn

In [None]:
# !pip install git+https://github.com/google-research/bleurt.git

In [None]:
import logging
import json
import torch
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate import load

In [None]:
# Mount Google Drive so the data files configured below are reachable from
# the Colab runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Setup logger: INFO-level messages are written both to 'evaluation.log' and
# to the console, using one shared message format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Re-running this cell must not attach a second pair of handlers, otherwise
# every message would be emitted multiple times.
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    # File handler first, console handler second.
    for handler in (logging.FileHandler('evaluation.log'), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

In [None]:
# Detect device: MPS is probed first, then CUDA, with CPU as the fallback.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

logger.info("--------  Start with Baseline Predictions  -------------")
logger.info(f'Device selected: {device}')

In [None]:
# === Configuration (stands in for argparse inside the notebook) ===
class Args:
    """Static settings for the evaluation run.

    The attributes are read by later cells through the module-level ``args``
    instance: the two JSON paths are opened when loading data, and the two
    thresholds are compared against the best similarity / BLEURT score of
    each generated question.
    """

    # JSON file with the labelled reference questions.
    input_path: str = '/content/drive/MyDrive/HSG/NLP/Project NLP/Data/sample.json'
    # JSON file with the generated questions to evaluate.
    submission_path: str = '/content/drive/MyDrive/HSG/NLP/Project NLP/Data/results_zeroshot_llama_3.1-8B-instruct-finetuned_formated.json'
    # Recommended 0.6 - 0.75; scores lie in [0, 1].
    sim_threshold: float = 0.6
    # Recommended 0.2 - 0.4; scores lie in [-1, 1].
    bleurt_threshold: float = 0.2


args = Args()

In [None]:
# === Load model ===
# Sentence-embedding model used for the similarity-based matching.
sim_model = SentenceTransformer(model_name_or_path="stsb-mpnet-base-v2")
# BLEURT metric, loaded through the `evaluate` library.
bleurt_model = load(path="bleurt", module_type="metric")

In [None]:
# === Load data ===
# Both files are JSON; the encoding is pinned to UTF-8 so that decoding does
# not depend on the locale of the machine running the notebook.

# Reference data: indexed by instance id in the evaluation cell, where each
# entry is expected to hold a 'cqs' list of {'cq': ..., 'label': ...} items.
with open(args.input_path, encoding='utf-8') as f:
    reference = json.load(f)

# Generated questions to evaluate, indexed by the same instance ids.
with open(args.submission_path, encoding='utf-8') as f:
    new = json.load(f)

In [None]:
# === Evaluate ===
# Every generated critical question (CQ) is compared with the reference CQs of
# the same instance twice: once with the sentence-embedding similarity and once
# with BLEURT. In each case the generated CQ inherits the label of its
# best-scoring reference CQ, provided the score is above the configured
# threshold; otherwise it is marked 'not_able_to_evaluate'.


def _label_from_best_match(scores, reference_cqs, threshold):
    """Pick the best-scoring reference CQ and return its label.

    Args:
        scores: sequence of scores, one per entry of ``reference_cqs``.
        reference_cqs: list of reference entries, each with a 'label' key.
        threshold: the best score must be strictly greater than this value
            for the reference label to be adopted.

    Returns:
        A ``(label, best_score)`` tuple, where ``label`` is the label of the
        best-scoring reference CQ or 'not_able_to_evaluate' when the best
        score does not exceed ``threshold``.
    """
    winner = int(np.argmax(scores))
    best_score = scores[winner]
    if best_score > threshold:
        return reference_cqs[winner]['label'], best_score
    return 'not_able_to_evaluate', best_score


predicted_labels = []
punctuations = []

for instance in new.keys():
    sim_punctuation = 0
    bleurt_punctuation = 0
    reference_cqs = reference[instance]['cqs']
    reference_set = [ref['cq'] for ref in reference_cqs]

    if new[instance]['cqs'] != 'Missing CQs':
        cqs_check = [cq['cq'] for cq in new[instance]['cqs']]
        if len(cqs_check) != len(set(cqs_check)):
            logger.warning('There are repeated CQs in ' + instance)

        # The reference embeddings do not depend on the generated CQ, so they
        # are computed once per instance instead of once per generated CQ.
        sim_reference_embedding = sim_model.encode(reference_set)

        for i, line in enumerate(new[instance]['cqs']):
            # --- Embedding-similarity based label ---
            sim_sentence_embedding = sim_model.encode(line['cq'])
            sims = sim_model.similarity(sim_sentence_embedding, sim_reference_embedding).tolist()[0]
            simlabel, sims_max_score = _label_from_best_match(
                sims, reference_cqs, args.sim_threshold
            )
            # Each 'Useful' CQ contributes one third of the instance score
            # (presumably three CQs are expected per instance -- TODO confirm).
            if simlabel == 'Useful':
                sim_punctuation += 1/3
            new[instance]['cqs'][i]['sim_label'] = simlabel
            new[instance]['cqs'][i]['sim_score'] = sims_max_score

            # --- BLEURT based label ---
            # The generated CQ is repeated so it is scored against every reference.
            bleurt_results = bleurt_model.compute(
                predictions=[line['cq']] * len(reference_set),
                references=reference_set,
            )
            bleurt_label, bleurt_max_score = _label_from_best_match(
                bleurt_results['scores'], reference_cqs, args.bleurt_threshold
            )
            if bleurt_label == 'Useful':
                bleurt_punctuation += 1/3
            new[instance]['cqs'][i]['bleurt_label'] = bleurt_label
            new[instance]['cqs'][i]['bleurt_score'] = bleurt_max_score

            predicted_labels.append((simlabel, bleurt_label))

    else:
        # No CQs were produced for this instance: record three placeholder
        # label pairs so it still shows up in the label distribution.
        predicted_labels.extend([('not_able_to_evaluate', 'not_able_to_evaluate')] * 3)

    new[instance]['sim_score'] = sim_punctuation
    new[instance]['bleurt_score'] = bleurt_punctuation
    punctuations.append((sim_punctuation, bleurt_punctuation))
    logger.info(f'{instance} score (based on similarity): {sim_punctuation:.2f}/1.00 score (based on bleurt): {bleurt_punctuation:.2f}/1.00')

In [None]:
# === Summary ===
logger.info('------ Summary Metrics ------')
logger.info(f'Distribution of labels: {Counter(predicted_labels)}')
logger.info(f'Distribution of punctuation: {Counter(punctuations)}')

# Average the per-instance (similarity, BLEURT) scores; report 0.0 when no
# instance was evaluated so the division cannot fail.
total_sim, total_bleurt = 0.0, 0.0
for sim_score, bleurt_score in punctuations:
    total_sim += sim_score
    total_bleurt += bleurt_score

n_instances = len(punctuations)
avg_sim = total_sim / n_instances if n_instances else 0.0
avg_bleurt = total_bleurt / n_instances if n_instances else 0.0

logger.info(f'Overall sim punctuation: {avg_sim:.4f}')
logger.info(f'Overall bleurt punctuation: {avg_bleurt:.4f}')

# === Save updated results with labels ===
# Write next to the submission file as '<name>_eval.json'. The '.json' suffix
# is only stripped when it is actually present, so a path with a different (or
# no) extension is not truncated by a blind slice.
json_suffix = '.json'
if args.submission_path.endswith(json_suffix):
    output_stem = args.submission_path[:-len(json_suffix)]
else:
    output_stem = args.submission_path
output_path = output_stem + '_eval.json'
with open(output_path, 'w', encoding='utf-8') as o:
    json.dump(new, o, indent=4)
logger.info(f"Saved labeled output to {output_path}")