In [None]:
!pip install textstat

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer, util
import spacy
import textstat

# --- Models and NLP tooling used by the metrics below ---
nlp = spacy.load("en_core_web_sm")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Example input: the source passage and the question generated from it ---
source_text = "The French Revolution was a period of far-reaching social and political upheaval in France and its colonies."
generated_question = "How did the French Revolution influence modern political ideologies?"

# 1. Relevance: cosine similarity between the sentence embeddings of the
#    source passage and of the generated question.
source_emb = bert_model.encode(source_text, convert_to_tensor=True)
question_emb = bert_model.encode(generated_question, convert_to_tensor=True)
similarity_matrix = util.cos_sim(source_emb, question_emb)
relevance_score = similarity_matrix.item()

# 2. Clarity
def calculate_perplexity(text):
    """Return the GPT-2 perplexity of ``text`` as a Python float.

    The text is tokenized with ``gpt2_tokenizer`` and passed through
    ``gpt2_model`` with its own input ids as labels; the perplexity is the
    exponential of the loss the model reports for that pass.

    Args:
        text: The string to score.

    Returns:
        ``exp(loss)`` as a float.
    """
    encoded = gpt2_tokenizer(text, return_tensors="pt")
    token_ids = encoded["input_ids"]
    # Scoring only: no gradients are needed.
    with torch.no_grad():
        lm_output = gpt2_model(**encoded, labels=token_ids)
    language_model_loss = lm_output.loss
    return torch.exp(language_model_loss).item()

def grammar_errors(text):
    """Count the tokens of ``text`` whose spaCy dependency label is ``"dep"``.

    This is a simple heuristic for grammar problems, not a real grammar check:
    it only counts tokens that the ``nlp`` pipeline tagged with the ``"dep"``
    relation.

    Args:
        text: The string to analyse.

    Returns:
        The number of tokens labelled ``"dep"``.
    """
    flagged_tokens = [token for token in nlp(text) if token.dep_ == "dep"]
    return len(flagged_tokens)

# Clarity metrics for the generated question.
readability_score = textstat.flesch_reading_ease(generated_question)
grammar_issue_count = grammar_errors(generated_question)
perplexity = calculate_perplexity(generated_question)

# 3. Depth: Semantic distance (proxy for abstraction)
depth_score = 1 - relevance_score

# 4. Insightfulness: Heuristic
# Start from the depth score and add a bonus of 1 when the lower-cased
# question contains any of the abstract keywords as a substring.
abstract_keywords = ["impact", "influence", "significance", "consequences", "assumption", "implication"]
question_lowercase = generated_question.lower()
mentions_abstract_keyword = any(keyword in question_lowercase for keyword in abstract_keywords)
insightfulness_score = depth_score + 1 if mentions_abstract_keyword else depth_score

# Results
report_lines = (
    "\n--- Critical Question Evaluation ---",
    f"Question: {generated_question}\n",
    f"Relevance Score (0–1): {relevance_score:.3f}",
    f"Perplexity (lower = better): {perplexity:.2f}",
    f"Grammar Issue Count (lower = better): {grammar_issue_count}",
    f"Readability Score (Flesch, higher = easier): {readability_score:.2f}",
    f"Depth Score (0–1): {depth_score:.3f}",
    f"Insightfulness Score (0–2): {insightfulness_score:.3f}",
)
for report_line in report_lines:
    print(report_line)

ModuleNotFoundError: No module named 'textstat'

# Evaluation of Baseline predictions
In this notebook, we evaluate the results of our baseline predictions. For the baseline, the following pretrained models are used:
- Llama 3.1-8B-Instruct
- Qwen2.5 7B Instruct

## Libraries and Installation

In [None]:
# !pip install sentence-transformers evaluate scikit-learn

In [None]:
# !pip install git+https://github.com/google-research/bleurt.git

In [None]:
!pip install evaluate

In [None]:
import logging
import os
import json
import torch
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate import load

In [1]:
# Mount Google Drive at /content/drive; the data, result and log paths configured
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Static Variables and Paths

In [None]:
################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

# Name of this evaluation run; it names the log directory and the log file.
EVALUATION_NAME = "Evaluation_Llama_3.1_8B_Instruct"

# A generated CQ only inherits the label of its best-matching reference CQ when
# the best score is strictly above the corresponding threshold.
sim_threshold = 0.6     # recommended 0.6 - 0.75, range [0, 1]
bleurt_threshold = 0.2  # recommended 0.2 - 0.4, range [-1, 1]

# Sentence-transformers checkpoint used for the embedding similarity.
sentence_transformer_model = "stsb-mpnet-base-v2"

################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

base_file_path = '/content/drive/MyDrive/HSG/NLP/Project NLP/Data/sample.json'
generated_cqs_path = '/content/drive/MyDrive/HSG/NLP/Project NLP/Data/results_zeroshot_llama_3.1-8B-instruct-finetuned_formated.json'
evaluation_result_path = ""

log_dir = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Evaluation/{EVALUATION_NAME}/"
os.makedirs(log_dir, exist_ok=True)

# Logger for this notebook, configured by hand to write INFO and above to a file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the file handler only once, so re-running this cell does not add a
# second handler and duplicate every log line.
if len(logger.handlers) == 0:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh = logging.FileHandler(f'{log_dir}{EVALUATION_NAME}.log')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)


In [None]:
# Detect device: prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Record the run configuration at the top of the log.
run_header_messages = (
    f"--------  Start with evaluation for {EVALUATION_NAME}  -------------",
    f'Device selected: {device}',
    f'Similarity threshold: {sim_threshold}',
    f'Bleurt threshold: {bleurt_threshold}',
    f'Sentence transformer model: {sentence_transformer_model}',
)
for header_message in run_header_messages:
    logger.info(header_message)

## Evaluation

In [None]:
# === Load models ===
# Sentence-embedding model for the similarity metric, and the BLEURT metric
# loaded through the `evaluate` library.
bleurt_model = load(path="bleurt", module_type="metric")
sim_model = SentenceTransformer(model_name_or_path=sentence_transformer_model)

In [None]:
# === Load data ===
# `reference` holds the labelled reference CQs and `new` the generated CQs; both
# are indexed by instance id (see the evaluation cell for the fields it reads).
# The encoding is stated explicitly so parsing does not depend on the platform's
# default locale encoding.
with open(base_file_path, encoding="utf-8") as f:
    reference = json.load(f)

with open(generated_cqs_path, encoding="utf-8") as f:
    new = json.load(f)

In [None]:
# === Evaluate ===
predicted_labels = []
punctuations = []


for instance in new.keys():
    sim_punctuation = 0
    bleurt_punctuation = 0
    reference_set = [ref['cq'] for ref in reference[instance]['cqs']]

    if new[instance]['cqs'] != 'Missing CQs':
        cqs_check = [cq['cq'] for cq in new[instance]['cqs']]
        if len(cqs_check) != len(set(cqs_check)):
            logger.warning('There are repeated CQs in ' + instance)

        for i, line in enumerate(new[instance]['cqs']):

            sim_winner = None
            sim_sentence_embedding = sim_model.encode(line['cq'])
            sim_reference_embedding = sim_model.encode(reference_set)
            sims = sim_model.similarity(sim_sentence_embedding, sim_reference_embedding).tolist()[0]

            sim_winner = np.argmax(sims)
            sims_max_score = sims[sim_winner]

            if sims_max_score > sim_threshold:
                simlabel = reference[instance]['cqs'][sim_winner]['label']
                if simlabel == 'Useful':
                    sim_punctuation += 1/3
            else:
                label = 'not_able_to_evaluate'
            new[instance]['cqs'][i]['sim_label'] = simlabel
            new[instance]['cqs'][i]['sim_score'] = sims_max_score


            bleuert_winner = None
            bleurt_results = bleurt_model.compute(predictions=[line['cq']] * len(reference_set), references=reference_set)
            bleurt = bleurt_results['scores']

            bleuert_winner = np.argmax(bleurt)
            bleurt_max_score = bleurt[bleuert_winner]

            if bleurt_max_score > bleurt_threshold:
                bleurt_label = reference[instance]['cqs'][bleuert_winner]['label']
                if bleurt_label == 'Useful':
                    bleurt_punctuation += 1/3
            else:
                bleurt_label = 'not_able_to_evaluate'
            new[instance]['cqs'][i]['bleurt_label'] = bleurt_label
            new[instance]['cqs'][i]['bleurt_score'] = bleurt_max_score

            predicted_labels.append((simlabel, bleurt_label))

    else:
        predicted_labels.extend([('not_able_to_evaluate', 'not_able_to_evaluate'), ('not_able_to_evaluate', 'not_able_to_evaluate'), ('not_able_to_evaluate', 'not_able_to_evaluate')])

    new[instance]['sim_score'] = sim_punctuation
    new[instance]['bleurt_score'] = bleurt_punctuation
    punctuations.append((sim_punctuation, bleurt_punctuation))
    logger.info(f'{instance} score (based on similarity): {sim_punctuation:.2f}/1.00 score (based on bleurt): {bleurt_punctuation:.2f}/1.00')

In [None]:
# === Summary ===
# Log the label and score distributions and the mean per-instance score for
# both metrics.
logger.info('------ Summary Metrics ------')
logger.info(f'Distribution of labels: {Counter(predicted_labels)}')
logger.info(f'Distribution of punctuation: {Counter(punctuations)}')

total_sim = sum((pair[0] for pair in punctuations), 0.0)
total_bleurt = sum((pair[1] for pair in punctuations), 0.0)

n_instances = len(punctuations)
if n_instances:
    avg_sim = total_sim / n_instances
    avg_bleurt = total_bleurt / n_instances
else:
    # Nothing was evaluated; report zero instead of dividing by zero.
    avg_sim = 0.0
    avg_bleurt = 0.0

logger.info(f'Overall sim punctuation: {avg_sim:.4f}')
logger.info(f'Overall bleurt punctuation: {avg_bleurt:.4f}')

# === Save updated results with labels ===
# The output file sits next to the input: the last five characters of the input
# path (".json" for the configured path) are replaced by "_eval.json".
output_path = f'{generated_cqs_path[:-5]}_eval.json'
with open(output_path, 'w') as out_file:
    json.dump(new, out_file, indent=4)
logger.info(f"Saved labeled output to {output_path}")