<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/3_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import nltk
import json


nltk.download('framenet_v17')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package framenet_v17 to /root/nltk_data...
[nltk_data]   Unzipping corpora/framenet_v17.zip.


In [2]:
from google.colab import userdata
import os

In [3]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 996, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 996 (delta 72), reused 54 (delta 54), pack-reused 913 (from 1)[K
Receiving objects: 100% (996/996), 46.08 MiB | 15.96 MiB/s, done.
Resolving deltas: 100% (548/548), done.
Updating files: 100% (95/95), done.


In [4]:
os.chdir("NLP2025_CQG")
!ls

1_Information_preprocessing.md	      Doc
1_Preprocessing.ipynb		      Evaluation
2_Baseline_Generation.ipynb	      INFORMATION.md
2_Information_Baseline_Generation.md  LICENSE
3_Evaluation.ipynb		      Logs
4_Finetuned_Generation.ipynb	      README.md
5_Evaluation_Analytics.ipynb	      requirements.txt
Data				      Training
Development			      Utils


In [13]:
result_file = "results_Meta-Llama-3.1-1B-Instruct_SFT_1"

In [14]:
with open(os.path.join(os.getcwd(), f"Evaluation/Results/{result_file}.json"), "r", encoding="utf-8") as f:
    results = json.load(f)

## Schema

In [15]:
from nltk.corpus import framenet as fn

def get_causal_verbs_from_framenet():
    causal_frame_names = [
        "Causation", "Cause_change", "Cause_change_of_position_on_a_scale",
        "Cause_motion", "Cause_to_amalgamate", "Cause_to_start", "Cause_to_make_progress",
        "Causation_scenario", "Cause_to_end", "Cause_to_resume",
        "Cause_to_continue", "Cause_change_of_consistency","Cause_expansion","Cause_impact"
    ]

    causal_verbs = set()
    for frame_name in causal_frame_names:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:  # Only verbs
                    causal_verbs.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Error loading frame '{frame_name}': {e}")

    return causal_verbs


causal_meta_terms = {"generalisation", "implies", "entail", "necessitate", "follow from", "inference"}
alternative_factor_terms = {"factor", "interfere", "influence", "affect", "contribute", "complicate"}


def detect_cause_to_effect(sentence):
    doc = nlp(sentence)
    explanations = []
    score = 0

    causal_verbs = get_causal_verbs_from_framenet()

    has_condition = any(tok.dep_ == "mark" and tok.text.lower() in {"if", "when"} for tok in doc)
    if has_condition:
        explanations.append("✓ Conditional clause detected (e.g., 'if', 'when')")
        score += 3

    has_advcl = any(tok.dep_ == "advcl" for tok in doc)
    if has_advcl:
        explanations.append("✓ Adverbial clause (likely effect clause) detected")
        score += 2

    has_causal_verb_structure = False
    for tok in doc:
        if tok.lemma_ in causal_verbs and tok.pos_ == "VERB":
            subj = any(child.dep_ == "nsubj" for child in tok.children)
            obj = any(child.dep_ == "dobj" for child in tok.children)
            prep = any(child.dep_ == "prep" for child in tok.children)
            if subj or obj or prep:
                has_causal_verb_structure = True
                explanations.append(
                    f"✓ Verb '{tok.lemma_}' is listed in FrameNet under causal frames with subject/object/prep"
                )
                score += 3
                if subj: score += 0.5
                if obj: score += 0.5
                if prep: score += 0.5
                break

    if any(tok.lemma_ in causal_meta_terms for tok in doc):
      explanations.append("✓ Causal generalisation or implication term detected (e.g., 'implies', 'generalisation')")
      score += 1

    if any(tok.lemma_ in alternative_factor_terms for tok in doc):
      explanations.append("✓ Terms indicating alternative causes or interfering factors detected")
      score += 1

    is_causal = has_condition and has_advcl or has_causal_verb_structure
    if not is_causal:
        causal_phrases = ["result in", "lead to", "may cause", "because of", "due to","given rise to","resulting from", "stemming from", "driven by", "caused by", "attributed to", "stems from", "reason", "result of", "consequence of", "owning to", "thus", "so", "therefore", "hence"  "thereby"]
        if any(phrase in sentence.lower() for phrase in causal_phrases):
            explanations.append("✓ Phrase pattern matches known cause-to-effect trigger")
            score += 2

    score = min(score, 10)
    label = "Strong CauseToEffect" if score >= 7 else "Weak/Partial CauseToEffect" if score >= 4 else "Not CauseToEffect"
    return label, score, explanations

In [16]:
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Warning: Could not load frame '{frame_name}': {e}")
    return terms


expert_frames = [
    "Expertise", "Judgment_communication", "Opinion",
    "Authority", "Statement", "Certainty"
]
quote_frames = ["Statement", "Judgment_communication"]
clarity_frames = ["Reasoning"]
evidence_frames = ["Evidence", "Certainty", "Causation"]


expert_verbs = get_lexical_units_from_frames(expert_frames)
quote_verbs = get_lexical_units_from_frames(quote_frames)
clarity_terms = get_lexical_units_from_frames(clarity_frames)
evidence_terms = get_lexical_units_from_frames(evidence_frames)

def detect_expert_opinion(question):

    doc = nlp(question)
    score = 0
    explanations = []

    expert_titles = {"expert", "researcher", "scientist", "doctor", "analyst", "professor", "Dr."}

    implicit_expert_terms = {"study", "research", "evidence", "report", "findings", "scientific", "government", "official", "paper", "survey", "data"}
    comparison_cues = {"consistent", "align", "similar", "agree", "disagree", "corroborate", "conflict"}
    technical_request_verbs = {"define", "explain", "describe", "elaborate", "clarify"}
    assertion_verbs = {"assert", "affirm", "pronounce", "declare", "maintain", "claim", "state"}
    reference_terms = {"quote", "reference", "cite", "check", "verify", "source"}
    domain_terms = {"science", "scientific", "domain", "field", "discipline", "area", "sector"}


    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            if any(title in ent.text.lower() for title in expert_titles):
                explanations.append(f"✓ Expert entity detected: '{ent.text}'")
                score += 3
                break

    if any(tok.lemma_ in expert_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Detected expert-related verb from FrameNet")
        score += 2

    if any(tok.lemma_ in quote_verbs for tok in doc):
        explanations.append("✓ Quotation or claim verb found")
        score += 1

    if any(tok.lemma_ in clarity_terms for tok in doc):
        explanations.append("✓ Clarity/definition markers found")
        score += 1

    if any(tok.lemma_ in evidence_terms for tok in doc):
        explanations.append("✓ Evidence or support-related terms found")
        score += 2

    if any(tok.lemma_.lower() in implicit_expert_terms for tok in doc):
      explanations.append("✓ Implicit expert-related term detected (e.g., 'study', 'government')")
      score += 2

    if any(tok.lemma_.lower() in comparison_cues for tok in doc):
      explanations.append("✓ Cross-study comparison term detected (e.g., 'consistent', 'similar')")
      score += 0.5

    if any(tok.lemma_.lower() in technical_request_verbs for tok in doc):
      explanations.append("✓ Technical explanation request detected (e.g., 'define', 'explain')")
      score += 1

    if any(tok.dep_ == "attr" and tok.lemma_ == "expert" for tok in doc):
      explanations.append("✓ Predicate nominative indicating expertise detected (e.g., 'X is an expert')")
      score += 2

    if any(tok.lemma_.lower() in assertion_verbs for tok in doc):
      explanations.append("✓ Assertion or claim verb detected (e.g., 'assert', 'affirm')")
      score += 1

    if any(tok.lemma_.lower() in reference_terms for tok in doc):
      explanations.append("✓ Source/reference validation term detected (e.g., 'quote', 'reference')")
      score += 1

    if any(tok.lemma_.lower() in domain_terms for tok in doc):
      explanations.append("✓ Domain relevance indicator detected (e.g., 'science', 'domainD')")
      score += 1

    label = "Strong Expert Opinion" if score >= 7 else "Weak/Partial Expert Opinion" if score >= 4 else "Not Expert Opinion"
    return label, score, explanations

In [17]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
analogy_synsets = [wn.synset('similar.a.01'), wn.synset('analogy.n.01'), wn.synset('compare.v.01')]

comparison_frames = ["Similarity"]
contrast_frames = ["Categorization"]
evidence_frames = ["Evidence", "Judgment_communication"]

comparison_verbs = get_lexical_units_from_frames(comparison_frames)
contrast_verbs = get_lexical_units_from_frames(contrast_frames)
evidence_verbs = get_lexical_units_from_frames(evidence_frames)

def is_semantically_analogical(token):
    token_synsets = wn.synsets(token.lemma_)
    for s in token_synsets:
        for analogy_syn in analogy_synsets:
            if s.path_similarity(analogy_syn) and s.path_similarity(analogy_syn) > 0.3:
                return True
    return False

analogy_context_cues = {"respect", "in which", "such that", "with regard to", "in terms of"}

analogy_force_cues = {"undermine", "weaken", "strengthen", "force of similarity", "degree of analogy"}

analogy_nouns = {"analogy", "comparison", "parallel", "similarity", "analogue"}

def detect_analogy_question(question):
    doc = nlp(question)
    score = 0
    explanations = []
    noun_chunks = list(doc.noun_chunks)

    if any(tok.lemma_ in comparison_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Comparison verb detected from FrameNet")
        score += 2.5

    entity_tokens = [tok for tok in doc if tok.pos_ in {"PROPN", "NOUN"}]
    if len(set(tok.lemma_ for tok in entity_tokens)) >= 2:
        explanations.append("✓ Contains at least two distinct concepts/entities")
        score += 1

    if any(tok.lemma_ in contrast_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Contrast or difference verb detected from FrameNet")
        score += 1

    if any(tok.lemma_ in evidence_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Evidence or justification verb found")
        score += 1

    if any(tok.tag_ == "MD" for tok in doc):
        score += 0.5

    if len(noun_chunks) >= 2 and any(tok.lemma_ in {"similar", "like", "as"} for tok in doc):
        explanations.append("✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')")
        score += 3

    if any(tok.text.lower() == "if" for tok in doc):
        explanations.append("✓ Conditional structure suggesting hypothetical reasoning")
        score += 1

    if any(is_semantically_analogical(tok) for tok in doc if tok.pos_ in {"ADJ", "NOUN", "VERB"}):
        explanations.append("✓ Semantic similarity to analogy-related terms detected via WordNet")
        score += 2

    if any(tok.dep_ in {"prep", "relcl"} and tok.lemma_ in {"compare", "similar"} for tok in doc):
        explanations.append("✓ Syntactic cue of analogy (e.g., 'compared with', 'similar to')")
        score += 1

    if any(phrase in question.lower() for phrase in analogy_context_cues):
      explanations.append("✓ Contextual analogy marker detected (e.g., 'in which', 'such that')")
      score += 0.5

    if any(tok.lemma_ in analogy_force_cues for tok in doc):
      explanations.append("✓ Analogy evaluation term detected (e.g., 'undermine', 'strengthen')")
      score += 0.5

    if any(tok.lemma_ in analogy_nouns for tok in doc if tok.pos_ == "NOUN"):
      explanations.append("✓ Explicit analogy noun detected (e.g., 'analogy', 'comparison')")
      score += 2

    if any(tok.dep_ == "neg" for tok in doc):
      if any(tok.lemma_ in {"similar", "compare", "alike", "match"} for tok in doc):
          explanations.append("✓ Negated comparison detected (suggesting analogy breakdown)")
          score += 1

    score = min(score, 10)
    label = "Strong Analogy Question" if score >= 7 else "Weak/Partial Analogy Question" if score >= 4 else "Not Analogy Question"
    return label, score, explanations

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [18]:
def is_fear_related(token):
    syns = wn.synsets(token.lemma_)
    for s in syns:
        if any(s.path_similarity(wn.synset('danger.n.01')) or s.path_similarity(wn.synset('problem.n.01')) or
               s.path_similarity(wn.synset('fear.n.01')) or s.path_similarity(wn.synset('harm.n.01')) or
               s.path_similarity(wn.synset('threat.n.01')) for s in syns):
            return True
    return False


# ---- FrameNet Utility ----
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except:
            continue
    return terms

# ---- Relevant Lexical Resources ----
causal_frames = ["Causation", "Cause_to_start", "Preventing", "Risk", "Threaten", "Danger"]
causal_verbs = get_lexical_units_from_frames(causal_frames)

fear_keywords = {"danger", "threat", "risky", "harm", "catastrophe", "crisis", "ruin", "fear", "worse", "bad", "fatal", "negative", "die", "death"}
preventive_keywords = {"prevent", "avoid", "stop", "ban", "rescue", "save", "protect"}

urgency_keywords = {"immediately", "soon", "before it's too late", "critical", "urgent", "suddenly", "unexpectedly"}

possibility_terms = {"possible", "possibility", "likely", "likelihood", "chance", "probability", "conceivable", "potential", "can", "could", "might", "may", "able"}


def detect_fear_appeal_question(question):
    doc = nlp(question)
    score = 0
    explanations = []

    if any(tok.lemma_.lower() in fear_keywords for tok in doc):
        explanations.append("✓ Fear-related keyword detected (e.g., 'threat', 'danger')")
        score += 3

    if any(tok.lemma_.lower() in preventive_keywords for tok in doc):
        explanations.append("✓ Preventive action verb detected (e.g., 'prevent', 'stop')")
        score += 2

    if any(tok.lemma_ in causal_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Causal/preventive verb from FrameNet detected")
        score += 2

    if any(tok.text.lower() in {"if", "unless"} for tok in doc):
        explanations.append("✓ Conditional clause found (e.g., 'if', 'unless')")
        score += 1

    if any(is_fear_related(tok) for tok in doc if tok.pos_ in {"NOUN", "VERB", "ADJ"}):
        explanations.append("✓ Semantic fear-related concept detected via WordNet")
        score += 2

    if any(phrase in question.lower() for phrase in urgency_keywords):
        explanations.append("✓ Urgency marker detected (e.g., 'immediately', 'before it's too late')")
        score += 1

    if any(tok.lemma_ in possibility_terms for tok in doc):
        explanations.append("✓ Possibility-related term detected (e.g., 'possible', 'feasible', 'chance')")
        score += 1

    score = min(score, 10)
    label = "Strong Fear Appeal" if score >= 7 else "Weak/Partial Fear Appeal" if score >= 4 else "Not Fear Appeal"
    return label, score, explanations

In [19]:
questions = [
    "How strong is the generalisation that if the world exists then we exist?",
    "Are there other factors in this particular case that could have interfered with the event of existance of the world?"
]

for question in questions:
    result = detect_cause_to_effect(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: How strong is the generalisation that if the world exists then we exist?
Result: ('Weak/Partial CauseToEffect', 6, ["✓ Conditional clause detected (e.g., 'if', 'when')", '✓ Adverbial clause (likely effect clause) detected', "✓ Causal generalisation or implication term detected (e.g., 'implies', 'generalisation')"])

Question: Are there other factors in this particular case that could have interfered with the event of existance of the world?
Result: ('Not CauseToEffect', 1, ['✓ Terms indicating alternative causes or interfering factors detected'])



In [20]:
questions = [
    "Is Peter a genuine expert in science?",
    "Did Peter really assert that the world exists?",
    "Is Peter’s pronouncement directly quoted? If not, is a reference to the original source given? Can it be checked?",
    "If Peter’s advice is not quoted, does it look like important information or qualifications may have been left out?",
    "Is what Peter said clear? Are there technical terms used that are not explained clearly?",
    "Is existance of the world relevant to domain science?",
    "Is existance of the world consistent with what other experts in <domainD> say?",
    "Is existance of the world consistent with known evidence in <domainD>?"
]

for question in questions:
    result = detect_expert_opinion(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Is Peter a genuine expert in science?
Result: ('Not Expert Opinion', 3, ["✓ Predicate nominative indicating expertise detected (e.g., 'X is an expert')", "✓ Domain relevance indicator detected (e.g., 'science', 'domainD')"])

Question: Did Peter really assert that the world exists?
Result: ('Weak/Partial Expert Opinion', 4, ['✓ Detected expert-related verb from FrameNet', '✓ Quotation or claim verb found', "✓ Assertion or claim verb detected (e.g., 'assert', 'affirm')"])

Question: Is Peter’s pronouncement directly quoted? If not, is a reference to the original source given? Can it be checked?
Result: ('Not Expert Opinion', 1, ["✓ Source/reference validation term detected (e.g., 'quote', 'reference')"])

Question: If Peter’s advice is not quoted, does it look like important information or qualifications may have been left out?
Result: ('Not Expert Opinion', 3, ['✓ Evidence or support-related terms found', "✓ Source/reference validation term detected (e.g., 'quote', 'reference

In [21]:
questions = [
    "Are frogs and horses similar in the respect cited?",
    "Is the existance of the world true in horses?",
    "Are there differences between horses and frogs that would tend to undermine the force of the similarity cited?",
    "Is there some other case that is also similar to horses, but in which frog is false?"
]

for question in questions:
    result = detect_analogy_question(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Are frogs and horses similar in the respect cited?
Result: ('Strong Analogy Question', 7.5, ['✓ Contains at least two distinct concepts/entities', '✓ Evidence or justification verb found', "✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')", '✓ Semantic similarity to analogy-related terms detected via WordNet', "✓ Contextual analogy marker detected (e.g., 'in which', 'such that')"])

Question: Is the existance of the world true in horses?
Result: ('Not Analogy Question', 3, ['✓ Contains at least two distinct concepts/entities', '✓ Semantic similarity to analogy-related terms detected via WordNet'])

Question: Are there differences between horses and frogs that would tend to undermine the force of the similarity cited?
Result: ('Weak/Partial Analogy Question', 5.0, ['✓ Contains at least two distinct concepts/entities', '✓ Evidence or justification verb found', "✓ Analogy evaluation term detected (e.g., 'undermine', 'strengthen')", "✓ Explicit analogy noun d

In [22]:
questions = [
    "Is the world existing bad? Why and to whom is it bad?",
    "Is the world existing away to prevent people from dying?",
    "Is it practically possible for world existing to happen?",
    "Are there other consequences from the world existing?"
]

for question in questions:
    result = detect_fear_appeal_question(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Is the world existing bad? Why and to whom is it bad?
Result: ('Weak/Partial Fear Appeal', 5, ["✓ Fear-related keyword detected (e.g., 'threat', 'danger')", '✓ Semantic fear-related concept detected via WordNet'])

Question: Is the world existing away to prevent people from dying?
Result: ('Strong Fear Appeal', 7, ["✓ Fear-related keyword detected (e.g., 'threat', 'danger')", "✓ Preventive action verb detected (e.g., 'prevent', 'stop')", '✓ Semantic fear-related concept detected via WordNet'])

Question: Is it practically possible for world existing to happen?
Result: ('Not Fear Appeal', 3, ['✓ Semantic fear-related concept detected via WordNet', "✓ Possibility-related term detected (e.g., 'possible', 'feasible', 'chance')"])

Question: Are there other consequences from the world existing?
Result: ('Not Fear Appeal', 2, ['✓ Semantic fear-related concept detected via WordNet'])



## Evaluation

### Rule evaluation

In [23]:
def check_max_length(question, max_chars=120, max_words=20):
    return len(question) <= max_chars and len(question.split()) <= max_words

In [24]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.9.3-py3-none-any.whl.metadata (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.3-py3-none-any.whl (55 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language-tool-python
Successfully installed language-tool-python-2.9.3


In [25]:
!sudo apt update
!sudo apt install openjdk-17-jdk
!java -version

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Waiting for headers] [1 InRelease 14.2 kB/129 kB 11%] [Connecting to cloud.[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [1 InRelease 129 kB/129 kB 100%] [Connecting to cloud.[0m                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
[33m0% [3 InRelease 28.7 kB/128 kB 22%] [Connected to cloud.r-project.org (65.9.86.[0m[33m0% [Waiting for headers] [Connected to cloud.r-project.org (65.9.86.109)] [Conn[0m                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
[33m0% [4 InRelease 15.6 kB/127 kB 12%] [Connected to cloud.r-project.org (65.9.86.[0m

In [26]:
import language_tool_python
import re

tool = language_tool_python.LanguageTool('en-US')

def check_grammar_tool(question):
    matches = tool.check(question)
    return len(matches) == 0

Downloading LanguageTool latest: 100%|██████████| 252M/252M [00:12<00:00, 19.7MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmppukjce6h.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


In [27]:
def check_no_newlines(question):
    return '\n' not in question

In [28]:
def check_single_question(question):
    return question.count('?') == 1 and not re.search(r"\?\s*(and|or)\s", question.lower())

In [29]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def is_grammatical_question(text, threshold=0.7):
    candidate_labels = ["grammatically correct question", "not a question", "grammatically incorrect question"]
    result = classifier(text, candidate_labels)

    top_label = result['labels'][0]
    top_score = result['scores'][0]

    return top_label == "grammatically correct question" and top_score >= threshold

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
def combined_grammar_check(question):
    try:
        lt_ok = check_grammar_tool(question)
    except Exception as e:
        print("LanguageTool not available or error:", e)
        lt_ok = True

    spacy_ok = check_basic_grammar_syntax(question)

    zero_shot_ok = is_grammatical_question(question, threshold=0.7)

    checks = [lt_ok, spacy_ok, zero_shot_ok]
    return sum(checks) >= 2

In [None]:
def check_no_consecutive_repeated_words(question):
    words = question.lower().split()
    for i in range(len(words) - 1):
        if words[i] == words[i + 1]:
            print(f"Consecutive repetition detected: '{words[i]}'")
            return False
    return True

In [None]:
def validate_question(question):
    return all([
        check_max_length(question),
        check_grammar_tool(question),
        check_no_newlines(question),
        check_single_question(question),
        check_no_consecutive_repeated_words(question),
        combined_grammar_check(question)
    ])


In [None]:
q = "Why sky blue is?"
print(validate_question(q))

### syntax

In [30]:
def classify_schema(question):
    _, cte_score, _ = detect_cause_to_effect(question)

    _, expert_score, _ = detect_expert_opinion(question)

    _, analogy_score, _ = detect_analogy_question(question)

    _, fear_score, _ = detect_fear_appeal_question(question)

    return cte_score, expert_score, analogy_score, fear_score

### semantic

In [5]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=86b3b433197618194d75c4c32c28aeee46613d0f84c1f0a7a45f5b1ec1df0d8b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [6]:
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from rouge_score import rouge_scorer
import numpy as np

In [12]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")

def token_overlap(input_text, question):
    doc1 = nlp(input_text)
    doc2 = nlp(question)
    tokens1 = set([token.lemma_.lower() for token in doc1 if not token.is_stop and token.is_alpha])
    tokens2 = set([token.lemma_.lower() for token in doc2 if not token.is_stop and token.is_alpha])
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    jaccard = len(intersection) / len(union) if union else 0
    return jaccard

def cosine_similarity(input_text, question):
    embeddings = sbert.encode([input_text, question], convert_to_tensor=True)
    score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
    return score

def entailment_score(input_text, question):
    inputs = tokenizer.encode_plus(input_text, question, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1).squeeze()
    entailment_prob = probs[2].item()
    return entailment_prob

def evaluate_context_closeness(input_text, question):
    scores = {
        "token_overlap": token_overlap(input_text, question),
        "cosine_similarity": cosine_similarity(input_text, question),
        "entailment_score": entailment_score(input_text, question),
    }
    scores["average_score"] = np.mean(list(scores.values()))

    verdict = True if (
        scores["token_overlap"] > 0.3 and
        scores["cosine_similarity"] > 0.6 and
        scores["entailment_score"] > 0.7
    ) else False

    return scores, verdict

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



--- Context Closeness Evaluation ---
token_overlap: 0.3333
cosine_similarity: 0.7829
entailment_score: 0.7297
average_score: 0.6153
True


In [None]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

threshold = 0.5

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def rouge_score(context, question):
    scores = scorer.score(context, question)
    return scores['rougeL'].fmeasure

def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    tokens = text.split()
    return [t for t in tokens if t not in ENGLISH_STOP_WORDS]

def word_overlap_score(context, question):
    ctx_tokens = set(clean_and_tokenize(context))
    q_tokens = set(clean_and_tokenize(question))
    if not q_tokens:
        return 0.0
    return len(ctx_tokens & q_tokens) / len(q_tokens)

def semantic_score(context, question):

    embeddings = embedder.encode([context, question], convert_to_tensor=True)
    cos_sim_score = float(util.pytorch_cos_sim(embeddings[0], embeddings[1]).item())

    w_score = word_overlap_score(context, question)
    r_score = rouge_score(context, question)

    hybrid_score = float(0.6 * cos_sim_score + 0.2 * w_score + 0.2 * r_score)
    verdict = True if hybrid_score >= threshold else False

    return cos_sim_score, w_score, r_score, hybrid_score, verdict

### total evaluation

In [31]:
def classify_schema_and_semantics(context, question):
    cte_score, expert_score, analogy_score, fear_score = classify_schema(question)
    return {
        "CauseToEffect": cte_score,
        "ExpertOpinion": expert_score,
        "Analogy": analogy_score,
        "FearAppeal": fear_score,
    }

In [32]:
for key, obj in results.items():
    context_text = obj["input"]
    for cq_entry in obj["cqs"]:
        schema = cq_entry.get("schema")
        question = cq_entry["cq"]

        scores = classify_schema_and_semantics(context_text, question)

        cq_entry["CauseToEffect"] = scores["CauseToEffect"]
        cq_entry["ExpertOpinion"] = scores["ExpertOpinion"]
        cq_entry["Analogy"] = scores["Analogy"]
        cq_entry["FearAppeal"] = scores["FearAppeal"]

        is_critical = False
        if schema and cq_entry.get(schema, 0) >= 7:
            is_critical = True
        cq_entry["is_critical"] = is_critical


## Save and Git

In [33]:
with open(os.path.join(os.getcwd(), f"Evaluation/Scored/{result_file}_eval.json"), "w", encoding="utf-8") as f:
  json.dump(results, f, indent=2, ensure_ascii=False)

In [34]:
!git config --global user.name "Showcas"
!git config --global user.email "cedric.bohni@gmx.de"


commit_message = f"evaluate CQs for {result_file}"
!git add .
!git commit -m "{commit_message}"
!git push

[main f195a5c] evaluate CQs for results_Meta-Llama-3.1-1B-Instruct_SFT_1
 1 file changed, 3720 deletions(-)
To https://github.com/RicoStaedeli/NLP2025_CQG.git
 [31m! [rejected]       [m main -> main (fetch first)
[31merror: failed to push some refs to 'https://github.com/RicoStaedeli/NLP2025_CQG.git'
[m[33mhint: Updates were rejected because the remote contains work that you do[m
[33mhint: not have locally. This is usually caused by another repository pushing[m
[33mhint: to the same ref. You may want to first integrate the remote changes[m
[33mhint: (e.g., 'git pull ...') before pushing again.[m
[33mhint: See the 'Note about fast-forwards' in 'git push --help' for details.[m
