<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/3_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import nltk
import json


nltk.download('framenet_v17')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package framenet_v17 to /root/nltk_data...
[nltk_data]   Unzipping corpora/framenet_v17.zip.


In [2]:
from google.colab import userdata
import os

In [3]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1137, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 1137 (delta 158), reused 112 (delta 80), pack-reused 913 (from 1)[K
Receiving objects: 100% (1137/1137), 48.16 MiB | 18.74 MiB/s, done.
Resolving deltas: 100% (634/634), done.
Updating files: 100% (107/107), done.


In [4]:
os.chdir("NLP2025_CQG")
!ls

1_a_Generate_DPO_Dataset.ipynb	      Development
1_Information_preprocessing.md	      Doc
1_Preprocessing.ipynb		      Evaluation
2_Baseline_Generation.ipynb	      INFORMATION.md
2_Information_Baseline_Generation.md  LICENSE
3_Evaluation.ipynb		      Logs
3_Training_1_SFT_3.ipynb	      README.md
4_Finetuned_Generation.ipynb	      requirements.txt
5_Evaluation_Analytics.ipynb	      Training
Data				      Utils


In [7]:
result_file = "results_schema_Baseline_Meta-Llama-3.1-8B-Instruct-bnb-4bit"

In [8]:
with open(os.path.join(os.getcwd(), f"Evaluation/Results/{result_file}.json"), "r", encoding="utf-8") as f:
    results = json.load(f)

## Schema

In [9]:
from nltk.corpus import framenet as fn

def get_causal_verbs_from_framenet():
    causal_frame_names = [
        "Causation", "Cause_change", "Cause_change_of_position_on_a_scale",
        "Cause_motion", "Cause_to_amalgamate", "Cause_to_start", "Cause_to_make_progress",
        "Causation_scenario", "Cause_to_end", "Cause_to_resume",
        "Cause_to_continue", "Cause_change_of_consistency","Cause_expansion","Cause_impact"
    ]

    causal_verbs = set()
    for frame_name in causal_frame_names:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:  # Only verbs
                    causal_verbs.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Error loading frame '{frame_name}': {e}")

    return causal_verbs


causal_meta_terms = {"generalisation", "implies", "entail", "necessitate", "follow from", "inference"}
alternative_factor_terms = {"factor", "interfere", "influence", "affect", "contribute", "complicate"}


def detect_cause_to_effect(sentence):
    doc = nlp(sentence)
    explanations = []
    score = 0

    causal_verbs = get_causal_verbs_from_framenet()

    has_condition = any(tok.dep_ == "mark" and tok.text.lower() in {"if", "when"} for tok in doc)
    if has_condition:
        explanations.append("✓ Conditional clause detected (e.g., 'if', 'when')")
        score += 3

    has_advcl = any(tok.dep_ == "advcl" for tok in doc)
    if has_advcl:
        explanations.append("✓ Adverbial clause (likely effect clause) detected")
        score += 2

    has_causal_verb_structure = False
    for tok in doc:
        if tok.lemma_ in causal_verbs and tok.pos_ == "VERB":
            subj = any(child.dep_ == "nsubj" for child in tok.children)
            obj = any(child.dep_ == "dobj" for child in tok.children)
            prep = any(child.dep_ == "prep" for child in tok.children)
            if subj or obj or prep:
                has_causal_verb_structure = True
                explanations.append(
                    f"✓ Verb '{tok.lemma_}' is listed in FrameNet under causal frames with subject/object/prep"
                )
                score += 3
                if subj: score += 0.5
                if obj: score += 0.5
                if prep: score += 0.5
                break

    if any(tok.lemma_ in causal_meta_terms for tok in doc):
      explanations.append("✓ Causal generalisation or implication term detected (e.g., 'implies', 'generalisation')")
      score += 1

    if any(tok.lemma_ in alternative_factor_terms for tok in doc):
      explanations.append("✓ Terms indicating alternative causes or interfering factors detected")
      score += 1

    is_causal = has_condition and has_advcl or has_causal_verb_structure
    if not is_causal:
        causal_phrases = ["result in", "lead to", "may cause", "because of", "due to","given rise to","resulting from", "stemming from", "driven by", "caused by", "attributed to", "stems from", "reason", "result of", "consequence of", "owning to", "thus", "so", "therefore", "hence"  "thereby"]
        if any(phrase in sentence.lower() for phrase in causal_phrases):
            explanations.append("✓ Phrase pattern matches known cause-to-effect trigger")
            score += 2

    score = min(score, 10)
    label = "Strong CauseToEffect" if score >= 7 else "Weak/Partial CauseToEffect" if score >= 4 else "Not CauseToEffect"
    return label, score, explanations

In [10]:
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Warning: Could not load frame '{frame_name}': {e}")
    return terms


expert_frames = [
    "Expertise", "Judgment_communication", "Opinion",
    "Authority", "Statement", "Certainty"
]
quote_frames = ["Statement", "Judgment_communication"]
clarity_frames = ["Reasoning"]
evidence_frames = ["Evidence", "Certainty", "Causation"]


expert_verbs = get_lexical_units_from_frames(expert_frames)
quote_verbs = get_lexical_units_from_frames(quote_frames)
clarity_terms = get_lexical_units_from_frames(clarity_frames)
evidence_terms = get_lexical_units_from_frames(evidence_frames)

def detect_expert_opinion(question):

    doc = nlp(question)
    score = 0
    explanations = []

    expert_titles = {"expert", "researcher", "scientist", "doctor", "analyst", "professor", "Dr."}

    implicit_expert_terms = {"study", "research", "evidence", "report", "findings", "scientific", "government", "official", "paper", "survey", "data"}
    comparison_cues = {"consistent", "align", "similar", "agree", "disagree", "corroborate", "conflict"}
    technical_request_verbs = {"define", "explain", "describe", "elaborate", "clarify"}
    assertion_verbs = {"assert", "affirm", "pronounce", "declare", "maintain", "claim", "state"}
    reference_terms = {"quote", "reference", "cite", "check", "verify", "source"}
    domain_terms = {"science", "scientific", "domain", "field", "discipline", "area", "sector"}


    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            if any(title in ent.text.lower() for title in expert_titles):
                explanations.append(f"✓ Expert entity detected: '{ent.text}'")
                score += 3
                break

    if any(tok.lemma_ in expert_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Detected expert-related verb from FrameNet")
        score += 2

    if any(tok.lemma_ in quote_verbs for tok in doc):
        explanations.append("✓ Quotation or claim verb found")
        score += 1

    if any(tok.lemma_ in clarity_terms for tok in doc):
        explanations.append("✓ Clarity/definition markers found")
        score += 1

    if any(tok.lemma_ in evidence_terms for tok in doc):
        explanations.append("✓ Evidence or support-related terms found")
        score += 2

    if any(tok.lemma_.lower() in implicit_expert_terms for tok in doc):
      explanations.append("✓ Implicit expert-related term detected (e.g., 'study', 'government')")
      score += 2

    if any(tok.lemma_.lower() in comparison_cues for tok in doc):
      explanations.append("✓ Cross-study comparison term detected (e.g., 'consistent', 'similar')")
      score += 0.5

    if any(tok.lemma_.lower() in technical_request_verbs for tok in doc):
      explanations.append("✓ Technical explanation request detected (e.g., 'define', 'explain')")
      score += 1

    if any(tok.dep_ == "attr" and tok.lemma_ == "expert" for tok in doc):
      explanations.append("✓ Predicate nominative indicating expertise detected (e.g., 'X is an expert')")
      score += 2

    if any(tok.lemma_.lower() in assertion_verbs for tok in doc):
      explanations.append("✓ Assertion or claim verb detected (e.g., 'assert', 'affirm')")
      score += 1

    if any(tok.lemma_.lower() in reference_terms for tok in doc):
      explanations.append("✓ Source/reference validation term detected (e.g., 'quote', 'reference')")
      score += 1

    if any(tok.lemma_.lower() in domain_terms for tok in doc):
      explanations.append("✓ Domain relevance indicator detected (e.g., 'science', 'domainD')")
      score += 1

    label = "Strong Expert Opinion" if score >= 7 else "Weak/Partial Expert Opinion" if score >= 4 else "Not Expert Opinion"
    return label, score, explanations

In [11]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
analogy_synsets = [wn.synset('similar.a.01'), wn.synset('analogy.n.01'), wn.synset('compare.v.01')]

comparison_frames = ["Similarity"]
contrast_frames = ["Categorization"]
evidence_frames = ["Evidence", "Judgment_communication"]

comparison_verbs = get_lexical_units_from_frames(comparison_frames)
contrast_verbs = get_lexical_units_from_frames(contrast_frames)
evidence_verbs = get_lexical_units_from_frames(evidence_frames)

def is_semantically_analogical(token):
    token_synsets = wn.synsets(token.lemma_)
    for s in token_synsets:
        for analogy_syn in analogy_synsets:
            if s.path_similarity(analogy_syn) and s.path_similarity(analogy_syn) > 0.3:
                return True
    return False

analogy_context_cues = {"respect", "in which", "such that", "with regard to", "in terms of"}

analogy_force_cues = {"undermine", "weaken", "strengthen", "force of similarity", "degree of analogy"}

analogy_nouns = {"analogy", "comparison", "parallel", "similarity", "analogue"}

def detect_analogy_question(question):
    doc = nlp(question)
    score = 0
    explanations = []
    noun_chunks = list(doc.noun_chunks)

    if any(tok.lemma_ in comparison_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Comparison verb detected from FrameNet")
        score += 2.5

    entity_tokens = [tok for tok in doc if tok.pos_ in {"PROPN", "NOUN"}]
    if len(set(tok.lemma_ for tok in entity_tokens)) >= 2:
        explanations.append("✓ Contains at least two distinct concepts/entities")
        score += 1

    if any(tok.lemma_ in contrast_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Contrast or difference verb detected from FrameNet")
        score += 1

    if any(tok.lemma_ in evidence_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Evidence or justification verb found")
        score += 1

    if any(tok.tag_ == "MD" for tok in doc):
        score += 0.5

    if len(noun_chunks) >= 2 and any(tok.lemma_ in {"similar", "like", "as"} for tok in doc):
        explanations.append("✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')")
        score += 3

    if any(tok.text.lower() == "if" for tok in doc):
        explanations.append("✓ Conditional structure suggesting hypothetical reasoning")
        score += 1

    if any(is_semantically_analogical(tok) for tok in doc if tok.pos_ in {"ADJ", "NOUN", "VERB"}):
        explanations.append("✓ Semantic similarity to analogy-related terms detected via WordNet")
        score += 2

    if any(tok.dep_ in {"prep", "relcl"} and tok.lemma_ in {"compare", "similar"} for tok in doc):
        explanations.append("✓ Syntactic cue of analogy (e.g., 'compared with', 'similar to')")
        score += 1

    if any(phrase in question.lower() for phrase in analogy_context_cues):
      explanations.append("✓ Contextual analogy marker detected (e.g., 'in which', 'such that')")
      score += 0.5

    if any(tok.lemma_ in analogy_force_cues for tok in doc):
      explanations.append("✓ Analogy evaluation term detected (e.g., 'undermine', 'strengthen')")
      score += 0.5

    if any(tok.lemma_ in analogy_nouns for tok in doc if tok.pos_ == "NOUN"):
      explanations.append("✓ Explicit analogy noun detected (e.g., 'analogy', 'comparison')")
      score += 2

    if any(tok.dep_ == "neg" for tok in doc):
      if any(tok.lemma_ in {"similar", "compare", "alike", "match"} for tok in doc):
          explanations.append("✓ Negated comparison detected (suggesting analogy breakdown)")
          score += 1

    score = min(score, 10)
    label = "Strong Analogy Question" if score >= 7 else "Weak/Partial Analogy Question" if score >= 4 else "Not Analogy Question"
    return label, score, explanations

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [12]:
def is_fear_related(token):
    syns = wn.synsets(token.lemma_)
    for s in syns:
        if any(s.path_similarity(wn.synset('danger.n.01')) or s.path_similarity(wn.synset('problem.n.01')) or
               s.path_similarity(wn.synset('fear.n.01')) or s.path_similarity(wn.synset('harm.n.01')) or
               s.path_similarity(wn.synset('threat.n.01')) for s in syns):
            return True
    return False


# ---- FrameNet Utility ----
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except:
            continue
    return terms

# ---- Relevant Lexical Resources ----
causal_frames = ["Causation", "Cause_to_start", "Preventing", "Risk", "Threaten", "Danger"]
causal_verbs = get_lexical_units_from_frames(causal_frames)

fear_keywords = {"danger", "threat", "risky", "harm", "catastrophe", "crisis", "ruin", "fear", "worse", "bad", "fatal", "negative", "die", "death"}
preventive_keywords = {"prevent", "avoid", "stop", "ban", "rescue", "save", "protect"}

urgency_keywords = {"immediately", "soon", "before it's too late", "critical", "urgent", "suddenly", "unexpectedly"}

possibility_terms = {"possible", "possibility", "likely", "likelihood", "chance", "probability", "conceivable", "potential", "can", "could", "might", "may", "able"}


def detect_fear_appeal_question(question):
    doc = nlp(question)
    score = 0
    explanations = []

    if any(tok.lemma_.lower() in fear_keywords for tok in doc):
        explanations.append("✓ Fear-related keyword detected (e.g., 'threat', 'danger')")
        score += 3

    if any(tok.lemma_.lower() in preventive_keywords for tok in doc):
        explanations.append("✓ Preventive action verb detected (e.g., 'prevent', 'stop')")
        score += 2

    if any(tok.lemma_ in causal_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Causal/preventive verb from FrameNet detected")
        score += 2

    if any(tok.text.lower() in {"if", "unless"} for tok in doc):
        explanations.append("✓ Conditional clause found (e.g., 'if', 'unless')")
        score += 1

    if any(is_fear_related(tok) for tok in doc if tok.pos_ in {"NOUN", "VERB", "ADJ"}):
        explanations.append("✓ Semantic fear-related concept detected via WordNet")
        score += 2

    if any(phrase in question.lower() for phrase in urgency_keywords):
        explanations.append("✓ Urgency marker detected (e.g., 'immediately', 'before it's too late')")
        score += 1

    if any(tok.lemma_ in possibility_terms for tok in doc):
        explanations.append("✓ Possibility-related term detected (e.g., 'possible', 'feasible', 'chance')")
        score += 1

    score = min(score, 10)
    label = "Strong Fear Appeal" if score >= 7 else "Weak/Partial Fear Appeal" if score >= 4 else "Not Fear Appeal"
    return label, score, explanations

In [13]:
questions = [
    "How strong is the generalisation that if the world exists then we exist?",
    "Are there other factors in this particular case that could have interfered with the event of existance of the world?"
]

for question in questions:
    result = detect_cause_to_effect(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: How strong is the generalisation that if the world exists then we exist?
Result: ('Weak/Partial CauseToEffect', 6, ["✓ Conditional clause detected (e.g., 'if', 'when')", '✓ Adverbial clause (likely effect clause) detected', "✓ Causal generalisation or implication term detected (e.g., 'implies', 'generalisation')"])

Question: Are there other factors in this particular case that could have interfered with the event of existance of the world?
Result: ('Not CauseToEffect', 1, ['✓ Terms indicating alternative causes or interfering factors detected'])



In [14]:
questions = [
    "Is Peter a genuine expert in science?",
    "Did Peter really assert that the world exists?",
    "Is Peter’s pronouncement directly quoted? If not, is a reference to the original source given? Can it be checked?",
    "If Peter’s advice is not quoted, does it look like important information or qualifications may have been left out?",
    "Is what Peter said clear? Are there technical terms used that are not explained clearly?",
    "Is existance of the world relevant to domain science?",
    "Is existance of the world consistent with what other experts in <domainD> say?",
    "Is existance of the world consistent with known evidence in <domainD>?"
]

for question in questions:
    result = detect_expert_opinion(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Is Peter a genuine expert in science?
Result: ('Not Expert Opinion', 3, ["✓ Predicate nominative indicating expertise detected (e.g., 'X is an expert')", "✓ Domain relevance indicator detected (e.g., 'science', 'domainD')"])

Question: Did Peter really assert that the world exists?
Result: ('Weak/Partial Expert Opinion', 4, ['✓ Detected expert-related verb from FrameNet', '✓ Quotation or claim verb found', "✓ Assertion or claim verb detected (e.g., 'assert', 'affirm')"])

Question: Is Peter’s pronouncement directly quoted? If not, is a reference to the original source given? Can it be checked?
Result: ('Not Expert Opinion', 1, ["✓ Source/reference validation term detected (e.g., 'quote', 'reference')"])

Question: If Peter’s advice is not quoted, does it look like important information or qualifications may have been left out?
Result: ('Not Expert Opinion', 3, ['✓ Evidence or support-related terms found', "✓ Source/reference validation term detected (e.g., 'quote', 'reference

In [15]:
questions = [
    "Are frogs and horses similar in the respect cited?",
    "Is the existance of the world true in horses?",
    "Are there differences between horses and frogs that would tend to undermine the force of the similarity cited?",
    "Is there some other case that is also similar to horses, but in which frog is false?"
]

for question in questions:
    result = detect_analogy_question(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Are frogs and horses similar in the respect cited?
Result: ('Strong Analogy Question', 7.5, ['✓ Contains at least two distinct concepts/entities', '✓ Evidence or justification verb found', "✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')", '✓ Semantic similarity to analogy-related terms detected via WordNet', "✓ Contextual analogy marker detected (e.g., 'in which', 'such that')"])

Question: Is the existance of the world true in horses?
Result: ('Not Analogy Question', 3, ['✓ Contains at least two distinct concepts/entities', '✓ Semantic similarity to analogy-related terms detected via WordNet'])

Question: Are there differences between horses and frogs that would tend to undermine the force of the similarity cited?
Result: ('Weak/Partial Analogy Question', 5.0, ['✓ Contains at least two distinct concepts/entities', '✓ Evidence or justification verb found', "✓ Analogy evaluation term detected (e.g., 'undermine', 'strengthen')", "✓ Explicit analogy noun d

In [16]:
questions = [
    "Is the world existing bad? Why and to whom is it bad?",
    "Is the world existing away to prevent people from dying?",
    "Is it practically possible for world existing to happen?",
    "Are there other consequences from the world existing?"
]

for question in questions:
    result = detect_fear_appeal_question(question)
    print(f"Question: {question}")
    print(f"Result: {result}\n")

Question: Is the world existing bad? Why and to whom is it bad?
Result: ('Weak/Partial Fear Appeal', 5, ["✓ Fear-related keyword detected (e.g., 'threat', 'danger')", '✓ Semantic fear-related concept detected via WordNet'])

Question: Is the world existing away to prevent people from dying?
Result: ('Strong Fear Appeal', 7, ["✓ Fear-related keyword detected (e.g., 'threat', 'danger')", "✓ Preventive action verb detected (e.g., 'prevent', 'stop')", '✓ Semantic fear-related concept detected via WordNet'])

Question: Is it practically possible for world existing to happen?
Result: ('Not Fear Appeal', 3, ['✓ Semantic fear-related concept detected via WordNet', "✓ Possibility-related term detected (e.g., 'possible', 'feasible', 'chance')"])

Question: Are there other consequences from the world existing?
Result: ('Not Fear Appeal', 2, ['✓ Semantic fear-related concept detected via WordNet'])



## Evaluation

### Rule evaluation

In [17]:
import re
from collections import Counter

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def is_valid_output(text: str,
                    max_repetition_ratio: float = 0.3,
                    ngram_size: int = 2):

    tokens = text.split()
    if not tokens:
        return False
    token_counts = Counter(tokens)
    most_common_token, most_common_count = token_counts.most_common(1)[0]
    if most_common_count / len(tokens) > max_repetition_ratio:
        return False
    ngrams = [' '.join(tokens[i:i+ngram_size]) for i in range(len(tokens) - ngram_size + 1)]
    if not ngrams:
        return True

    ngram_counts = Counter(ngrams)
    most_common_ngram, ngram_count = ngram_counts.most_common(1)[0]
    if ngram_count / len(ngrams) > max_repetition_ratio:
        return False
    if len(tokens) < 3:
        return False
    return True

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### syntax

In [19]:
def classify_schema(question):
    _, cte_score, _ = detect_cause_to_effect(question)

    _, expert_score, _ = detect_expert_opinion(question)

    _, analogy_score, _ = detect_analogy_question(question)

    _, fear_score, _ = detect_fear_appeal_question(question)

    return cte_score, expert_score, analogy_score, fear_score

### semantic

In [20]:
def is_context_question_matching(context: str,
                                 question: str,
                                 similarity_threshold: float = 0.5):

    embeddings = _embedding_model.encode([context, question])
    ctx_vec, q_vec = embeddings
    sim = cosine_similarity([ctx_vec], [q_vec])[0][0]

    return sim >= similarity_threshold

### total evaluation

In [21]:
for key, obj in results.items():
    context_text = obj["input"]
    for cq_entry in obj["cqs"]:
        schema = cq_entry.get("schema")
        question = cq_entry["cq"]

        cte_score, expert_score, analogy_score, fear_score = classify_schema(question)

        cq_entry["CauseToEffect"] = cte_score
        cq_entry["ExpertOpinion"] = expert_score
        cq_entry["Analogy"] = analogy_score
        cq_entry["FearAppeal"] = fear_score

        is_critical = False
        if schema and cq_entry.get(schema, 0) >= 7:
            is_critical = True

        cq_entry["is_critical"] = is_critical
        cq_entry["passed_rules"] = bool(is_valid_output(question))
        cq_entry["in_context"] = bool(is_context_question_matching(context_text, question))


## Save and Git

In [22]:
with open(os.path.join(os.getcwd(), f"Evaluation/Scored/{result_file}_eval.json"), "w", encoding="utf-8") as f:
  json.dump(results, f, indent=2, ensure_ascii=False)

In [23]:
!git config --global user.name "Showcas"
!git config --global user.email "cedric.bohni@gmx.de"


commit_message = f"evaluate CQs for {result_file}"
!git add .
!git commit -m "{commit_message}"
!git push

[main f154843] evaluate CQs for results_schema_Baseline_Meta-Llama-3.1-8B-Instruct-bnb-4bit
 1 file changed, 9116 insertions(+), 7628 deletions(-)
 rewrite Evaluation/Scored/results_schema_Baseline_Meta-Llama-3.1-8B-Instruct-bnb-4bit_eval.json (73%)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 81.38 KiB | 3.39 MiB/s, done.
Total 5 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/RicoStaedeli/NLP2025_CQG.git
   9e3b6ce..f154843  main -> main
