<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/SocratiQ_final_prepro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final preprocessing of full data set for 4 question types (! STRONG threshold here!)

In [1]:
from google.colab import userdata, drive
import os

In [3]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 585, done.[K
remote: Counting objects: 100% (71/71), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 585 (delta 48), reused 47 (delta 36), pack-reused 514 (from 1)[K
Receiving objects: 100% (585/585), 24.79 MiB | 16.04 MiB/s, done.
Resolving deltas: 100% (274/274), done.


In [4]:
os.chdir("NLP2025_CQG")
!ls

1_Preprocessing.ipynb		     Development
2a_Baseline_Evaluation.ipynb	     Doc
2_Baseline_CQS_generation.ipynb      Evaluation
2_Baseline_CQS_generation_old.ipynb  INFORMATION.md
3a_Finetuned_CQS_generation.ipynb    LICENSE
3b_Finetune_Evaluation.ipynb	     Logs
3_Training.ipynb		     README.md
4a_RAG_CQS_generation.ipynb	     requirements.txt
4b_RAG_Evaluation.ipynb		     SocratiQ_final_prepro.ipynb
4_RAG_System,.ipynb		     Training
5_Evaluation_Analytics.ipynb	     Utils
Data


In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
import spacy
import nltk

nltk.download('framenet_v17')
nlp = spacy.load("en_core_web_sm")

chunk = "train_chunk_III"

[nltk_data] Downloading package framenet_v17 to /root/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [19]:
filtered_df = pd.read_csv(
    f'/content/drive/MyDrive/HSG/NLP/Project NLP/Data/Final/filtered_{chunk}.csv'
)

### lentgh filtering

In [12]:
df = pd.read_csv(
    os.path.join(os.getcwd(), f"Data/Raw/SocraticQ/{chunk}.csv"),
    names=["category", "context", "question"]
)

df["context_token_len"] = df["context"].apply(lambda text: len(nlp(text)))
filtered_df = df[df["context_token_len"] >= 25].copy()

print(f"Total rows: {len(df)}")
print(f"Rows after filtering: {len(filtered_df)}")

[nltk_data] Downloading package framenet_v17 to /root/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


Total rows: 28195
Rows after filtering: 27150


In [13]:
filtered_df.to_csv(f'/content/drive/MyDrive/HSG/NLP/Project NLP/Data/Final/filtered_{chunk}.csv', index=False)

### 1. cause to effect

In [8]:
from nltk.corpus import framenet as fn

def get_causal_verbs_from_framenet():
    causal_frame_names = [
        "Causation", "Cause_change", "Cause_change_of_position_on_a_scale",
        "Cause_motion", "Cause_to_amalgamate", "Cause_to_start", "Cause_to_make_progress",
        "Causation_scenario", "Cause_to_end", "Cause_to_resume",
        "Cause_to_continue", "Cause_change_of_consistency","Cause_expansion","Cause_impact"
    ]

    causal_verbs = set()
    for frame_name in causal_frame_names:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:  # Only verbs
                    causal_verbs.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Error loading frame '{frame_name}': {e}")

    return causal_verbs



def detect_cause_to_effect(sentence):
    doc = nlp(sentence)
    explanations = []
    score = 0

    causal_verbs = get_causal_verbs_from_framenet()

    has_condition = any(tok.dep_ == "mark" and tok.text.lower() in {"if", "when"} for tok in doc)
    if has_condition:
        explanations.append("✓ Conditional clause detected (e.g., 'if', 'when')")
        score += 2

    has_advcl = any(tok.dep_ == "advcl" for tok in doc)
    if has_advcl:
        explanations.append("✓ Adverbial clause (likely effect clause) detected")
        score += 2

    has_causal_verb_structure = False
    for tok in doc:
        if tok.lemma_ in causal_verbs and tok.pos_ == "VERB":
            subj = any(child.dep_ == "nsubj" for child in tok.children)
            obj = any(child.dep_ == "dobj" for child in tok.children)
            prep = any(child.dep_ == "prep" for child in tok.children)
            if subj or obj or prep:
                has_causal_verb_structure = True
                explanations.append(
                    f"✓ Verb '{tok.lemma_}' is listed in FrameNet under causal frames with subject/object/prep"
                )
                score += 2
                if subj: score += 1
                if obj: score += 1
                if prep: score += 1
                break

    is_causal = has_condition and has_advcl or has_causal_verb_structure
    if not is_causal:
        causal_phrases = ["result in", "lead to", "may cause", "because of", "due to","given rise to","resulting from", "stemming from", "driven by", "caused by", "attributed to", "stems from", "reason", "result of", "consequence of", "owning to", "thus", "thereby"]
        if any(phrase in sentence.lower() for phrase in causal_phrases):
            explanations.append("✓ Phrase pattern matches known cause-to-effect trigger")
            score += 1

    score = min(score, 10)
    label = "Strong CauseToEffect" if score >= 7 else "Weak/Partial CauseToEffect" if score >= 4 else "Not CauseToEffect"
    return label, score, explanations


In [9]:
print(get_causal_verbs_from_framenet())

{'shift', 'pair', 'mean', 'pitch', 'increase', 'restart', 'punt', 'keep', 'inspissate', 'elicit', 'clang', 'swell', 'see', 'merge', 'bring on', 'motivate', 'add', 'coalesce', 'thud', 'ram', 'meld', 'generate', 'press', 'slide', 'compound', 'admix', 'soften', 'push', 'slash', 'congeal', 'rap', 'throw together', 'blow up', 'lower', 'put an end to', 'bring', 'hurl', 'reduce', 'propel', 'impel', 'jerk', 'kindle', 'strike', 'actuate', 'instigate', 'convert', 'arouse', 'launch', 'lump', 'perfect', 'haul', 'widen', 'bring about', 'produce', 'click', 'run', 'advance', 'expand', 'make', 'graze', 'rustle', 'rattle', 'provoke', 'clatter', 'catapult', 'grow', 'leave', 'end', 'diminish', 'drive', 'unify', 'wreak', 'deform', 'smack', 'create', 'drag', 'yank', 'combine', 'rake', 'jell', 'draw', 'do away with', 'amalgamate', 'shove', 'throw', 'collide', 'step up', 'commingle', 'change', 'excite', 'transfer', 'attract', 'stir up', 'clink', 'reshape', 'slam', 'hit', 'give rise', 'narrow', 'cast', 'knock

### 2. expert opinion

In [11]:
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Warning: Could not load frame '{frame_name}': {e}")
    return terms


expert_frames = [
    "Expertise", "Judgment_communication", "Opinion",
    "Authority", "Statement", "Certainty"
]
quote_frames = ["Statement", "Judgment_communication"]
clarity_frames = ["Reasoning"]
evidence_frames = ["Evidence", "Certainty", "Causation"]


expert_verbs = get_lexical_units_from_frames(expert_frames)
quote_verbs = get_lexical_units_from_frames(quote_frames)
clarity_terms = get_lexical_units_from_frames(clarity_frames)
evidence_terms = get_lexical_units_from_frames(evidence_frames)

def detect_expert_opinion(question):

    doc = nlp(question)
    score = 0
    explanations = []

    expert_titles = {"expert", "researcher", "scientist", "doctor", "analyst", "professor", "Dr."}

    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            if any(title in ent.text.lower() for title in expert_titles):
                explanations.append(f"✓ Expert entity detected: '{ent.text}'")
                score += 2
                break

    if any(tok.lemma_ in expert_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Detected expert-related verb from FrameNet")
        score += 2

    if any(tok.lemma_ in quote_verbs for tok in doc):
        explanations.append("✓ Quotation or claim verb found")
        score += 1

    if any(tok.lemma_ in clarity_terms for tok in doc):
        explanations.append("✓ Clarity/definition markers found")
        score += 1

    if any(tok.lemma_ in evidence_terms for tok in doc):
        explanations.append("✓ Evidence or support-related terms found")
        score += 2

    label = "Strong Expert Opinion" if score >= 6 else "Weak/Partial Expert Opinion" if score >= 3 else "Not Expert Opinion"
    return label, score, explanations


### 3. Analogy detection

In [12]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
analogy_synsets = [wn.synset('similar.a.01'), wn.synset('analogy.n.01'), wn.synset('compare.v.01')]

comparison_frames = ["Similarity"]
contrast_frames = ["Categorization"]
evidence_frames = ["Evidence", "Judgment_communication"]

comparison_verbs = get_lexical_units_from_frames(comparison_frames)
contrast_verbs = get_lexical_units_from_frames(contrast_frames)
evidence_verbs = get_lexical_units_from_frames(evidence_frames)

def is_semantically_analogical(token):
    token_synsets = wn.synsets(token.lemma_)
    for s in token_synsets:
        for analogy_syn in analogy_synsets:
            if s.path_similarity(analogy_syn) and s.path_similarity(analogy_syn) > 0.3:
                return True
    return False

def detect_analogy_question(question):
    doc = nlp(question)
    score = 0
    explanations = []
    noun_chunks = list(doc.noun_chunks)

    if any(tok.lemma_ in comparison_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Comparison verb detected from FrameNet")
        score += 2

    entity_tokens = [tok for tok in doc if tok.pos_ in {"PROPN", "NOUN"}]
    if len(set(tok.lemma_ for tok in entity_tokens)) >= 2:
        explanations.append("✓ Contains at least two distinct concepts/entities")
        score += 1

    if any(tok.lemma_ in contrast_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Contrast or difference verb detected from FrameNet")
        score += 1

    if any(tok.lemma_ in evidence_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Evidence or justification verb found")
        score += 1

    if any(tok.tag_ == "MD" for tok in doc):
        score += 1

    if len(noun_chunks) >= 2 and any(tok.lemma_ in {"similar", "like", "as"} for tok in doc):
        explanations.append("✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')")
        score += 2

    if any(tok.text.lower() == "if" for tok in doc):
        explanations.append("✓ Conditional structure suggesting hypothetical reasoning")
        score += 1

    if any(is_semantically_analogical(tok) for tok in doc if tok.pos_ in {"ADJ", "NOUN", "VERB"}):
        explanations.append("✓ Semantic similarity to analogy-related terms detected via WordNet")
        score += 2

    if any(tok.dep_ in {"prep", "relcl"} and tok.lemma_ in {"compare", "similar"} for tok in doc):
        explanations.append("✓ Syntactic cue of analogy (e.g., 'compared with', 'similar to')")
        score += 1

    label = "Strong Analogy Question" if score >= 8 else "Weak/Partial Analogy Question" if score >= 5 else "Not Analogy Question"
    return label, score, explanations


[nltk_data] Downloading package wordnet to /root/nltk_data...


### 4. Bias

In [14]:

def is_fear_related(token):
    syns = wn.synsets(token.lemma_)
    for s in syns:
        if any(s.path_similarity(wn.synset('danger.n.01')) or
               s.path_similarity(wn.synset('fear.n.01')) or
               s.path_similarity(wn.synset('threat.n.01')) for s in syns):
            return True
    return False


# ---- FrameNet Utility ----
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except:
            continue
    return terms

# ---- Relevant Lexical Resources ----
causal_frames = ["Causation", "Cause_to_start", "Preventing", "Risk", "Threaten", "Danger"]
causal_verbs = get_lexical_units_from_frames(causal_frames)

fear_keywords = {"danger", "threat", "risky", "harm", "catastrophe", "crisis", "ruin", "fear", "worse", "bad", "fatal"}
preventive_keywords = {"prevent", "avoid", "stop", "ban", "rescue", "save", "protect"}
modal_keywords = {"might", "could", "would", "may", "should"}

def detect_fear_appeal_question(question):
    doc = nlp(question)
    score = 0
    explanations = []

    if any(tok.lemma_ in modal_keywords for tok in doc if tok.tag_ == "MD"):
        explanations.append("✓ Modal verb detected (e.g., 'might', 'would') suggesting hypothetical risk")
        score += 1

    if any(tok.lemma_.lower() in fear_keywords for tok in doc):
        explanations.append("✓ Fear-related keyword detected (e.g., 'threat', 'danger')")
        score += 2

    if any(tok.lemma_.lower() in preventive_keywords for tok in doc):
        explanations.append("✓ Preventive action verb detected (e.g., 'prevent', 'stop')")
        score += 2

    if any(tok.lemma_ in causal_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Causal/preventive verb from FrameNet detected")
        score += 2

    if any(tok.text.lower() in {"if", "unless"} for tok in doc):
        explanations.append("✓ Conditional clause found (e.g., 'if', 'unless')")
        score += 1

    if any(is_fear_related(tok) for tok in doc if tok.pos_ in {"NOUN", "VERB", "ADJ"}):
        explanations.append("✓ Semantic fear-related concept detected via WordNet")
        score += 2

    label = "Strong Fear Appeal" if score >= 6 else "Weak/Partial Fear Appeal" if score >= 4 else "Not Fear Appeal"
    return label, score, explanations


# Final method for preprocessing and filtering for all types

In [20]:
def classify_schema(row):
    question = row["question"]
    results = []

    is_critical = False

    _, cte_score, _ = detect_cause_to_effect(question)
    _, expert_score, _ = detect_expert_opinion(question)
    _, analogy_score, _ = detect_analogy_question(question)
    _, fear_score, _ = detect_fear_appeal_question(question)

    if cte_score >= 7:
      is_critical = True

    if expert_score >= 6:
      is_critical = True

    if analogy_score >= 8:
      is_critical = True

    if fear_score >= 6:
       is_critical = True

    return pd.Series({
        "is_Critical": is_critical,
        "CauseToEffect": cte_score,
        "ExpertOpinion": expert_score,
        "Analogy": analogy_score,
        "FearAppeal": fear_score,
    })




#filtered_df = filtered_df[:1000]
filtered_df[["is_Critical","CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]] = filtered_df.apply(classify_schema, axis=1)
filtered_df

Unnamed: 0,category,context,question,context_token_len,is_Critical,CauseToEffect,ExpertOpinion,Analogy,FearAppeal
0,56388.0,reasons_evidence: Preamble: this is my first C...,Why should the definition of a fruit be based ...,136,False,0,0,6,3
1,56389.0,reasons_evidence: Yes they do it because they ...,Why does the universe need to be functional?,72,False,0,0,2,2
2,56390.0,reasons_evidence: I'm tired of people claiming...,So why are we somehow more powerless now than ...,211,False,2,0,2,2
3,56391.0,reasons_evidence: Freakonomics did a really in...,Is there an article that you could link to ela...,33,False,0,0,3,3
4,56392.0,reasons_evidence: Very spot on. Most of my fel...,Do they even use Latiné though?,45,False,0,0,2,2
...,...,...,...,...,...,...,...,...,...
27026,84575.0,reasons_evidence: I am not aware of any legal ...,Is there a reason to not label the parties as ...,47,False,1,1,3,2
27027,84577.0,reasons_evidence: It’s super common for oppone...,But why call this racism?,70,False,0,0,2,2
27028,84578.0,"reasons_evidence: I studied a government, law ...",How can a professor teach you how without givi...,67,False,0,0,4,2
27029,84579.0,reasons_evidence: You work for money to spend ...,And why do you need to pay for food and shelter?,32,False,0,0,3,2


### final check: how many questions per category ?

In [21]:
value_counts = filtered_df['is_Critical'].value_counts(dropna=False)
print(value_counts)

columns = ["CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]

value_counts_schema = filtered_df[columns].apply(pd.Series.value_counts, dropna=False)
print(value_counts_schema)

is_Critical
False    26262
True       769
Name: count, dtype: int64
   CauseToEffect  ExpertOpinion  Analogy  FearAppeal
0          18467        16311.0   1044.0         141
1            708          720.0   1221.0          72
2           2074         4703.0   4953.0       16092
3           1585         2837.0  10783.0        5959
4           2891         1625.0   6367.0        3355
5            620          785.0   1907.0        1061
6            204           50.0    577.0         262
7            256            NaN    163.0          74
8            193            NaN     16.0          14
9             33            NaN      NaN           1


In [22]:
filtered_df.to_json(f'/content/drive/MyDrive/HSG/NLP/Project NLP/Data/Final/processed_{chunk}.json', index=False)

In [None]:
!git config --global user.name "Showcas"
!git config --global user.email "cedric.bohni@gmx.de"


commit_message = f"Improved Preprocessing file"
!git add .
!git commit -m "{commit_message}"
!git push