# Final preprocessing of full data set for 4 question types (! STRONG threshold here!)

### lentgh filtering

In [1]:
import pandas as pd
import spacy
import nltk

nltk.download('framenet_v17')

nlp = spacy.load("en_core_web_sm")


df = pd.read_csv(
    "Data/processed/SocratiQ/train_chunk_I.csv",
    names=["category", "context", "question"]
)

df["context_token_len"] = df["context"].apply(lambda text: len(nlp(text)))
filtered_df = df[df["context_token_len"] >= 25].copy()

print(f"Total rows: {len(df)}")
print(f"Rows after filtering: {len(filtered_df)}")

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/lorenaraichle/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


Total rows: 28195
Rows after filtering: 26804


### 1. cause to effect

In [2]:
from nltk.corpus import framenet as fn

def get_causal_verbs_from_framenet():
    causal_frame_names = [
        "Causation", "Cause_change", "Cause_change_of_position_on_a_scale",
        "Cause_motion", "Cause_to_amalgamate", "Cause_to_start", "Cause_to_make_progress"
    ]

    causal_verbs = set()
    for frame_name in causal_frame_names:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:  # Only verbs
                    causal_verbs.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Error loading frame '{frame_name}': {e}")
    
    return causal_verbs



def detect_cause_to_effect(sentence):
    doc = nlp(sentence)
    explanations = []
    score = 0

    causal_verbs = get_causal_verbs_from_framenet()

    has_condition = any(tok.dep_ == "mark" and tok.text.lower() in {"if", "when"} for tok in doc)
    if has_condition:
        explanations.append("✓ Conditional clause detected (e.g., 'if', 'when')")
        score += 2

    has_advcl = any(tok.dep_ == "advcl" for tok in doc)
    if has_advcl:
        explanations.append("✓ Adverbial clause (likely effect clause) detected")
        score += 2

    has_causal_verb_structure = False
    for tok in doc:
        if tok.lemma_ in causal_verbs and tok.pos_ == "VERB":
            subj = any(child.dep_ == "nsubj" for child in tok.children)
            obj = any(child.dep_ == "dobj" for child in tok.children)
            prep = any(child.dep_ == "prep" for child in tok.children)
            if subj or obj or prep:
                has_causal_verb_structure = True
                explanations.append(
                    f"✓ Verb '{tok.lemma_}' is listed in FrameNet under causal frames with subject/object/prep"
                )
                score += 2
                if subj: score += 1
                if obj: score += 1
                if prep: score += 1
                break

    is_causal = has_condition and has_advcl or has_causal_verb_structure
    if not is_causal:
        causal_phrases = ["result in", "lead to", "cause", "because of", "due to"]
        if any(phrase in sentence.lower() for phrase in causal_phrases):
            explanations.append("✓ Phrase pattern matches known cause-to-effect trigger")
            score += 1

    score = min(score, 10)
    label = "Strong CauseToEffect" if score >= 7 else "Weak/Partial CauseToEffect" if score >= 4 else "Not CauseToEffect"
    return label, score, explanations


### 2. expert opinion

In [3]:
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Warning: Could not load frame '{frame_name}': {e}")
    return terms


expert_frames = [
    "Expertise", "Judgment_communication", "Opinion",
    "Authority", "Statement", "Certainty"
]
quote_frames = ["Statement", "Judgment_communication"]
clarity_frames = ["Reasoning"]
evidence_frames = ["Evidence", "Certainty", "Causation"]


expert_verbs = get_lexical_units_from_frames(expert_frames)
quote_verbs = get_lexical_units_from_frames(quote_frames)
clarity_terms = get_lexical_units_from_frames(clarity_frames)
evidence_terms = get_lexical_units_from_frames(evidence_frames)

def detect_expert_opinion(question):
    
    doc = nlp(question)
    score = 0
    explanations = []

    expert_titles = {"expert", "researcher", "scientist", "doctor", "analyst", "professor", "Dr."}

    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            if any(title in ent.text.lower() for title in expert_titles):
                explanations.append(f"✓ Expert entity detected: '{ent.text}'")
                score += 2
                break

    if any(tok.lemma_ in expert_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Detected expert-related verb from FrameNet")
        score += 2

    if any(tok.lemma_ in quote_verbs for tok in doc):
        explanations.append("✓ Quotation or claim verb found")
        score += 1

    if any(tok.lemma_ in clarity_terms for tok in doc):
        explanations.append("✓ Clarity/definition markers found")
        score += 1

    if any(tok.lemma_ in evidence_terms for tok in doc):
        explanations.append("✓ Evidence or support-related terms found")
        score += 2

    label = "Strong Expert Opinion" if score >= 6 else "Weak/Partial Expert Opinion" if score >= 3 else "Not Expert Opinion"
    return label, score, explanations


### 3. Analogy detection

In [4]:
from nltk.corpus import wordnet as wn
analogy_synsets = [wn.synset('similar.a.01'), wn.synset('analogy.n.01'), wn.synset('compare.v.01')]

comparison_frames = ["Similarity"]
contrast_frames = ["Categorization"]
evidence_frames = ["Evidence", "Judgment_communication"]

comparison_verbs = get_lexical_units_from_frames(comparison_frames)
contrast_verbs = get_lexical_units_from_frames(contrast_frames)
evidence_verbs = get_lexical_units_from_frames(evidence_frames)

def is_semantically_analogical(token):
    token_synsets = wn.synsets(token.lemma_)
    for s in token_synsets:
        for analogy_syn in analogy_synsets:
            if s.path_similarity(analogy_syn) and s.path_similarity(analogy_syn) > 0.3:
                return True
    return False

def detect_analogy_question(question):
    doc = nlp(question)
    score = 0
    explanations = []
    noun_chunks = list(doc.noun_chunks)

    if any(tok.lemma_ in comparison_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Comparison verb detected from FrameNet")
        score += 2

    entity_tokens = [tok for tok in doc if tok.pos_ in {"PROPN", "NOUN"}]
    if len(set(tok.lemma_ for tok in entity_tokens)) >= 2:
        explanations.append("✓ Contains at least two distinct concepts/entities")
        score += 1

    if any(tok.lemma_ in contrast_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Contrast or difference verb detected from FrameNet")
        score += 1

    if any(tok.lemma_ in evidence_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Evidence or justification verb found")
        score += 1

    if any(tok.tag_ == "MD" for tok in doc):
        score += 1

    if len(noun_chunks) >= 2 and any(tok.lemma_ in {"similar", "like", "as"} for tok in doc):
        explanations.append("✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')")
        score += 2

    if any(tok.text.lower() == "if" for tok in doc):
        explanations.append("✓ Conditional structure suggesting hypothetical reasoning")
        score += 1

    if any(is_semantically_analogical(tok) for tok in doc if tok.pos_ in {"ADJ", "NOUN", "VERB"}):
        explanations.append("✓ Semantic similarity to analogy-related terms detected via WordNet")
        score += 2

    if any(tok.dep_ in {"prep", "relcl"} and tok.lemma_ in {"compare", "similar"} for tok in doc):
        explanations.append("✓ Syntactic cue of analogy (e.g., 'compared with', 'similar to')")
        score += 1

    label = "Strong Analogy Question" if score >= 8 else "Weak/Partial Analogy Question" if score >= 5 else "Not Analogy Question"
    return label, score, explanations


### 4. Bias

In [5]:

def is_fear_related(token):
    syns = wn.synsets(token.lemma_)
    for s in syns:
        if any(s.path_similarity(wn.synset('danger.n.01')) or
               s.path_similarity(wn.synset('fear.n.01')) or
               s.path_similarity(wn.synset('threat.n.01')) for s in syns):
            return True
    return False


# ---- FrameNet Utility ----
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except:
            continue
    return terms

# ---- Relevant Lexical Resources ----
causal_frames = ["Causation", "Cause_to_start", "Preventing", "Risk", "Threaten", "Danger"]
causal_verbs = get_lexical_units_from_frames(causal_frames)

fear_keywords = {"danger", "threat", "risky", "harm", "catastrophe", "crisis", "ruin", "fear", "worse", "bad", "fatal"}
preventive_keywords = {"prevent", "avoid", "stop", "ban", "rescue", "save", "protect"}
modal_keywords = {"might", "could", "would", "may", "should"}

def detect_fear_appeal_question(question):
    doc = nlp(question)
    score = 0
    explanations = []

    if any(tok.lemma_ in modal_keywords for tok in doc if tok.tag_ == "MD"):
        explanations.append("✓ Modal verb detected (e.g., 'might', 'would') suggesting hypothetical risk")
        score += 1

    if any(tok.lemma_.lower() in fear_keywords for tok in doc):
        explanations.append("✓ Fear-related keyword detected (e.g., 'threat', 'danger')")
        score += 2

    if any(tok.lemma_.lower() in preventive_keywords for tok in doc):
        explanations.append("✓ Preventive action verb detected (e.g., 'prevent', 'stop')")
        score += 2

    if any(tok.lemma_ in causal_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Causal/preventive verb from FrameNet detected")
        score += 2

    if any(tok.text.lower() in {"if", "unless"} for tok in doc):
        explanations.append("✓ Conditional clause found (e.g., 'if', 'unless')")
        score += 1

    if any(is_fear_related(tok) for tok in doc if tok.pos_ in {"NOUN", "VERB", "ADJ"}):
        explanations.append("✓ Semantic fear-related concept detected via WordNet")
        score += 2

    label = "Strong Fear Appeal" if score >= 6 else "Weak/Partial Fear Appeal" if score >= 4 else "Not Fear Appeal"
    return label, score, explanations


# Final method for preprocessing and filtering for all types

In [12]:
def classify_schema(row):
    question = row["question"]
    results = []


    cte_label, cte_score, _ = detect_cause_to_effect(question)
    if cte_score >= 7:
        results.append(("CauseToEffect", cte_score))
    elif cte_score >= 4:
        results.append(("CauseToEffect", cte_score))

    expert_label, expert_score, _ = detect_expert_opinion(question)
    if expert_score >= 6:
        results.append(("ExpertOpinion", expert_score))
    elif expert_score >= 3:
        results.append(("ExpertOpinion", expert_score))

    analogy_label, analogy_score, _ = detect_analogy_question(question)
    if analogy_score >= 8:
        results.append(("Analogy", analogy_score))
    elif analogy_score >= 5:
        results.append(("Analogy", analogy_score))

    fear_label, fear_score, _ = detect_fear_appeal_question(question)
    if fear_score >= 6:
        results.append(("FearAppeal", fear_score))
    elif fear_score >= 4:
        results.append(("FearAppeal", fear_score))

    results.sort(key=lambda x: -x[1])


    schema_labels = [label for label, _ in results]
    while len(schema_labels) < 4:
        schema_labels.append("")

    return pd.Series({
        "schema_1": schema_labels[0],
        "schema_2": schema_labels[1],
        "schema_3": schema_labels[2],
        "schema_4": schema_labels[3],
    })

  
  

# TODO: @ CEDRIC  delte and run for all 3 train files (train_chunk1, 2, 3)
#filtered_df = filtered_df[:1000]
filtered_df[["schema_1", "schema_2", "schema_3", "schema_4"]] = filtered_df.apply(classify_schema, axis=1)
filtered_df

Unnamed: 0,category,context,question,context_token_len,schema_1,schema_2,schema_3,schema_4
1,0.0,alternate_viewpoints_perspectives: A parallel ...,What about nations who have nothing?,126,,,,
2,1.0,alternate_viewpoints_perspectives: It would be...,"If not, what about this is cringe exactly?",50,CauseToEffect,,,
3,2.0,alternate_viewpoints_perspectives: I do not un...,What about public surveillance cameras?,50,,,,
4,3.0,alternate_viewpoints_perspectives: There is a ...,How about allowing some students to go straigh...,222,ExpertOpinion,,,
5,4.0,alternate_viewpoints_perspectives: You conside...,What else do you imagine is necessary to be co...,144,,,,
...,...,...,...,...,...,...,...,...
304,303.0,"alternate_viewpoints_perspectives: Typically, ...",Are the highways a little less safe?,80,,,,
305,304.0,alternate_viewpoints_perspectives: I want to s...,What about cases where the mother is suicidal ...,85,FearAppeal,,,
306,305.0,alternate_viewpoints_perspectives: I'm sorry I...,Would the same things have happened under any ...,46,,,,
307,306.0,alternate_viewpoints_perspectives: I get that ...,How about someone from Jamaica with dark skin?,88,,,,


### final check: how many questions per category ?

In [14]:
from collections import Counter

all_schemas = filtered_df["schema_1"].tolist() + filtered_df["schema_2"].tolist() + filtered_df["schema_3"].tolist() + filtered_df["schema_4"].tolist()
all_schemas = [schema for schema in all_schemas if schema]  


schema_counts = Counter(all_schemas)

schema_summary = pd.DataFrame.from_dict(schema_counts, orient='index', columns=['count'])
schema_summary["percent"] = (schema_summary["count"] / len(filtered_df)) * 100
schema_summary = schema_summary.sort_values("count", ascending=False)

print("\nSchema classification distribution:")
print(schema_summary)



Schema classification distribution:
               count    percent
ExpertOpinion     36  12.000000
FearAppeal        26   8.666667
CauseToEffect     15   5.000000
Analogy           12   4.000000


In [None]:
# TODO: check for multiple assignments in final processed training data 