<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/1_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/1_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final preprocessing of full data set for 4 question types (! STRONG threshold here!)

## Setup
First we define some constant values and also install all needed libraries

### Installation

In [1]:
import plotly.graph_objects as go
from collections import Counter
import pandas as pd
import spacy
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
import logging
import os

nltk.download('wordnet')
nltk.download('framenet_v17')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package framenet_v17 to /root/nltk_data...
[nltk_data]   Unzipping corpora/framenet_v17.zip.


### Colab
This part is only relevant when using the notebook in google colab

In [4]:
from google.colab import userdata, drive

Clone GitHub Repository to directly push generated files

In [5]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1565, done.[K
remote: Counting objects: 100% (213/213), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 1565 (delta 165), reused 130 (delta 110), pack-reused 1352 (from 2)[K
Receiving objects: 100% (1565/1565), 55.17 MiB | 26.61 MiB/s, done.
Resolving deltas: 100% (895/895), done.


### Path Variables and Logger

In [8]:
################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

MIN_CONTEXT_LENGTH = 25

################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

raw_socratiq_chunk_1_path = "/content/NLP2025_CQG/Data/Raw/SocraticQ/train_chunk_I.csv"
raw_socratiq_chunk_2_path = "/content/NLP2025_CQG/Data/Raw/SocraticQ/train_chunk_II.csv"
raw_socratiq_chunk_3_path = "/content/NLP2025_CQG/Data/Raw/SocraticQ/train_chunk_III.csv"

processed_sft_dataset_path = "/content/NLP2025_CQG/Data/Processed/CQ SFT Dataset.json"
processed_dpo_dataset_path = "/content/NLP2025_CQG/Data/Processed/CQ DPO Dataset.json"
processed_full_dataset_path = "/content/NLP2025_CQG/Data/Processed/CQ FULL Dataset.json"

log_file_path = f"/content/NLP2025_CQG/Logs/data_preprocessing.log"


################################################################################
#######################   LOGGER                ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

## Preprocessing

### lentgh filtering
In this step we filter all very small questions from the dataset.

In [47]:
logger.info("Start Preprocessing...")
logger.info(f"Start length filtering with min context size: {MIN_CONTEXT_LENGTH}")

# Define paths to the input files
input_files = [
    raw_socratiq_chunk_1_path,
    raw_socratiq_chunk_2_path,
    raw_socratiq_chunk_3_path
]

# Load and combine all CSVs
dataframes = []
for file_path in input_files:
    df = pd.read_csv(
        file_path,
        names=["category", "context", "question"]
    )
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)


### Only for debuging purpose
combined_df = combined_df.head(200).copy()

# Compute context token lengths
combined_df["context_token_len"] = combined_df["context"].apply(lambda text: len(nlp(str(text))))

# Filter rows with context token length >= 25
filtered_df = combined_df[combined_df["context_token_len"] >= MIN_CONTEXT_LENGTH].copy()

# Print stats
print(f"Total rows: {len(combined_df)}")
print(f"Rows after filtering: {len(filtered_df)}")
logger.info(f"Total rows: {len(combined_df)}")
logger.info(f"Rows after filtering: {len(filtered_df)}")

# Save to new CSV
#filtered_df.to_csv("filtered_socraticq.csv", index=False)

INFO:__main__:Start Preprocessing...
INFO:__main__:Start length filtering with min context size: 25
INFO:__main__:Total rows: 200
INFO:__main__:Rows after filtering: 194


Total rows: 200
Rows after filtering: 194


### Argumentative schmeme scoring
In this section each entry in the dataset is scored and evaluated according four different argumentative schemes

#### 1. cause to effect

In [11]:
def get_causal_verbs_from_framenet():
    causal_frame_names = [
        "Causation", "Cause_change", "Cause_change_of_position_on_a_scale",
        "Cause_motion", "Cause_to_amalgamate", "Cause_to_start", "Cause_to_make_progress",
        "Causation_scenario", "Cause_to_end", "Cause_to_resume",
        "Cause_to_continue", "Cause_change_of_consistency","Cause_expansion","Cause_impact"
    ]

    causal_verbs = set()
    for frame_name in causal_frame_names:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:  # Only verbs
                    causal_verbs.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Error loading frame '{frame_name}': {e}")

    return causal_verbs


causal_meta_terms = {"generalisation", "implies", "entail", "necessitate", "follow from", "inference"}
alternative_factor_terms = {"factor", "interfere", "influence", "affect", "contribute", "complicate"}


def detect_cause_to_effect(sentence):
    doc = nlp(sentence)
    explanations = []
    score = 0

    causal_verbs = get_causal_verbs_from_framenet()

    has_condition = any(tok.dep_ == "mark" and tok.text.lower() in {"if", "when"} for tok in doc)
    if has_condition:
        explanations.append("✓ Conditional clause detected (e.g., 'if', 'when')")
        score += 3

    has_advcl = any(tok.dep_ == "advcl" for tok in doc)
    if has_advcl:
        explanations.append("✓ Adverbial clause (likely effect clause) detected")
        score += 2

    has_causal_verb_structure = False
    for tok in doc:
        if tok.lemma_ in causal_verbs and tok.pos_ == "VERB":
            subj = any(child.dep_ == "nsubj" for child in tok.children)
            obj = any(child.dep_ == "dobj" for child in tok.children)
            prep = any(child.dep_ == "prep" for child in tok.children)
            if subj or obj or prep:
                has_causal_verb_structure = True
                explanations.append(
                    f"✓ Verb '{tok.lemma_}' is listed in FrameNet under causal frames with subject/object/prep"
                )
                score += 3
                if subj: score += 0.5
                if obj: score += 0.5
                if prep: score += 0.5
                break

    if any(tok.lemma_ in causal_meta_terms for tok in doc):
      explanations.append("✓ Causal generalisation or implication term detected (e.g., 'implies', 'generalisation')")
      score += 1

    if any(tok.lemma_ in alternative_factor_terms for tok in doc):
      explanations.append("✓ Terms indicating alternative causes or interfering factors detected")
      score += 1

    is_causal = has_condition and has_advcl or has_causal_verb_structure
    if not is_causal:
        causal_phrases = ["result in", "lead to", "may cause", "because of", "due to","given rise to","resulting from", "stemming from", "driven by", "caused by", "attributed to", "stems from", "reason", "result of", "consequence of", "owning to", "thus", "so", "therefore", "hence"  "thereby"]
        if any(phrase in sentence.lower() for phrase in causal_phrases):
            explanations.append("✓ Phrase pattern matches known cause-to-effect trigger")
            score += 2

    score = min(score, 10)
    label = "Strong CauseToEffect" if score >= 7 else "Weak/Partial CauseToEffect" if score >= 4 else "Not CauseToEffect"
    logger.info(f"CauseToEffect --> Score: {score}, Label: {label}, Explanations: {explanations}")
    return label, score, explanations

#### 2. expert opinion

In [12]:
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except Exception as e:
            print(f"Warning: Could not load frame '{frame_name}': {e}")
    return terms


expert_frames = [
    "Expertise", "Judgment_communication", "Opinion",
    "Authority", "Statement", "Certainty"
]
quote_frames = ["Statement", "Judgment_communication"]
clarity_frames = ["Reasoning"]
evidence_frames = ["Evidence", "Certainty", "Causation"]


expert_verbs = get_lexical_units_from_frames(expert_frames)
quote_verbs = get_lexical_units_from_frames(quote_frames)
clarity_terms = get_lexical_units_from_frames(clarity_frames)
evidence_terms = get_lexical_units_from_frames(evidence_frames)

def detect_expert_opinion(question):

    doc = nlp(question)
    score = 0
    explanations = []

    expert_titles = {"expert", "researcher", "scientist", "doctor", "analyst", "professor", "Dr."}

    implicit_expert_terms = {"study", "research", "evidence", "report", "findings", "scientific", "government", "official", "paper", "survey", "data"}
    comparison_cues = {"consistent", "align", "similar", "agree", "disagree", "corroborate", "conflict"}
    technical_request_verbs = {"define", "explain", "describe", "elaborate", "clarify"}
    assertion_verbs = {"assert", "affirm", "pronounce", "declare", "maintain", "claim", "state"}
    reference_terms = {"quote", "reference", "cite", "check", "verify", "source"}
    domain_terms = {"science", "scientific", "domain", "field", "discipline", "area", "sector"}


    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            if any(title in ent.text.lower() for title in expert_titles):
                explanations.append(f"✓ Expert entity detected: '{ent.text}'")
                score += 3
                break

    if any(tok.lemma_ in expert_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Detected expert-related verb from FrameNet")
        score += 2

    if any(tok.lemma_ in quote_verbs for tok in doc):
        explanations.append("✓ Quotation or claim verb found")
        score += 1

    if any(tok.lemma_ in clarity_terms for tok in doc):
        explanations.append("✓ Clarity/definition markers found")
        score += 1

    if any(tok.lemma_ in evidence_terms for tok in doc):
        explanations.append("✓ Evidence or support-related terms found")
        score += 2

    if any(tok.lemma_.lower() in implicit_expert_terms for tok in doc):
      explanations.append("✓ Implicit expert-related term detected (e.g., 'study', 'government')")
      score += 2

    if any(tok.lemma_.lower() in comparison_cues for tok in doc):
      explanations.append("✓ Cross-study comparison term detected (e.g., 'consistent', 'similar')")
      score += 0.5

    if any(tok.lemma_.lower() in technical_request_verbs for tok in doc):
      explanations.append("✓ Technical explanation request detected (e.g., 'define', 'explain')")
      score += 1

    if any(tok.dep_ == "attr" and tok.lemma_ == "expert" for tok in doc):
      explanations.append("✓ Predicate nominative indicating expertise detected (e.g., 'X is an expert')")
      score += 2

    if any(tok.lemma_.lower() in assertion_verbs for tok in doc):
      explanations.append("✓ Assertion or claim verb detected (e.g., 'assert', 'affirm')")
      score += 1

    if any(tok.lemma_.lower() in reference_terms for tok in doc):
      explanations.append("✓ Source/reference validation term detected (e.g., 'quote', 'reference')")
      score += 1

    if any(tok.lemma_.lower() in domain_terms for tok in doc):
      explanations.append("✓ Domain relevance indicator detected (e.g., 'science', 'domainD')")
      score += 1

    score = min(score, 10)
    label = "Strong Expert Opinion" if score >= 7 else "Weak/Partial Expert Opinion" if score >= 4 else "Not Expert Opinion"
    logger.info(f"ExpertOpinion --> Score: {score}, Label: {label}, Explanations: {explanations}")
    return label, score, explanations

#### 3. Analogy detection

In [13]:
analogy_synsets = [wn.synset('similar.a.01'), wn.synset('analogy.n.01'), wn.synset('compare.v.01')]

comparison_frames = ["Similarity"]
contrast_frames = ["Categorization"]
evidence_frames = ["Evidence", "Judgment_communication"]

comparison_verbs = get_lexical_units_from_frames(comparison_frames)
contrast_verbs = get_lexical_units_from_frames(contrast_frames)
evidence_verbs = get_lexical_units_from_frames(evidence_frames)

def is_semantically_analogical(word_token):
    token_synsets = wn.synsets(word_token.lemma_)
    for s in token_synsets:
        for analogy_syn in analogy_synsets:
            if s.path_similarity(analogy_syn) and s.path_similarity(analogy_syn) > 0.3:
                return True
    return False

analogy_context_cues = {"respect", "in which", "such that", "with regard to", "in terms of"}

analogy_force_cues = {"undermine", "weaken", "strengthen", "force of similarity", "degree of analogy"}

analogy_nouns = {"analogy", "comparison", "parallel", "similarity", "analogue"}

def detect_analogy_question(question):
    doc = nlp(question)
    score = 0
    explanations = []
    noun_chunks = list(doc.noun_chunks)

    if any(tok.lemma_ in comparison_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Comparison verb detected from FrameNet")
        score += 2.5

    entity_tokens = [tok for tok in doc if tok.pos_ in {"PROPN", "NOUN"}]
    if len(set(tok.lemma_ for tok in entity_tokens)) >= 2:
        explanations.append("✓ Contains at least two distinct concepts/entities")
        score += 1

    if any(tok.lemma_ in contrast_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Contrast or difference verb detected from FrameNet")
        score += 1

    if any(tok.lemma_ in evidence_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Evidence or justification verb found")
        score += 1

    if any(tok.tag_ == "MD" for tok in doc):
        score += 0.5

    if len(noun_chunks) >= 2 and any(tok.lemma_ in {"similar", "like", "as"} for tok in doc):
        explanations.append("✓ Two concepts compared with similarity cue (e.g., 'similar', 'like')")
        score += 3

    if any(tok.text.lower() == "if" for tok in doc):
        explanations.append("✓ Conditional structure suggesting hypothetical reasoning")
        score += 1

    if any(is_semantically_analogical(tok) for tok in doc if tok.pos_ in {"ADJ", "NOUN", "VERB"}):
        explanations.append("✓ Semantic similarity to analogy-related terms detected via WordNet")
        score += 2

    if any(tok.dep_ in {"prep", "relcl"} and tok.lemma_ in {"compare", "similar"} for tok in doc):
        explanations.append("✓ Syntactic cue of analogy (e.g., 'compared with', 'similar to')")
        score += 1

    if any(phrase in question.lower() for phrase in analogy_context_cues):
      explanations.append("✓ Contextual analogy marker detected (e.g., 'in which', 'such that')")
      score += 0.5

    if any(tok.lemma_ in analogy_force_cues for tok in doc):
      explanations.append("✓ Analogy evaluation term detected (e.g., 'undermine', 'strengthen')")
      score += 0.5

    if any(tok.lemma_ in analogy_nouns for tok in doc if tok.pos_ == "NOUN"):
      explanations.append("✓ Explicit analogy noun detected (e.g., 'analogy', 'comparison')")
      score += 2

    if any(tok.dep_ == "neg" for tok in doc):
      if any(tok.lemma_ in {"similar", "compare", "alike", "match"} for tok in doc):
          explanations.append("✓ Negated comparison detected (suggesting analogy breakdown)")
          score += 1

    score = min(score, 10)
    label = "Strong Analogy Question" if score >= 7 else "Weak/Partial Analogy Question" if score >= 4 else "Not Analogy Question"
    logger.info(f"Analogy --> Score: {score}, Label: {label}, Explanations: {explanations}")
    return label, score, explanations

#### 4. Fear appeal

In [14]:
def is_fear_related(word_token):
    syns = wn.synsets(word_token.lemma_)
    for s in syns:
        if any(s.path_similarity(wn.synset('danger.n.01')) or s.path_similarity(wn.synset('problem.n.01')) or
               s.path_similarity(wn.synset('fear.n.01')) or s.path_similarity(wn.synset('harm.n.01')) or
               s.path_similarity(wn.synset('threat.n.01')) for s in syns):
            return True
    return False


# ---- FrameNet Utility ----
def get_lexical_units_from_frames(frames):
    terms = set()
    for frame_name in frames:
        try:
            frame = fn.frame_by_name(frame_name)
            for lu in frame.lexUnit.values():
                if '.v' in lu['name']:
                    terms.add(lu['name'].split('.')[0])
        except:
            continue
    return terms

# ---- Relevant Lexical Resources ----
causal_frames = ["Causation", "Cause_to_start", "Preventing", "Risk", "Threaten", "Danger"]
causal_verbs = get_lexical_units_from_frames(causal_frames)

fear_keywords = {"danger", "threat", "risky", "harm", "catastrophe", "crisis", "ruin", "fear", "worse", "bad", "fatal", "negative", "die", "death"}
preventive_keywords = {"prevent", "avoid", "stop", "ban", "rescue", "save", "protect"}

urgency_keywords = {"immediately", "soon", "before it's too late", "critical", "urgent", "suddenly", "unexpectedly"}

possibility_terms = {"possible", "possibility", "likely", "likelihood", "chance", "probability", "conceivable", "potential", "can", "could", "might", "may", "able"}


def detect_fear_appeal_question(question):
    doc = nlp(question)
    score = 0
    explanations = []

    if any(tok.lemma_.lower() in fear_keywords for tok in doc):
        explanations.append("✓ Fear-related keyword detected (e.g., 'threat', 'danger')")
        score += 3

    if any(tok.lemma_.lower() in preventive_keywords for tok in doc):
        explanations.append("✓ Preventive action verb detected (e.g., 'prevent', 'stop')")
        score += 2

    if any(tok.lemma_ in causal_verbs for tok in doc if tok.pos_ == "VERB"):
        explanations.append("✓ Causal/preventive verb from FrameNet detected")
        score += 2

    if any(tok.text.lower() in {"if", "unless"} for tok in doc):
        explanations.append("✓ Conditional clause found (e.g., 'if', 'unless')")
        score += 1

    if any(is_fear_related(tok) for tok in doc if tok.pos_ in {"NOUN", "VERB", "ADJ"}):
        explanations.append("✓ Semantic fear-related concept detected via WordNet")
        score += 2

    if any(phrase in question.lower() for phrase in urgency_keywords):
        explanations.append("✓ Urgency marker detected (e.g., 'immediately', 'before it's too late')")
        score += 1

    if any(tok.lemma_ in possibility_terms for tok in doc):
        explanations.append("✓ Possibility-related term detected (e.g., 'possible', 'feasible', 'chance')")
        score += 1

    score = min(score, 10)
    label = "Strong Fear Appeal" if score >= 7 else "Weak/Partial Fear Appeal" if score >= 4 else "Not Fear Appeal"
    logger.info(f"FearApeal --> Score: {score}, Label: {label}, Explanations: {explanations}")
    return label, score, explanations

## Final method for preprocessing and filtering for all types

In [48]:
def classify_schema(row):
    question = row["question"]

    is_critical = False

    _, cte_score, _ = detect_cause_to_effect(question)
    _, expert_score, _ = detect_expert_opinion(question)
    _, analogy_score, _ = detect_analogy_question(question)
    _, fear_score, _ = detect_fear_appeal_question(question)

    if cte_score >= 7:
      is_critical = True

    if expert_score >= 7:
      is_critical = True

    if analogy_score >= 7:
      is_critical = True

    if fear_score >= 7:
       is_critical = True

    return pd.Series({
        "is_Critical": is_critical,
        "CauseToEffect": cte_score,
        "ExpertOpinion": expert_score,
        "Analogy": analogy_score,
        "FearAppeal": fear_score,
    })

In [None]:
logger.info("--- Start evaluation with Argumentative schemes -----")
filtered_df[["is_Critical","CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]] = filtered_df.apply(classify_schema, axis=1)

### final check: how many questions per category ?

In [50]:
value_counts = filtered_df['is_Critical'].value_counts(dropna=False)
print(value_counts)

columns = ["CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]

value_counts_schema = filtered_df[columns].apply(pd.Series.value_counts, dropna=False)
print(value_counts_schema)
logger.info(f"Value counts per Schema: {value_counts_schema}")

INFO:__main__:Value counts per Schema:      CauseToEffect  ExpertOpinion  Analogy  FearAppeal
0.0          131.0          138.0     17.0         6.0
0.5            NaN            2.0      1.0         NaN
1.0            1.0            6.0     18.0         NaN
1.5            NaN            NaN      1.0         NaN
2.0           40.0           26.0     25.0       151.0
2.5            NaN            NaN     10.0         NaN
3.0            1.0           13.0     86.0        17.0
3.5           10.0            NaN     16.0         NaN
4.0            6.0            7.0     10.0        13.0
4.5            NaN            NaN      1.0         NaN
5.0            2.0            1.0      2.0         6.0
5.5            2.0            NaN      1.0         NaN
6.0            1.0            1.0      5.0         NaN
6.5            NaN            NaN      1.0         NaN
7.0            NaN            NaN      NaN         1.0


is_Critical
False    193
True       1
Name: count, dtype: int64
     CauseToEffect  ExpertOpinion  Analogy  FearAppeal
0.0          131.0          138.0     17.0         6.0
0.5            NaN            2.0      1.0         NaN
1.0            1.0            6.0     18.0         NaN
1.5            NaN            NaN      1.0         NaN
2.0           40.0           26.0     25.0       151.0
2.5            NaN            NaN     10.0         NaN
3.0            1.0           13.0     86.0        17.0
3.5           10.0            NaN     16.0         NaN
4.0            6.0            7.0     10.0        13.0
4.5            NaN            NaN      1.0         NaN
5.0            2.0            1.0      2.0         6.0
5.5            2.0            NaN      1.0         NaN
6.0            1.0            1.0      5.0         NaN
6.5            NaN            NaN      1.0         NaN
7.0            NaN            NaN      NaN         1.0


In [51]:
value_counts_schema = filtered_df[columns].apply(pd.Series.value_counts, dropna=False)

per_column_gte_7 = value_counts_schema[value_counts_schema.index >= 7].sum()

print("Entries with score >= 7 per column:")
print(per_column_gte_7)
logger.info(f"Entries with score >= 7 per column: {per_column_gte_7}")

INFO:__main__:Entries with score >= 7 per column: CauseToEffect    0.0
ExpertOpinion    0.0
Analogy          0.0
FearAppeal       1.0
dtype: float64


Entries with score >= 7 per column:
CauseToEffect    0.0
ExpertOpinion    0.0
Analogy          0.0
FearAppeal       1.0
dtype: float64


## Save and filter datasets

In [52]:
df_to_save = filtered_df.copy()

df_to_save['id'] = range(1, len(df_to_save) + 1)

columns_to_save = [
    'id',
    'context',
    'question',
    'context_token_len',
    'is_Critical',
    'CauseToEffect',
    'Analogy',
    'ExpertOpinion',
    'FearAppeal'
]

# Save the full dataset with scores per entry per scheme
df_to_save[columns_to_save].to_json(processed_full_dataset_path, orient='records', indent=2)

#### SFT Dataset
Create a daset usable during SFT Training. This dataset needs for every relevant scheme one entry.

In [53]:
full_scored_df = pd.read_json(processed_full_dataset_path, orient='records')

schema_cols = ['CauseToEffect', 'ExpertOpinion', 'Analogy', 'FearAppeal']

long_df = filtered_df.melt(
    id_vars=['context', 'question'],
    value_vars=schema_cols,
    var_name='schema',
    value_name='score'
)

qualified = long_df[long_df['score'] >= 7].copy()

qualified = qualified.drop(columns=['score'])

mask_no_schema = filtered_df[schema_cols].max(axis=1) < 7

no_schema_df = filtered_df.loc[mask_no_schema, ['context', 'question']].copy()
no_schema_df['schema'] = ''

final_df = pd.concat([qualified, no_schema_df], ignore_index=True)

final_df.shape

(194, 3)

In [54]:
final_df = final_df.reset_index(drop=True)
final_df['id'] = final_df.index + 1

final_df = final_df[['id', 'context', 'question', 'schema']]

# Save the filtered dataset
final_df.to_json(processed_sft_dataset_path, orient='records', indent=2)

Not used but with this code it is possible to create a subset of questiony scored above a certain treshhold.

In [28]:
'''
full_scored_df = pd.read_json(processed_full_dataset_path, orient='records')

score_columns = ['CauseToEffect', 'Analogy', 'ExpertOpinion', 'FearAppeal']

# Filter rows where any score is greater than 7
filtered_df = full_scored_df[full_scored_df[score_columns].gt(7).any(axis=1)].copy()

# Reset the index and update the 'id' field to start from 1
filtered_df.reset_index(drop=True, inplace=True)
filtered_df['id'] = filtered_df.index + 1

filtered_df.to_json("path goese here", orient='records', indent=2)
'''

## analyse dataset

In [55]:
df = pd.read_json(processed_full_dataset_path, orient='records')
df.shape

(194, 9)

In [56]:
score_columns = ['CauseToEffect', 'Analogy', 'ExpertOpinion', 'FearAppeal']

score_counts = pd.DataFrame()

for col in score_columns:
    counts = df[col].value_counts().sort_index()
    score_counts[col] = counts

score_counts = score_counts.fillna(0).astype(int)

print(score_counts)

               CauseToEffect  Analogy  ExpertOpinion  FearAppeal
CauseToEffect                                                   
0.0                      131       17            138           6
1.0                        1       18              6           0
2.0                       40       25             26         151
3.0                        1       86             13          17
3.5                       10       16              0           0
4.0                        6       10              7          13
5.0                        2        2              1           6
5.5                        2        1              0           0
6.0                        1        5              1           0


## Git

In [None]:
os.chdir("NLP2025_CQG")
!ls

In [None]:
!git config --global user.name "Showcas"
!git config --global user.email "cedric.bohni@gmx.de"


commit_message = f"Improved Preprocessing file"
!git add .
!git commit -m "{commit_message}"
!git push