In [12]:
import re
import pandas as pd
from pathlib import Path
import json

In [18]:
# Load the JSON file
file_path = Path("../../../Data/Processed/validation.json")
with open(file_path, "r") as f:
    data = json.load(f)

# Extract questions into a DataFrame
questions = []
for intervention_data in data.values():
    for cq in intervention_data["cqs"]:
        questions.append({
            "id": cq["id"],
            "target": cq["cq"],
            "label": cq["label"]
        })

df = pd.DataFrame(questions)

In [7]:
# Load the uploaded dataset
df = pd.read_csv('../../../Data/Processed/SocraticQ/test.csv')

In [19]:
def classify_blooms(question):
    question = question.lower()
    blooms_keywords = {
        "Remember": ["define", "list", "name", "identify", "recall"],
        "Understand": ["summarize", "explain", "describe", "interpret"],
        "Apply": ["apply", "use", "demonstrate", "solve"],
        "Analyze": ["compare", "contrast", "analyze", "differentiate", "distinguish"],
        "Evaluate": ["evaluate", "justify", "critique", "argue", "assess"],
        "Create": ["create", "design", "formulate", "develop", "propose"]
    }
    for level, keywords in reversed(blooms_keywords.items()):  # start from top
        if any(kw in question for kw in keywords):
            return level
    return "Unknown"

def detect_socratic_types(question):
    socratic_patterns = {
        "Clarification": ["what do you mean", "could you explain", "can you clarify"],
        "Assumption": ["what are you assuming", "underlying assumption", "based on assumption"],
        "Evidence": ["what evidence", "how do you know", "based on what"],
        "Perspective": ["what is another", "how might someone else", "different viewpoint"],
        "Implication": ["what are the implications", "what might happen", "what follows if"],
        "Meta": ["why is this question", "what is the point of asking"]
    }
    matches = []
    for typ, patterns in socratic_patterns.items():
        if any(re.search(pat, question.lower()) for pat in patterns):
            matches.append(typ)
    return matches

def estimate_dok(question):
    question = question.lower()
    if any(x in question for x in ["define", "list", "name", "who", "what", "when"]):
        return 1
    elif any(x in question for x in ["explain", "summarize", "compare", "describe"]):
        return 2
    elif any(x in question for x in ["analyze", "justify", "evaluate", "why", "how"]):
        return 3
    elif any(x in question for x in ["design", "construct", "formulate", "propose"]):
        return 4
    return 0

def evaluate_paul_elder(question):
    question = question.lower()
    standards = {
        "clarity": len(question) > 10 and not question.strip().endswith("?") == False,
        "relevance": True,  # Assume relevance unless flagged
        "depth": any(x in question for x in ["why", "how", "challenge", "complex"]),
        "breadth": any(x in question for x in ["other perspective", "different", "compare"]),
        "logic": not any(x in question for x in ["nonsense", "contradiction"]),
        "significance": len(question.split()) > 5,
        "fairness": not any(x in question for x in ["always", "never", "biased"])
    }
    return standards

In [28]:
# Apply the evaluation functions to the dataset
results = []
for _, row in df.iterrows():
    q = row['target']
    bloom = classify_blooms(q)
    socratic = detect_socratic_types(q)
    dok = estimate_dok(q)
    paul_elder = evaluate_paul_elder(q)
    total_pe_score = sum(paul_elder.values())

    # Final composite criticality score (example weights)
    bloom_score = {"Remember": 1, "Understand": 2, "Apply": 3, "Analyze": 4, "Evaluate": 5, "Create": 6}.get(bloom, 0)
    criticality_score = bloom_score * 1.0 + len(socratic) * 0.7 + dok * 1.0 + total_pe_score * 0.5
    label = "Useful" if criticality_score >= 10 else "Unhelpful" if criticality_score >= 6 else "Invalid"

    results.append({
        "question": q,
        "bloom_level": bloom,
        "socratic_types": socratic,
        "dok_level": dok,
        "paul_elder_score": total_pe_score,
        "criticality_score": round(criticality_score, 2),
        "label": label,
        "label_shared_task":row["label"]
    })

results_df = pd.DataFrame(results)

In [29]:
results_df.head()

Unnamed: 0,question,bloom_level,socratic_types,dok_level,paul_elder_score,criticality_score,label,label_shared_task
0,How would you address potential criticisms and...,Unknown,[],1,6,4.0,Invalid,Invalid
1,Are there other relevant goals that conflict w...,Unknown,[],0,5,2.5,Invalid,Unhelpful
2,What is the proposed plan for making the econo...,Create,[],1,5,9.5,Unhelpful,Useful
3,What specific policies would you implement to ...,Unknown,[],1,6,4.0,Invalid,Invalid
4,Could Clinton investing in you have consequenc...,Unknown,[],0,5,2.5,Invalid,Useful


In [30]:
# Calculate overall summary statistics
summary = results_df['label'].value_counts().to_dict()

# Build a readable summary string
summary_text = "\n".join([f"{label}: {count} questions" for label, count in summary.items()])
summary_text = f"**Critical Question Summary:**\n\n{summary_text}"

summary_text

'**Critical Question Summary:**\n\nInvalid: 91 questions\nUnhelpful: 34 questions\nUseful: 8 questions'

In [31]:
# Add a new column indicating whether the labels match
results_df['match'] = results_df['label'] == results_df['label_shared_task']

# Summary: count of matches and mismatches
match_summary = results_df['match'].value_counts()
match_percentage = results_df['match'].value_counts(normalize=True) * 100

# Combine both into a single DataFrame
summary_df = pd.DataFrame({
    "Count": match_summary,
    "Percentage": match_percentage
}).rename(index={True: "Match", False: "Mismatch"})

# Display the result
print(summary_df)

          Count  Percentage
match                      
Mismatch     97   72.932331
Match        36   27.067669
