## Response EDA

In [1]:
from pathlib import Path
import json

# Locate the response file anywhere below the working directory, so the
# notebook works whether it is started from the repository root or from a
# sub-folder. The matches are sorted so that the same file is picked on every
# run when several copies exist.
RESPONSE_FILENAME = "response_v4_9.json"
candidates = sorted(Path.cwd().rglob(RESPONSE_FILENAME))
if candidates:
    path = candidates[0]
else:
    # Nothing found by the recursive search: fall back to the repo-relative location.
    path = Path("Causal_extractor/data_extract/output") / RESPONSE_FILENAME
if not path.exists():
    # Report the file that was actually searched for, not an older version name.
    raise FileNotFoundError(f"{RESPONSE_FILENAME} not found. Checked: {path}")

with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The rest of the notebook treats `data` as a list of record dicts.
if not isinstance(data, list):
    raise ValueError("Expected JSON file to contain a list of records")

n = len(data)
# Every field name that occurs in at least one record.
keys = {field for rec in data for field in rec.keys()}


def _is_na(value):
    """Return True only for strings that equal "N/A" once stripped."""
    return isinstance(value, str) and value.strip() == "N/A"


# Per field: positions of the records whose value is the literal "N/A".
# Only that string is counted; None values and absent keys are not.
na_indices = {
    field: [pos for pos, rec in enumerate(data) if _is_na(rec.get(field, ""))]
    for field in keys
}

# One line per field: count, share of all records, and the affected positions.
for field in sorted(keys):
    hits = na_indices[field]
    share = 100 * len(hits) / n if n else 0
    print(f"{field}: {len(hits)}/{n} ({share:.1f}%)  indices={hits}")

explicit_type: 0/18 (0.0%)  indices=[]
marked_type: 6/18 (33.3%)  indices=[0, 10, 11, 12, 14, 17]
marker: 0/18 (0.0%)  indices=[]
object: 0/18 (0.0%)  indices=[]
pattern_type: 0/18 (0.0%)  indices=[]
relationship: 0/18 (0.0%)  indices=[]
sentence_type: 0/18 (0.0%)  indices=[]
source_text: 0/18 (0.0%)  indices=[]
subject: 0/18 (0.0%)  indices=[]


In [2]:
import pandas as pd
from IPython.display import display

# Turn the loaded list of record dicts into a DataFrame.
df = pd.DataFrame(data)
row_count, column_count = df.shape
print(f"Loaded {row_count} rows, {column_count} columns")

# Optional follow-ups, left disabled:
#   df.to_csv("response_table_preview.csv", index=False)  # persist the table
#   display(df)                                           # explicit render
# The bare expression below makes the notebook render the frame as a table
# (use df.head(n) instead for a shorter preview).
df

Loaded 18 rows, 9 columns


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,E,Trash truck schedule -> Is agreed upon by Bang...,,The truck schedule,Agreed upon by Bangkok and university,The schedule of the truck seems to be agreed o...
1,C,SB,M,E,Some event -> Causes an unusual amount of trash,causes,Some event,An unusual amount of trash,university staff will call for trash trucks on...
2,C,SB,U,E,An unusual amount of trash -> University staff...,,An unusual amount of trash,University staff call for trash trucks,university staff will call for trash trucks on...
3,C,ES,M,E,Limited space in university -> Garbage buffer ...,due to,limited space in our university,the garbage buffer,"Also due to limited space in our university, t..."
4,C,SB,U,E,Storing trash in a buffer -> Prevents normal c...,,Garbage buffer storing trash,Normal collection points overflowing,the garbage buffer is created to store trash p...
5,C,ES,U,I,Universities accept more students -> Challenge...,,Universities accept more students,Challenges,And these challenges emerge as universities ac...
6,C,ES,U,E,Increase in the number of students -> Increase...,,Increase in the number of students,The number of trash producers,Increase in the number of students also increa...
7,C,OT,U,E,The most important factor -> Makes the project...,,The most important factor,The project in general fail,He mentions that the most important factor tha...
8,C,OT,U,E,Gains after employing developed policy -> Cons...,,gains after employing developed policy,more monthly cost,Ex. gains after employing developed policy con...
9,C,OT,U,E,Gains after employing developed policy -> Give...,,gains after employing developed policy,insignificant improvement,gains after employing developed policy... give...


In [3]:
from collections import defaultdict
import pandas as pd
from IPython.display import display

def show_grouped_rows_table(data, key, max_rows_per_group=None):
    """Render one table per distinct value of ``key`` found in ``data``.

    Records are bucketed by ``rec.get(key, "")``, so records without ``key``
    are collected in the ``''`` group. Groups are visited in the order of the
    string form of their value, which keeps values of mixed types sortable.
    For every group a header line with the value and the row count is printed,
    then the rows are shown as a DataFrame through ``display``.

    Args:
        data: iterable of records offering ``.get`` (e.g. dicts).
        key: name of the field to group by; its values must be hashable
            because they are used as dictionary keys.
        max_rows_per_group: if not None, show at most this many rows of each
            group (the header still reports the full row count).

    Returns:
        None. The function only prints and displays.
    """
    groups = defaultdict(list)
    for rec in data:
        groups[rec.get(key, "")].append(rec)

    # A group exists only because a record was appended to it, so no group is
    # ever empty and no "(no rows)" branch is needed.
    for val in sorted(groups, key=str):
        rows = groups[val]
        print(f'\n--- {key!r} = {val!r} ({len(rows)} rows) ---')
        # Named `table` rather than `df` so it cannot be confused with the
        # notebook-level frame of all records.
        table = pd.DataFrame(rows)
        if max_rows_per_group is not None:
            table = table.head(max_rows_per_group)
        display(table)

# Show the records grouped by explicit_type: one table per distinct value,
# capped at 50 rows per table. The marked_type and object groupings are
# rendered by their own calls further down.
show_grouped_rows_table(data, "explicit_type", max_rows_per_group=50)


--- 'explicit_type' = 'E' (12 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,E,Trash truck schedule -> Is agreed upon by Bang...,,The truck schedule,Agreed upon by Bangkok and university,The schedule of the truck seems to be agreed o...
1,C,SB,M,E,Some event -> Causes an unusual amount of trash,causes,Some event,An unusual amount of trash,university staff will call for trash trucks on...
2,C,SB,U,E,An unusual amount of trash -> University staff...,,An unusual amount of trash,University staff call for trash trucks,university staff will call for trash trucks on...
3,C,ES,M,E,Limited space in university -> Garbage buffer ...,due to,limited space in our university,the garbage buffer,"Also due to limited space in our university, t..."
4,C,SB,U,E,Storing trash in a buffer -> Prevents normal c...,,Garbage buffer storing trash,Normal collection points overflowing,the garbage buffer is created to store trash p...
5,C,ES,U,E,Increase in the number of students -> Increase...,,Increase in the number of students,The number of trash producers,Increase in the number of students also increa...
6,C,OT,U,E,The most important factor -> Makes the project...,,The most important factor,The project in general fail,He mentions that the most important factor tha...
7,C,OT,U,E,Gains after employing developed policy -> Cons...,,gains after employing developed policy,more monthly cost,Ex. gains after employing developed policy con...
8,C,OT,U,E,Gains after employing developed policy -> Give...,,gains after employing developed policy,insignificant improvement,gains after employing developed policy... give...
9,A,SP,,E,We -> may need to include financial factor and...,,We,financial factor and staff sentiment,We may need to include financial factor and st...



--- 'explicit_type' = 'I' (6 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,ES,U,I,Universities accept more students -> Challenge...,,Universities accept more students,Challenges,And these challenges emerge as universities ac...
1,A,SB,,I,University -> calls BMA to collect garbage on ...,,University,BMA (Bangkok Metropolitan Administration),ถ้าขยะเยอะเกิน -> โทรเรียกกทมมาเก็บตาม case
2,F,ES,,I,Number of people in the university -> Increases,,Number of people in the university,Increase,ปริมาณคนที่มหาลัยเพิ่ม/หน้าที่ของแต่ละคน
3,C,SP,U,I,Management -> Policy changes,,Management (ผู้บริหาร),Policy,นโยบายเปลี่ยนตามผู้บริหาร
4,F,SB,,I,Housekeepers -> Collect waste from buildings,,Housekeepers (แม่บ้าน),Waste from buildings,ภายใน = อาคาร รวมโดยแม่บ้าน -> แม่บ้านแยก -> ใ...
5,C,SB,U,I,Staff working hours overlap with housekeeper w...,,"Staff working hours, Housekeeper working hours",Garbage overflowing,"พนักงานเก็บขยะ 3-4pm(เลิก 5pm), แม่บ้านเก็บ 5p..."


In [4]:
# Same grouped view keyed on marked_type (one table per distinct value,
# capped at 50 rows per table).
show_grouped_rows_table(data, "marked_type", max_rows_per_group=50)



--- 'marked_type' = 'M' (3 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,M,E,Some event -> Causes an unusual amount of trash,causes,Some event,An unusual amount of trash,university staff will call for trash trucks on...
1,C,ES,M,E,Limited space in university -> Garbage buffer ...,due to,limited space in our university,the garbage buffer,"Also due to limited space in our university, t..."
2,C,SB,M,E,Personnel -> Inaccurate weight measurements,เนื่องจาก,Personnel (บุคลากร),Inaccurate weight measurements (การคาดเคลื่อนก...,ปัญหา 1 -> การคาดเคลื่อนการชั่งน้ำหนักเนื่องจา...



--- 'marked_type' = 'N/A' (6 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,E,Trash truck schedule -> Is agreed upon by Bang...,,The truck schedule,Agreed upon by Bangkok and university,The schedule of the truck seems to be agreed o...
1,A,SP,,E,We -> may need to include financial factor and...,,We,financial factor and staff sentiment,We may need to include financial factor and st...
2,A,SB,,I,University -> calls BMA to collect garbage on ...,,University,BMA (Bangkok Metropolitan Administration),ถ้าขยะเยอะเกิน -> โทรเรียกกทมมาเก็บตาม case
3,F,ES,,I,Number of people in the university -> Increases,,Number of people in the university,Increase,ปริมาณคนที่มหาลัยเพิ่ม/หน้าที่ของแต่ละคน
4,F,SB,,I,Housekeepers -> Collect waste from buildings,,Housekeepers (แม่บ้าน),Waste from buildings,ภายใน = อาคาร รวมโดยแม่บ้าน -> แม่บ้านแยก -> ใ...
5,F,SB,,E,Waste collection cage size -> Is too small,,Waste collection cage size,Too small,ปัญหา 2 ขนาดกรงรวมขยะเล็กไป



--- 'marked_type' = 'U' (9 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,U,E,An unusual amount of trash -> University staff...,,An unusual amount of trash,University staff call for trash trucks,university staff will call for trash trucks on...
1,C,SB,U,E,Storing trash in a buffer -> Prevents normal c...,,Garbage buffer storing trash,Normal collection points overflowing,the garbage buffer is created to store trash p...
2,C,ES,U,I,Universities accept more students -> Challenge...,,Universities accept more students,Challenges,And these challenges emerge as universities ac...
3,C,ES,U,E,Increase in the number of students -> Increase...,,Increase in the number of students,The number of trash producers,Increase in the number of students also increa...
4,C,OT,U,E,The most important factor -> Makes the project...,,The most important factor,The project in general fail,He mentions that the most important factor tha...
5,C,OT,U,E,Gains after employing developed policy -> Cons...,,gains after employing developed policy,more monthly cost,Ex. gains after employing developed policy con...
6,C,OT,U,E,Gains after employing developed policy -> Give...,,gains after employing developed policy,insignificant improvement,gains after employing developed policy... give...
7,C,SP,U,I,Management -> Policy changes,,Management (ผู้บริหาร),Policy,นโยบายเปลี่ยนตามผู้บริหาร
8,C,SB,U,I,Staff working hours overlap with housekeeper w...,,"Staff working hours, Housekeeper working hours",Garbage overflowing,"พนักงานเก็บขยะ 3-4pm(เลิก 5pm), แม่บ้านเก็บ 5p..."


In [5]:
# Same grouped view keyed on object (capped at 50 rows per table).
# NOTE(review): in the output recorded with this notebook every object value is
# distinct, so this prints one single-row table per record — consider whether
# this grouping is still useful.
show_grouped_rows_table(data, "object", max_rows_per_group=50)


--- 'object' = 'Agreed upon by Bangkok and university' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,E,Trash truck schedule -> Is agreed upon by Bang...,,The truck schedule,Agreed upon by Bangkok and university,The schedule of the truck seems to be agreed o...



--- 'object' = 'An unusual amount of trash' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,M,E,Some event -> Causes an unusual amount of trash,causes,Some event,An unusual amount of trash,university staff will call for trash trucks on...



--- 'object' = 'BMA (Bangkok Metropolitan Administration)' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,A,SB,,I,University -> calls BMA to collect garbage on ...,,University,BMA (Bangkok Metropolitan Administration),ถ้าขยะเยอะเกิน -> โทรเรียกกทมมาเก็บตาม case



--- 'object' = 'Challenges' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,ES,U,I,Universities accept more students -> Challenge...,,Universities accept more students,Challenges,And these challenges emerge as universities ac...



--- 'object' = 'Garbage overflowing' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,U,I,Staff working hours overlap with housekeeper w...,,"Staff working hours, Housekeeper working hours",Garbage overflowing,"พนักงานเก็บขยะ 3-4pm(เลิก 5pm), แม่บ้านเก็บ 5p..."



--- 'object' = 'Inaccurate weight measurements (การคาดเคลื่อนการชั่งน้ำหนัก)' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,M,E,Personnel -> Inaccurate weight measurements,เนื่องจาก,Personnel (บุคลากร),Inaccurate weight measurements (การคาดเคลื่อนก...,ปัญหา 1 -> การคาดเคลื่อนการชั่งน้ำหนักเนื่องจา...



--- 'object' = 'Increase' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,ES,,I,Number of people in the university -> Increases,,Number of people in the university,Increase,ปริมาณคนที่มหาลัยเพิ่ม/หน้าที่ของแต่ละคน



--- 'object' = 'Normal collection points overflowing' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,U,E,Storing trash in a buffer -> Prevents normal c...,,Garbage buffer storing trash,Normal collection points overflowing,the garbage buffer is created to store trash p...



--- 'object' = 'Policy' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SP,U,I,Management -> Policy changes,,Management (ผู้บริหาร),Policy,นโยบายเปลี่ยนตามผู้บริหาร



--- 'object' = 'The number of trash producers' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,ES,U,E,Increase in the number of students -> Increase...,,Increase in the number of students,The number of trash producers,Increase in the number of students also increa...



--- 'object' = 'The project in general fail' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,OT,U,E,The most important factor -> Makes the project...,,The most important factor,The project in general fail,He mentions that the most important factor tha...



--- 'object' = 'Too small' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,E,Waste collection cage size -> Is too small,,Waste collection cage size,Too small,ปัญหา 2 ขนาดกรงรวมขยะเล็กไป



--- 'object' = 'University staff call for trash trucks' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,SB,U,E,An unusual amount of trash -> University staff...,,An unusual amount of trash,University staff call for trash trucks,university staff will call for trash trucks on...



--- 'object' = 'Waste from buildings' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,F,SB,,I,Housekeepers -> Collect waste from buildings,,Housekeepers (แม่บ้าน),Waste from buildings,ภายใน = อาคาร รวมโดยแม่บ้าน -> แม่บ้านแยก -> ใ...



--- 'object' = 'financial factor and staff sentiment' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,A,SP,,E,We -> may need to include financial factor and...,,We,financial factor and staff sentiment,We may need to include financial factor and st...



--- 'object' = 'insignificant improvement' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,OT,U,E,Gains after employing developed policy -> Give...,,gains after employing developed policy,insignificant improvement,gains after employing developed policy... give...



--- 'object' = 'more monthly cost' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,OT,U,E,Gains after employing developed policy -> Cons...,,gains after employing developed policy,more monthly cost,Ex. gains after employing developed policy con...



--- 'object' = 'the garbage buffer' (1 rows) ---


Unnamed: 0,pattern_type,sentence_type,marked_type,explicit_type,relationship,marker,subject,object,source_text
0,C,ES,M,E,Limited space in university -> Garbage buffer ...,due to,limited space in our university,the garbage buffer,"Also due to limited space in our university, t..."


## Validation scores

In [6]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

# Load validation scores. Two locations are tried so the notebook works from
# either the data_extract folder or the repository root.
SCORE_PATH_CANDIDATES = (
    Path("output/validation_scores.json"),
    Path("Causal_extractor/data_extract/output/validation_scores.json"),
)
scores_path = next((p for p in SCORE_PATH_CANDIDATES if p.exists()), None)
if scores_path is None:
    # Fail with both locations listed instead of letting open() report only the last one.
    checked = ", ".join(str(p) for p in SCORE_PATH_CANDIDATES)
    raise FileNotFoundError(f"validation_scores.json not found. Checked: {checked}")

with open(scores_path, "r", encoding="utf-8") as f:
    validation_data = json.load(f)

# The statistics below iterate validation_data.items() and, per file, items.items().
if not isinstance(validation_data, dict):
    raise ValueError("Expected validation_scores.json to contain an object keyed by file name")

# Collect the scores of every item, overall and per file.
# Items without a score are skipped instead of being counted as 0: 0 is not a
# score anybody assigned, and including it would pull every statistic down.
all_scores = []
file_stats = {}
unscored_items = 0

# The loop variable names are left unchanged because later cells may read
# them after the loop has finished.
for file_name, items in validation_data.items():
    file_scores = []
    for item_id, item_data in items.items():
        raw_score = item_data.get("score")
        if raw_score is None or raw_score == "":
            unscored_items += 1
            continue
        score = int(raw_score)
        all_scores.append(score)
        file_scores.append(score)

    # Files without any scored item get no entry, so the summary statistics
    # are never computed on an empty list.
    if file_scores:
        file_stats[file_name] = {
            "count": len(file_scores),
            "mean": np.mean(file_scores),
            "std": np.std(file_scores),  # numpy default: population std (ddof=0)
            "median": np.median(file_scores),
            "min": min(file_scores),
            "max": max(file_scores)
        }

if unscored_items:
    print(f"⚠️ Skipped {unscored_items} item(s) without a score")

# Report header
banner = "=" * 60
print(banner, "📊 VALIDATION SCORES STATISTICS", banner, sep="\n")

# Overall figures and a text histogram; skipped entirely when nothing was scored.
total = len(all_scores)
if total:
    print(f"\n📈 Overall Statistics (n={total}):")
    print(f"   Mean:   {np.mean(all_scores):.2f}")
    print(f"   Median: {np.median(all_scores):.2f}")
    print(f"   Std:    {np.std(all_scores):.2f}")
    print(f"   Min:    {min(all_scores)}")
    print(f"   Max:    {max(all_scores)}")

    # Occurrences of each score value, in ascending score order.
    print("\n📊 Score Distribution:")
    score_counts = pd.Series(all_scores).value_counts().sort_index()
    for score, count in score_counts.items():
        pct = 100 * count / total
        # One bar character per two percentage points.
        bar = "█" * int(pct / 2)
        print(f"   Score {score}: {count:3d} ({pct:5.1f}%) {bar}")

# Per-file table: one row per file, one column per statistic.
if not file_stats:
    print("No validation scores found.")
else:
    print("\n📁 Per-File Statistics:")
    stats_df = pd.DataFrame(file_stats).T.round(2)
    display(stats_df)

📊 VALIDATION SCORES STATISTICS

📈 Overall Statistics (n=18):
   Mean:   4.28
   Median: 5.00
   Std:    1.24
   Min:    1
   Max:    5

📊 Score Distribution:
   Score 1:   1 (  5.6%) ██
   Score 2:   2 ( 11.1%) █████
   Score 4:   3 ( 16.7%) ████████
   Score 5:  12 ( 66.7%) █████████████████████████████████

📁 Per-File Statistics:


Unnamed: 0,count,mean,std,median,min,max
response_v4_9.json,18.0,4.28,1.24,5.0,1.0,5.0


In [7]:
# Print every validated item that carries a reviewer note, together with the
# response row it refers to.
MAX_SOURCE_CHARS = 100  # length of the source_text preview

print("=" * 70)
print("📝 ITEMS WITH NOTES")
print("=" * 70)

# Take the validation entries of the file `df` was built from. The item ids are
# used as row labels of `df` below, so entries of any other file would be
# matched against the wrong rows. Selecting them explicitly also removes the
# dependency on whatever a loop in an earlier cell happened to leave in `items`.
if path.name not in validation_data:
    print(f"\n⚠️ No validation entries found for {path.name}.")
items = validation_data.get(path.name, {})

# `or ""` covers notes stored as null as well as absent notes.
items_with_notes = [
    (item_id, meta)
    for item_id, meta in items.items()
    if str(meta.get("notes") or "").strip()
]

if not items_with_notes:
    print("\n✅ No items have notes.")
else:
    print(f"\n Found {len(items_with_notes)} items with notes:\n")

    for item_id, meta in items_with_notes:
        idx = int(item_id)
        score = meta.get("score", "")
        notes = meta.get("notes", "")

        print("─" * 70)
        print(f"📍 Index: {idx}  │  ⭐ Score: {score}")
        print("─" * 70)
        print(f"📋 Notes: {notes}")
        print()

        # A note may point at a row that is not in the response table;
        # report that instead of stopping with a KeyError.
        if idx not in df.index:
            print("⚠️ No response row with this index.")
            print()
            continue
        row = df.loc[idx]

        # Add the ellipsis only when the text was actually shortened; str()
        # keeps the slice valid when the cell holds a non-string value.
        source_text = str(row.get('source_text', 'N/A'))
        preview = source_text[:MAX_SOURCE_CHARS]
        if len(source_text) > MAX_SOURCE_CHARS:
            preview += "..."

        print("📄 Response Details:")
        print(f"   • Subject:      {row.get('subject', 'N/A')}")
        print(f"   • Object:       {row.get('object', 'N/A')}")
        print(f"   • Relationship: {row.get('relationship', 'N/A')}")
        print(f"   • Marker:       {row.get('marker', 'N/A')}")
        print(f"   • Pattern:      {row.get('pattern_type', 'N/A')}")
        print(f"   • Marked Type:  {row.get('marked_type', 'N/A')}")
        print(f"   • Source Text:  {preview}")
        print()

print("=" * 70)

📝 ITEMS WITH NOTES

 Found 7 items with notes:

──────────────────────────────────────────────────────────────────────
📍 Index: 2  │  ⭐ Score: 4
──────────────────────────────────────────────────────────────────────
📋 Notes: unmarked, when

📄 Response Details:
   • Subject:      An unusual amount of trash
   • Object:       University staff call for trash trucks
   • Relationship: An unusual amount of trash -> University staff call for trash trucks
   • Marker:       None
   • Pattern:      C
   • Marked Type:  U
   • Source Text:  university staff will call for trash trucks only when there is some event that causes an unusual amo...

──────────────────────────────────────────────────────────────────────
📍 Index: 5  │  ⭐ Score: 4
──────────────────────────────────────────────────────────────────────
📋 Notes: the "challenge" can be found in "event that causes an unusual amount of trash", but the input is unclear

📄 Response Details:
   • Subject:      Universities accept more students
 