# PC-GITA Dataset Segmentation Statistics

This notebook validates segmented artifacts and computes aggregate statistics.

In [None]:
import os
import sys
from pathlib import Path

# Add project root to sys.path
sys.path.append(os.path.abspath('..'))

from data_prepare.stats import compute_statistics, save_report, validate_triples

In [None]:
# CONFIGURATION
# Both locations are relative paths; edit them here rather than in later cells.
REPORT_PATH = '../reports/dataset_stats.json'
DATA_DIR = '../datalocal/processed/v1'  # Change this to your segmented output directory

# Soft check only: a missing directory yields a warning instead of an
# exception, so execution continues past this cell.
_data_dir_missing = not Path(DATA_DIR).exists()
if _data_dir_missing:
    print(f"Warning: DATA_DIR {DATA_DIR} not found. Please run split_sentences.py first.")

In [None]:
# VALIDATION
# Run the project validator over DATA_DIR. The cell reads three keys from the
# returned mapping: 'total_stems', 'valid_stems' and 'missing_files'.
results = validate_triples(DATA_DIR)

# Headline counts first.
print(f"Total items found: {results['total_stems']}")
print(f"Valid triples: {len(results['valid_stems'])}")

# Then one line per incomplete item; each entry carries a 'stem' and the
# list of names under 'missing'. Nothing is printed when the list is empty.
missing_entries = results['missing_files']
if missing_entries:
    print("\n--- MISSING FILES ---")
    for entry in missing_entries:
        print(f"{entry['stem']}: missing {', '.join(entry['missing'])}")

In [None]:
import pandas as pd

# STATISTICS
# Only the stems listed under 'valid_stems' by the validation cell are
# passed on to the statistics helper.
valid_stems = results['valid_stems']
stats = compute_statistics(DATA_DIR, valid_stems)

print("\n--- DATASET STATISTICS ---")

# Build a table from the statistics and flip rows/columns for display.
# NOTE(review): presumably `stats` is a dict of dicts, so each top-level key
# ends up as one row after the transpose; confirm against compute_statistics.
df_stats = pd.DataFrame(stats).transpose()
display(df_stats)

In [None]:
# SAVE REPORT
# Hand the statistics computed above to the project's save_report helper
# together with the target path from the configuration cell, then print a
# confirmation line.
# NOTE(review): presumably the report is written as JSON (REPORT_PATH ends in
# .json); confirm in data_prepare.stats, including whether the parent
# directory is created when it does not exist.
save_report(stats, REPORT_PATH)
print(f"\nReport saved to {REPORT_PATH}")