# PC-GITA Dataset Segmentation Statistics

This notebook validates segmented artifacts, computes aggregate statistics, and analyzes the distribution of leading and trailing silence.

In [1]:
import os
import sys
from pathlib import Path

# Add project root to sys.path
sys.path.append(os.path.abspath('..'))

from data_prepare.stats import compute_statistics, save_report, validate_triples, get_silence_durations

In [2]:
# CONFIGURATION
# Segmented output directory; change this to match your local layout.
DATA_DIR = '../datalocal/v260210_24kHz/readtext_split'
# Destination of the JSON statistics report.
REPORT_PATH = '../reports/dataset_readtext_stats.json'

# Only warn (do not abort) when the directory is missing.
data_dir_path = Path(DATA_DIR)
if not data_dir_path.exists():
    print(f"Warning: DATA_DIR {DATA_DIR} not found. Please run split_sentences.py first.")

In [3]:
# VALIDATION
# Check the segmented output and summarize how many items form complete triples.
results = validate_triples(DATA_DIR)
total_found = results['total_stems']
valid_count = len(results['valid_stems'])
print(f"Total items found: {total_found}")
print(f"Valid triples: {valid_count}")

# List every stem that is missing one or more of its expected files.
missing_entries = results['missing_files']
if missing_entries:
    print("\n--- MISSING FILES ---")
    for entry in missing_entries:
        missing_names = ', '.join(entry['missing'])
        print(f"{entry['stem']}: missing {missing_names}")

Total items found: 499
Valid triples: 499


In [4]:
import pandas as pd

# STATISTICS
# Aggregate statistics over the stems that passed validation.
stats = compute_statistics(DATA_DIR, results['valid_stems'])

print("\n--- DATASET STATISTICS ---")

# Transpose so the outer keys of `stats` become rows and the metrics become columns.
df_stats = pd.DataFrame(stats).transpose()

# Add human-readable duration column
def format_duration(seconds):
    """Render a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated (not rounded). The hour field is padded to
    at least two digits and grows as needed for durations of 100 hours or more.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The duration formatted as ``"HH:MM:SS"``.
    """
    hours, within_hour = divmod(seconds, 3600)
    minutes = within_hour // 60
    secs = seconds % 60
    return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"

# Attach an HH:MM:SS rendering of each row's duration next to the raw seconds.
duration_labels = df_stats['duration_sec'].map(format_duration)
df_stats = df_stats.assign(formatted_duration=duration_labels)

display(df_stats)


--- DATASET STATISTICS ---


Unnamed: 0,files,sentences,words,duration_sec,formatted_duration
total,100.0,499.0,3600.0,1811.751125,00:30:11
hc,50.0,244.0,1800.0,880.919542,00:14:40
pd,50.0,255.0,1800.0,930.831583,00:15:30


In [5]:
# SAVE REPORT
# Make sure the destination folder exists before writing. Whether save_report
# creates missing parent directories itself is not visible from this notebook,
# and on a fresh checkout the reports folder may not exist yet.
Path(REPORT_PATH).parent.mkdir(parents=True, exist_ok=True)
save_report(stats, REPORT_PATH)
print(f"\nReport saved to {REPORT_PATH}")


Report saved to ../reports/dataset_readtext_stats.json


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# SILENCE ANALYSIS
# Summarize and plot the leading/trailing silence durations of the valid stems.
silences = get_silence_durations(DATA_DIR, results['valid_stems'])

# (key in `silences`, histogram color) for each panel, in left-to-right order.
silence_panels = [('leading', 'blue'), ('trailing', 'green')]

for key, _color in silence_panels:
    data = silences[key]
    print(f"\n--- {key.capitalize()} Silence Stats ---")
    if np.size(data) == 0:
        # np.min / np.max raise ValueError on empty input, so report and move on.
        print("No silence durations available.")
        continue
    print(f"Min: {np.min(data):.4f}s")
    print(f"Max: {np.max(data):.4f}s")
    print(f"Mean: {np.mean(data):.4f}s")

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One histogram per panel; the loop replaces two copy-pasted plotting stanzas.
for ax, (key, color) in zip(axes, silence_panels):
    data = silences[key]
    if np.size(data) > 0:
        sns.histplot(data, bins=30, ax=ax, kde=True, color=color)
    ax.set_title(f'Distribution of {key.capitalize()} Silence')
    ax.set_xlabel('Duration (seconds)')

plt.tight_layout()
plt.show()