In [2]:
#!/usr/bin/env python3
"""
step3_detect_disfluencies.py

Step 3: Analyze the segment-level manifest (train_manifest.jsonl) for disfluency patterns
and generate the structured disfluency dataset (CSV).
"""

import json
import re
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any

# -----------------------------
# Configuration
# -----------------------------
# Assuming your output from Step 1/2 is in ./data/processed/
INPUT_MANIFEST = "train_manifest.jsonl" # Use the file you uploaded
OUTPUT_CSV = "disfluency_detections.csv" 

# -----------------------------
# Disfluency Patterns for Hindi (Devanagari)
# -----------------------------

# List of common fillers (isolated words)
# Adding 'हम्', 'एह', 'जी' (when used as filler)
FILLERS = [
    "हूँ", "हम्", "हम्म", "उम्म", "अह", "एह", "जी", "हाँ", "ठीक है", "मतलब", "तो", "बस"
]

# 1. Repetitions: Look for a word/phrase repeated, often separated by a marker.
# Simple pattern: (Word)(Marker)(Word)
# (\S+) captures one or more non-whitespace chars (the word/token).
# (?:\.\.\.|\s+-\s*|\s+) is a non-capturing group for the pause marker (..., or space-dash-space).
# \s*\1 looks for optional whitespace followed by the captured word/phrase (\1).
REPETITION_REGEX = re.compile(r"(\S+)(?:\.\.\.|\s+-\s*|\s+)\s*\1", re.IGNORECASE)

# 2. Hesitations / False Starts / Pauses: Marked by ellipsis or just a trailing pause
HESITATION_REGEX = re.compile(r"\.\.\.|\.\s+", re.IGNORECASE)

# 3. Prolongations: Find any vowel or nasal sound repeated 3 or more times consecutively
# Vowels/Matras in Devanagari: ा, ि, ी, ु, ू, ृ, े, ै, ो, ौ, ं, ँ
PROLONGATION_REGEX = re.compile(r"([ािीुूृेैोौंँ])\1{2,}", re.IGNORECASE)


def detect_disfluencies(text: str) -> List[Dict[str, Any]]:
    """Analyzes a single segment text for disfluency patterns."""
    detections = []
    
    # Clean text for simple filler/token checking (remove punctuation for comparison)
    cleaned_text = re.sub(r'[.,?!:;\'"]', '', text)
    text_tokens = cleaned_text.split()
    
    # 1. Filler Detection (isolated word check)
    for token in text_tokens:
        if token in FILLERS:
            detections.append({
                "disfluency_type": "filler",
                "detected_token": token,
                "confidence": 0.9 
            })
    
    # 2. Repetition Detection
    for match in REPETITION_REGEX.finditer(text):
        detections.append({
            "disfluency_type": "repetition",
            "detected_token": match.group(0).strip(),
            "confidence": 0.8 
        })

    # 3. Hesitation/False Start Detection (Ellipsis-based)
    for match in HESITATION_REGEX.finditer(text):
        token = match.group(0).strip()
        # Simple check to avoid recording an ellipsis if it's part of a detected repetition
        is_part_of_repetition = any(d["disfluency_type"] == "repetition" and token in d["detected_token"] for d in detections)
        if not is_part_of_repetition:
            detections.append({
                "disfluency_type": "hesitation/false_start",
                "detected_token": token,
                "confidence": 0.7 
            })

    # 4. Prolongation Detection
    for match in PROLONGATION_REGEX.finditer(text):
        detections.append({
            "disfluency_type": "prolongation",
            "detected_token": match.group(0),
            "confidence": 0.9 
        })

    # Deduplicate based on type and detected token for clean output
    unique_detections = []
    seen = set()
    for d in detections:
        key = (d["disfluency_type"], d["detected_token"])
        if key not in seen:
            unique_detections.append(d)
            seen.add(key)
    
    return unique_detections

def main():
    if not Path(INPUT_MANIFEST).exists():
        print(f"[ERROR] Input manifest not found: {INPUT_MANIFEST}. Please ensure it is in the current directory.")
        return

    segments = []
    print(f"Loading segments from {INPUT_MANIFEST}...")
    with open(INPUT_MANIFEST, 'r', encoding='utf-8') as f:
        for line in f:
            segments.append(json.loads(line))

    all_disfluencies = []
    print(f"Analyzing {len(segments)} segments for disfluencies...")

    for i, segment in enumerate(segments):
        text = segment.get("text", "")
        if not text:
            continue

        disfluencies = detect_disfluencies(text)

        for disfluency in disfluencies:
            # Combine segment metadata with disfluency details
            row = {
                "row_id": len(all_disfluencies) + 1,
                "recording_id": segment["recording_id"],
                "segment_id": segment["segment_id"],
                # The segment's start/end time in the original audio
                "start_time": segment["start_time"], 
                "end_time": segment["end_time"],     
                "duration": segment["duration"],
                "disfluency_type": disfluency["disfluency_type"],
                "detected_token": disfluency["detected_token"],
                "confidence": disfluency["confidence"],
                "audio_url": segment["audio_filepath"], # original full audio URL for clipping
                # Placeholder for the final clip path
                "clip_path": f"clips/{segment['segment_id']}.wav",
                "notes": f"Found {disfluency['disfluency_type']} near token(s): '{disfluency['detected_token']}'"
            }
            all_disfluencies.append(row)

    print(f"Total disfluency occurrences detected: {len(all_disfluencies)}")
    
    # Final Output Dataset (Sheet Format)
    output_df = pd.DataFrame(all_disfluencies)
    
    # Ensure correct column order for the deliverable sheet
    column_order = [
        "row_id", "recording_id", "segment_id", "disfluency_type", 
        "detected_token", "start_time", "end_time", "duration", 
        "clip_path", "confidence", "notes", "audio_url" # audio_url is for internal use in clipping
    ]
    
    if not output_df.empty:
        output_df = output_df.reindex(columns=column_order)
    
    output_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
    print(f"Disfluency metadata saved to: {OUTPUT_CSV}")

if __name__ == "__main__":
    # Ensure you have pandas installed: pip install pandas
    main()

Loading segments from train_manifest.jsonl...
Analyzing 5941 segments for disfluencies...
Total disfluency occurrences detected: 8497
Disfluency metadata saved to: disfluency_detections.csv
