In [1]:
import os, json, subprocess, cv2
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import yt_dlp
import mediapipe as mp
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
from faster_whisper import WhisperModel

In [2]:
# ---------- CONFIG ----------
# Root directory for every artifact this notebook writes: process_sample()
# places "videos", "audio", "frames" and "alignments" sub-folders plus one
# "<video id>.json" metadata file per sample underneath it.
DATA_ROOT = "dataset-1a"
# Create the root eagerly; exist_ok=True makes re-running this cell harmless.
os.makedirs(DATA_ROOT, exist_ok=True)

In [3]:
# Load the Whisper model once at notebook level so the same instance can be
# handed to every process_sample() call instead of being rebuilt per sample.
# NOTE(review): DEVICE is derived from OpenCV's CUDA device count. This
# presumably reports 0 whenever OpenCV itself was built without CUDA, even if a
# GPU usable by faster-whisper is present (the recorded run below shows
# "Device: cpu") -- confirm and consider a check tied to the Whisper backend.
DEVICE = "cuda" if cv2.cuda.getCudaEnabledDeviceCount() else "cpu"
# "medium" is the model size; it is also hard-coded in the banner printed by the
# main cell and in the fallback inside align_audio_text().
whisper_model = WhisperModel("medium", device=DEVICE)



In [4]:
# ---------- DOWNLOAD ----------
def download_fb_video(url, out_dir):
    """Download the video at ``url`` into ``out_dir`` with yt-dlp.

    ``out_dir`` is created when missing. Only an mp4 format is requested and
    the file is named after the id yt-dlp reports for the video.

    Args:
        url: Address of the video to fetch.
        out_dir: Directory that receives the downloaded file.

    Returns:
        tuple: ``(video_path, video_id)`` where ``video_path`` is
        ``<out_dir>/<video_id>.mp4``.
    """
    os.makedirs(out_dir, exist_ok=True)

    options = dict(
        outtmpl=os.path.join(out_dir, "%(id)s.%(ext)s"),
        format="best[ext=mp4]",
        quiet=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)

    video_id = video_info["id"]
    video_path = os.path.join(out_dir, f"{video_id}.mp4")
    return video_path, video_id

In [5]:
# ---------- AUDIO ----------
def extract_audio(video_path, out_dir, start_time=0, end_time=None):
    """Extract a mono 16 kHz WAV track from ``video_path`` using ffmpeg.

    The output is written to ``<out_dir>/<video basename>.wav`` (overwriting any
    existing file because of ``-y``). When ``start_time`` or ``end_time`` is
    truthy the audio is trimmed with ``-ss`` / ``-to``.

    Args:
        video_path: Path of the source video.
        out_dir: Directory for the WAV file; created when missing.
        start_time: Trim start in seconds (0 means "from the beginning").
        end_time: Trim end in seconds, or ``None``/0 for "until the end".

    Returns:
        str: Path of the written WAV file.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. Previously the
            exit status was ignored and a path to a missing file was returned.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found
            (raised by ``subprocess.run``).
    """
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(video_path))[0]
    out_path = os.path.join(out_dir, f"{base}.wav")

    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-ac", "1", "-ar", "16000",
    ]
    if start_time or end_time:
        cmd += ["-ss", str(start_time)]
        if end_time:
            cmd += ["-to", str(end_time)]
    cmd.append(out_path)

    # Keep stdout silent but capture stderr so a failure can be explained.
    result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if result.returncode != 0:
        stderr_text = (result.stderr or b"").decode("utf-8", errors="replace").strip()
        # ffmpeg prints a long banner first; the last lines carry the actual error.
        tail = "\n".join(stderr_text.splitlines()[-5:])
        raise RuntimeError(
            f"ffmpeg failed (exit code {result.returncode}) for {video_path}: {tail}"
        )
    return out_path

In [6]:
# ---------- AUDIO VALIDATION ----------
def validate_audio(audio_path, min_duration=1.0):
    """Inspect a WAV file, print a short summary and report its properties.

    Args:
        audio_path: Path of the WAV file to open.
        min_duration: Minimum length in seconds for the file to count as valid.

    Returns:
        dict: ``channels``, ``sample_rate``, ``frames``, ``duration`` and
        ``valid`` (``duration >= min_duration``). If anything goes wrong while
        reading, only ``{"valid": False}`` is returned and the error is printed.
    """
    import wave

    try:
        with wave.open(audio_path, 'rb') as reader:
            channel_count = reader.getnchannels()
            rate = reader.getframerate()
            frame_total = reader.getnframes()
            seconds = frame_total / rate

            report = dict(
                channels=channel_count,
                sample_rate=rate,
                frames=frame_total,
                duration=seconds,
                valid=seconds >= min_duration,
            )

            summary = f"     Duration: {seconds:.2f}s | Sample Rate: {rate}Hz | Channels: {channel_count}"
            print(f"  📊 Audio Info:")
            print(summary)

            # Too-short clips are reported but still returned to the caller.
            if not report["valid"]:
                print(f"     ⚠️ WARNING: Audio duration ({seconds:.2f}s) < minimum ({min_duration}s)")

            return report
    except Exception as err:
        print(f"  ❌ Error reading audio: {err}")
        return {"valid": False}

In [7]:
# ---------- LIP FRAMES ----------
def _lower_face_box(bbox, frame_w, frame_h, pad=10):
    """Turn a relative face box into a padded pixel box around the lower face, clamped to the frame."""
    x1 = int(bbox.xmin * frame_w)
    y1 = int(bbox.ymin * frame_h)
    bw = int(bbox.width * frame_w)
    bh = int(bbox.height * frame_h)

    # Keep only the lower 60% of the face box, where the lips are.
    y1 = y1 + int(bh * 0.4)
    bh = int(bh * 0.6)

    x1 = max(x1 - pad, 0)
    y1 = max(y1 - pad, 0)
    x2 = min(x1 + bw + 2 * pad, frame_w)
    y2 = min(y1 + bh + 2 * pad, frame_h)
    return x1, y1, x2, y2


def _relative_face_area(detection):
    """Area of a detection's relative bounding box, used to rank faces by size."""
    box = detection.location_data.relative_bounding_box
    return box.width * box.height


def extract_lip_frames(video_path, out_dir, start_time=0, end_time=None, debug=False, verbose=True):
    """Extract lip-region crops from the frames between start_time and end_time.

    Each processed frame is down-scaled to at most 960 px wide, run through
    MediaPipe face detection, and the lower part of the largest detected face
    is saved as ``<out_dir>/<frame index, 6 digits>.jpg``. The frame index
    counts processed frames starting at 0 from ``start_time``.

    Changes compared with the previous version:
      * exactly one crop is written per frame (the largest face); before, every
        detected face overwrote the same file and inflated the counter;
      * pressing ``q`` in debug mode now really stops the extraction;
      * the unused FaceMesh graph is no longer created;
      * the capture (and debug window) is released even if an error occurs;
      * frames whose crop is empty or cannot be written count as skipped.

    Args:
        video_path: Path of the video to read.
        out_dir: Directory for the JPEG crops; created when missing.
        start_time: Start of the range in seconds.
        end_time: End of the range in seconds; ``None`` or a value beyond the
            video duration means "until the end of the video".
        debug: Show each frame with the crop rectangle in an OpenCV window.
        verbose: Print video information and an extraction summary.

    Returns:
        None
    """
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    detected = 0
    skipped = 0

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = total_frames / fps if fps else 0

        if verbose:
            print(f"  📹 Video Info:")
            print(f"     FPS: {fps:.2f} | Total Frames: {total_frames} | Duration: {duration:.2f}s")

        if end_time is None or end_time > duration:
            end_time = duration

        cap.set(cv2.CAP_PROP_POS_MSEC, start_time * 1000)

        with mp.solutions.face_detection.FaceDetection(
            model_selection=1, min_detection_confidence=0.3
        ) as face_det:
            while True:
                # Position is sampled before the read, i.e. it belongs to the next frame.
                current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                if current_time > end_time:
                    break
                ret, frame = cap.read()
                if not ret:
                    break

                # Resize large frames
                h, w = frame.shape[:2]
                if w > 960:
                    scale = 960 / w
                    frame = cv2.resize(frame, None, fx=scale, fy=scale)
                    h, w = frame.shape[:2]

                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                detections = face_det.process(rgb).detections

                saved = False
                stop_requested = False
                if detections:
                    # One output file per frame: use the most prominent face.
                    face = max(detections, key=_relative_face_area)
                    x1, y1, x2, y2 = _lower_face_box(
                        face.location_data.relative_bounding_box, w, h
                    )
                    crop = frame[y1:y2, x1:x2]
                    if crop.size > 0:
                        saved = bool(
                            cv2.imwrite(os.path.join(out_dir, f"{frame_idx:06d}.jpg"), crop)
                        )

                    if saved and debug:
                        dbg = frame.copy()
                        cv2.rectangle(dbg, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.imshow("Lip detection", dbg)
                        if cv2.waitKey(1) & 0xFF == ord("q"):
                            stop_requested = True

                if saved:
                    detected += 1
                else:
                    skipped += 1

                frame_idx += 1
                if stop_requested:
                    break
    finally:
        cap.release()
        if debug:
            cv2.destroyAllWindows()

    if verbose:
        print(f"  🎬 Frame Extraction:")
        print(f"     Detected frames: {detected} | Skipped frames: {skipped} | Total processed: {frame_idx}")
        if detected == 0:
            print(f"     ⚠️ WARNING: No faces detected in range {start_time}s - {end_time}s")
        else:
            print(f"     ✅ Saved {detected} lip frames to {out_dir}")

In [8]:
# ---------- ALIGNMENT ----------
def align_audio_text(audio_path, text, out_path, whisper_model=None, debug=True,
                     language=None, beam_size=5):
    """
    Transcribes audio with word-level timestamps and writes a TSV:
    start_time, end_time, word (tab separated, one word per line).

    On failure a single comment line (starting with ``#``) is written to
    ``out_path`` instead, so the file exists in every case.

    Args:
        audio_path: Path to audio file
        text: Text (not used in transcription, kept for compatibility)
        out_path: Output TSV path
        whisper_model: Pre-loaded WhisperModel instance; when None a new
            "medium" model is created on DEVICE
        debug: Enable detailed logging
        language: Language code passed to Whisper (e.g. "ne" for Nepali-only);
            None keeps the previous behaviour of auto-detecting the language
        beam_size: Beam size passed to Whisper (default 5, as before)

    Returns:
        bool: True when at least one word was written, False otherwise
        (no segments, no words, or an exception during processing).
    """
    import traceback

    try:
        print(f"\n  🔄 Transcribing: {audio_path}")

        # Validate audio first; a failed validation is only a warning.
        audio_info = validate_audio(audio_path, min_duration=0.5)
        if not audio_info["valid"]:
            print(f"  ⚠️ Audio validation failed, attempting transcription anyway...")

        if whisper_model is None:
            print("  ⚠️ No model provided, creating new instance")
            whisper_model = WhisperModel("medium", device=DEVICE)

        # Transcribe with word-level timestamps
        segments, info = whisper_model.transcribe(
            audio_path,
            beam_size=beam_size,
            word_timestamps=True,
            language=language,
        )

        # Materialise the result so it can be counted and iterated twice.
        segments_list = list(segments)

        if debug:
            print(f"  📋 Whisper Metadata:")
            print(f"     Detected Language: {info.language} (confidence: {info.language_probability:.2%})")
            print(f"     Number of segments: {len(segments_list)}")

        if not segments_list:
            print(f"  ⚠️ No transcription results for {audio_path}")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write("# Empty transcription\n")
            return False

        # Extract word-level timestamps
        word_count = 0
        segment_details = []

        with open(out_path, "w", encoding="utf-8") as f:
            for seg_idx, seg in enumerate(segments_list):
                seg_text = seg.text.strip()
                segment_details.append(
                    f"Segment {seg_idx}: {seg.start:.2f}s - {seg.end:.2f}s | {seg_text}"
                )

                for w in seg.words or []:
                    word_text = w.word.strip()
                    # Skip tokens that are empty after stripping whitespace.
                    if word_text:
                        f.write(f"{w.start:.2f}\t{w.end:.2f}\t{word_text}\n")
                        word_count += 1

            # Mark a word-less result the same way as the no-segment case instead
            # of leaving a 0-byte file behind.
            if word_count == 0:
                f.write("# Empty transcription\n")

        if debug:
            print(f"  📝 Segment Details:")
            for detail in segment_details:
                print(f"     {detail}")

        if word_count > 0:
            print(f"  ✅ Extracted {word_count} words with timestamps")
            print(f"     Output: {out_path}")
            return True

        print(f"  ⚠️ No valid words extracted (empty segments?)")
        return False

    except Exception as e:
        print(f"  ❌ Error in align_audio_text: {e}")
        traceback.print_exc()
        # Create error TSV; never let this best-effort write hide the real error.
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(f"# Error: {str(e)}\n")
        except OSError as write_error:
            print(f"  ❌ Could not write error marker to {out_path}: {write_error}")
        return False

In [9]:
# ---------- PROCESS ONE SAMPLE ----------
def process_sample(url, title, start_time, end_time, whisper_model=None, verbose=True):
    """Build one dataset sample: download, audio, lip frames, alignment, metadata.

    Everything is written below DATA_ROOT:
    ``videos/``, ``audio/``, ``frames/<id>/``, ``alignments/<id>.tsv`` and
    ``<id>.json`` (metadata describing all of the above).

    Args:
        url: Video URL handed to download_fb_video().
        title: Reference text of the clip; stored in the metadata as "text".
        start_time: Start of the clip in seconds.
        end_time: End of the clip in seconds.
        whisper_model: Pre-loaded WhisperModel shared between calls (optional).
        verbose: Forwarded to extract_lip_frames() as ``verbose`` and to
            align_audio_text() as ``debug``. It used to be accepted but ignored
            (both were hard-coded to True); the default keeps the old output.

    Returns:
        The video id on success, or None when any step raised (the error and
        traceback are printed).
    """
    import traceback

    try:
        print(f"\n{'='*70}")
        print(f"📥 PROCESSING VIDEO")
        print(f"{'='*70}")
        print(f"URL: {url}")
        print(f"Title: {title}")
        print(f"Time Range: {start_time}s - {end_time}s")

        video_path, vid_id = download_fb_video(url, os.path.join(DATA_ROOT, "videos"))
        print(f"✅ Downloaded video: {vid_id}")

        # Extract audio
        audio_path = extract_audio(video_path, os.path.join(DATA_ROOT, "audio"), start_time, end_time)
        print(f"✅ Extracted audio: {audio_path}")

        # Extract frames
        frame_dir = os.path.join(DATA_ROOT, "frames", vid_id)
        extract_lip_frames(video_path, frame_dir, start_time, end_time, verbose=verbose)

        # Align audio
        align_path = os.path.join(DATA_ROOT, "alignments", f"{vid_id}.tsv")
        os.makedirs(os.path.dirname(align_path), exist_ok=True)
        align_success = align_audio_text(
            audio_path, title, align_path, whisper_model=whisper_model, debug=verbose
        )

        # Verify TSV was created
        if os.path.exists(align_path):
            file_size = os.path.getsize(align_path)
            with open(align_path, 'r', encoding='utf-8') as f:
                line_count = sum(1 for _ in f)
            print(f"\n  📄 TSV File:")
            print(f"     Path: {align_path}")
            print(f"     Size: {file_size} bytes | Lines: {line_count}")
            if line_count == 0:
                print(f"     ⚠️ WARNING: TSV is empty!")

        # Save metadata
        meta = {
            "id": vid_id,
            "url": url,
            "text": title,
            "video": video_path,
            "audio": audio_path,
            "frames_dir": frame_dir,
            "alignment": align_path,
            "start_time": start_time,
            "end_time": end_time,
            "transcription_success": align_success,
        }
        meta_path = os.path.join(DATA_ROOT, f"{vid_id}.json")
        with open(meta_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Devanagari text readable in the file.
            json.dump(meta, f, indent=2, ensure_ascii=False)
        print(f"✅ Saved metadata: {meta_path}")

        print(f"{'='*70}\n")
        return vid_id

    except Exception as e:
        print(f"{'='*70}")
        print(f"❌ ERROR PROCESSING: {url}")
        print(f"{'='*70}")
        print(f"Error: {e}")
        traceback.print_exc()
        print(f"{'='*70}\n")
        return None

In [10]:
# ---------- MAIN ----------
if __name__ == "__main__":
    # Each entry: (URL, reference text, start_time in seconds, end_time in seconds)
    videos = [
        (
            "https://www.facebook.com/share/v/1CdiEnT1xw/",
            "विराटनगरको जिल्ला कारागार मोरङमा कैदबन्दी जीवन बिताइरहेका एकजनाको आज दिउँसो मृत्यु भएको छ",
            0.0,
            6.0,
        ),
        (
            "https://www.facebook.com/share/v/17YKEbmgt8/",
            "विराटनगरको वडानम्बर १६ बर्माटोलका बासिन्दा सिनो तथा फोहोरको दुर्गन्धका कारण पीडित भएका छन फोहोरका कारण उत्सर्जन हुने हानिकारण ग्याँस र दुर्गन्धका कारण सो क्षेत्रका बासिन्दा घरकोठामैं मास्क लगाएर बस्नुपर्ने अवस्था सृजना भएको छ",
            0.0,
            17.0,
        ),
    ]

    print(f"\n{'#'*70}")
    print(f"# DATASET BUILD STARTING")
    print(f"# Device: {DEVICE} | Whisper Model: medium")
    print(f"{'#'*70}\n")

    # One result per entry in `videos`: the video id, or None when it failed.
    results = []

    # Pass the global whisper_model to all workers
    with ThreadPoolExecutor(max_workers=2) as ex:
        futures = [
            ex.submit(process_sample, *v, whisper_model=whisper_model)
            for v in videos
        ]
        for future in tqdm(futures, total=len(videos), desc="Processing videos"):
            try:
                results.append(future.result())
            except Exception as e:
                results.append(None)
                print(f"❌ Worker error: {e}")
                import traceback
                traceback.print_exc()

    # process_sample() reports failure by returning None, so the results have to
    # be inspected; otherwise the success banner is printed even if all failed.
    failed_urls = [v[0] for v, vid_id in zip(videos, results) if vid_id is None]

    print(f"\n{'#'*70}")
    if failed_urls:
        print(f"# ⚠️ DATASET BUILD FINISHED WITH ERRORS: {len(failed_urls)}/{len(videos)} failed")
        for failed_url in failed_urls:
            print(f"#   {failed_url}")
    else:
        print(f"# ✅ DATASET BUILD COMPLETE")
    print(f"{'#'*70}\n")


######################################################################
# DATASET BUILD STARTING
# Device: cpu | Whisper Model: medium
######################################################################


📥 PROCESSING VIDEO
URL: https://www.facebook.com/share/v/1CdiEnT1xw/
Title: विराटनगरको जिल्ला कारागार मोरङमा कैदबन्दी जीवन बिताइरहेका एकजनाको आज दिउँसो मृत्यु भएको छ
Time Range: 0.0s - 6.0s

📥 PROCESSING VIDEO
URL: https://www.facebook.com/share/v/17YKEbmgt8/
Title: विराटनगरको वडानम्बर १६ बर्माटोलका बासिन्दा सिनो तथा फोहोरको दुर्गन्धका कारण पीडित भएका छन फोहोरका कारण उत्सर्जन हुने हानिकारण ग्याँस र दुर्गन्धका कारण सो क्षेत्रका बासिन्दा घरकोठामैं मास्क लगाएर बस्नुपर्ने अवस्था सृजना भएको छ
Time Range: 0.0s - 17.0s


Processing videos:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Downloaded video: 781989364756468
✅ Downloaded video: 2022332995220352
✅ Extracted audio: dataset-1a/audio/781989364756468.wav
✅ Extracted audio: dataset-1a/audio/2022332995220352.wav
  📹 Video Info:
     FPS: 29.97 | Total Frames: 6375 | Duration: 212.71s
  📹 Video Info:
     FPS: 29.97 | Total Frames: 1882 | Duration: 62.80s


I0000 00:00:1760443399.353688  300170 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1760443399.356081  300426 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.2.4-arch1.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
I0000 00:00:1760443399.358271  300169 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1760443399.359432  300436 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.2.4-arch1.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
I0000 00:00:1760443399.397041  300170 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1760443399.398811  300447 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.2.4-arch1.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
I0000 00:00:1760443399.412564  300169 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
I0000 00:00:1760443399.415118  300458

  🎬 Frame Extraction:
     Detected frames: 181 | Skipped frames: 0 | Total processed: 181
     ✅ Saved 181 lip frames to dataset-1a/frames/781989364756468

  🔄 Transcribing: dataset-1a/audio/781989364756468.wav
  📊 Audio Info:
     Duration: 6.00s | Sample Rate: 16000Hz | Channels: 1
  🎬 Frame Extraction:
     Detected frames: 511 | Skipped frames: 0 | Total processed: 511
     ✅ Saved 511 lip frames to dataset-1a/frames/2022332995220352

  🔄 Transcribing: dataset-1a/audio/2022332995220352.wav
  📊 Audio Info:
     Duration: 17.00s | Sample Rate: 16000Hz | Channels: 1


Processing videos:  50%|█████     | 1/2 [07:04<07:04, 424.72s/it]

  📋 Whisper Metadata:
     Detected Language: ne (confidence: 79.24%)
     Number of segments: 1
  📝 Segment Details:
     Segment 0: 0.00s - 5.98s | बिरातनगर को जिल्ला कारागार मृर्लग मा कईदीबण दे जिब्ण बितारी हेका एक जना को आजो दिव्शम रे तियुफलोगु.
  ✅ Extracted 18 words with timestamps
     Output: dataset-1a/alignments/781989364756468.tsv

  📄 TSV File:
     Path: dataset-1a/alignments/781989364756468.tsv
     Size: 442 bytes | Lines: 18
✅ Saved metadata: dataset-1a/781989364756468.json



Processing videos: 100%|██████████| 2/2 [07:20<00:00, 220.14s/it]

  📋 Whisper Metadata:
     Detected Language: ne (confidence: 90.13%)
     Number of segments: 2
  📝 Segment Details:
     Segment 0: 0.00s - 12.52s | बिरामा ऱ्हटूल का बासिंधा किस्थ्र को Oscillation Dedicated To Varma Tol
     Segment 1: 12.52s - 16.98s | ४।ॷ्दा, यागा समिना किस्थ्र तापार्भार रैगी Ohh Funeral,
  ✅ Extracted 19 words with timestamps
     Output: dataset-1a/alignments/2022332995220352.tsv

  📄 TSV File:
     Path: dataset-1a/alignments/2022332995220352.tsv
     Size: 471 bytes | Lines: 19
✅ Saved metadata: dataset-1a/2022332995220352.json


######################################################################
# ✅ DATASET BUILD COMPLETE
######################################################################






In [11]:
# # ---------- MAIN ----------
# if __name__ == "__main__":
#     # List of (URL, title, start_time, end_time)
#     videos = [
#         (
#             "https://www.facebook.com/share/v/1CdiEnT1xw/",
#             "विराटनगरको जिल्ला कारागार मोरङमा कैदीबन्दी जीवन बिताइरहेका एकजनाको आज दिउँसो मृत्यु भएको छ ।",
#             0.0,
#             6.0,
#         ),
#         (
#             "https://www.facebook.com/share/v/17YKEbmgt8/",
#             "विराटनगरको वडानम्बर १६ बर्माटोलका बासिन्दा सिनो तथा फोहोरको दुर्गन्धका कारण पीडित भएका छन । फोहोरका कारण उत्सर्जन हुने हानिकारण ग्याँस र दुर्गन्धका कारण सो क्षेत्रका बासिन्दा घरकोठामैं मास्क लगाएर बस्नुपर्ने अवस्था सृजना भएको छ ।",
#             0.0,
#             17.0,
#         ),
#     ]

#     with ThreadPoolExecutor(max_workers=2) as ex:
#         for _ in tqdm(ex.map(lambda v: process_sample(*v), videos), total=len(videos)):
#             pass
#     print("✅ Dataset build complete.")