In [1]:
import json
import os
import subprocess
import tempfile
from concurrent.futures import ThreadPoolExecutor

import cv2
import mediapipe as mp
import yt_dlp
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
from tqdm import tqdm

In [None]:
# ---------- CONFIG ----------
# Root directory for everything the pipeline writes: videos/, audio/,
# frames/<id>/, alignments/ and one <id>.json metadata file per video.
DATA_ROOT = "dataset"
# Create the root up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(DATA_ROOT, exist_ok=True)
# NOTE: Do NOT create a global FaceMesh instance here. Sharing one FaceMesh
# object across threads or across different video captures has been observed
# to crash (segfault) in some environments. Create the instance locally
# inside the processing function instead (see `extract_lip_frames`).
# Example of what to avoid:
# mp_face_mesh = mp.solutions.face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)


I0000 00:00:1760431104.598061   96470 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1760431104.601003   97139 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.2.4-arch1.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1760431104.603288   97133 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [3]:
# ---------- DOWNLOAD ----------
def download_fb_video(url, out_dir):
    """Download the video at ``url`` into ``out_dir`` using yt-dlp.

    The file is requested in the best available MP4 format and saved as
    ``<video id>.mp4``.

    Args:
        url: Address of the video page to download.
        out_dir: Destination directory; created if it does not exist.

    Returns:
        A ``(video_path, video_id)`` tuple, where ``video_path`` is
        ``out_dir/<video_id>.mp4``.
    """
    os.makedirs(out_dir, exist_ok=True)
    options = dict(
        outtmpl=os.path.join(out_dir, "%(id)s.%(ext)s"),
        format="best[ext=mp4]",  # restricts downloads to MP4, matching the returned path
        quiet=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        metadata = downloader.extract_info(url, download=True)
    video_id = metadata["id"]
    return os.path.join(out_dir, f"{video_id}.mp4"), video_id

W0000 00:00:1760431104.612307   97134 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [4]:
# ---------- AUDIO ----------
def extract_audio(video_path, out_dir):
    """Extract a mono, 16 kHz WAV track from ``video_path`` with ffmpeg.

    Args:
        video_path: Path of the input video file.
        out_dir: Directory that receives the WAV file; created if missing.

    Returns:
        Path of the WAV file: the video's base name with its final extension
        replaced by ``.wav``, located in ``out_dir``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Swap only the final extension. A plain str.replace(".mp4", ".wav") would
    # also rewrite ".mp4" in the middle of a name and would leave non-mp4
    # inputs unchanged (so ffmpeg -y could end up targeting the input itself).
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    out_path = os.path.join(out_dir, stem + ".wav")
    cmd = ["ffmpeg", "-y", "-i", video_path, "-ac", "1", "-ar", "16000", out_path]
    # check=True: fail loudly instead of returning a path to a file that was
    # never written.
    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    return out_path

In [None]:
# ---------- LIP FRAMES ----------
def extract_lip_frames(video_path, out_dir, margin=10):
    """Crop the lip region from every frame of a video and save it as JPEG.

    Each frame is run through MediaPipe FaceMesh. When a face is found, the
    bounding box of the lip landmarks (padded by ``margin`` pixels and clamped
    to the frame) is written to ``out_dir/<frame index, 6 digits>.jpg``.
    Frames without a detected face, or whose lip box is empty after clamping,
    are skipped but still advance the frame index.

    Args:
        video_path: Path of the video to process.
        out_dir: Directory for the cropped frames; created if missing.
        margin: Padding in pixels added around the lip bounding box.

    Raises:
        IOError: If the video cannot be opened.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Lip landmark indices come from MediaPipe's own connection table. A
    # contiguous slice such as 61:88 is not the lip set in the FaceMesh
    # topology (it also contains eyebrow points such as 63, 65, 66 and 70),
    # which inflates the crop far beyond the mouth.
    lip_indices = sorted({idx for edge in mp.solutions.face_mesh.FACEMESH_LIPS for idx in edge})
    min_landmarks = lip_indices[-1] + 1
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")
        frame_idx = 0
        # Create FaceMesh locally per invocation to avoid sharing across threads.
        with mp.solutions.face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1) as face_mesh:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                height, width = frame.shape[:2]
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = face_mesh.process(rgb)
                faces = getattr(results, "multi_face_landmarks", None)
                if faces:
                    landmarks = faces[0].landmark
                    # Guard against a result with fewer points than expected.
                    if len(landmarks) >= min_landmarks:
                        # Landmark coordinates are normalised; scale to pixels.
                        xs = [int(landmarks[i].x * width) for i in lip_indices]
                        ys = [int(landmarks[i].y * height) for i in lip_indices]
                        x1, x2 = max(min(xs) - margin, 0), min(max(xs) + margin, width)
                        y1, y2 = max(min(ys) - margin, 0), min(max(ys) + margin, height)
                        # Landmarks can fall outside the frame; an empty crop
                        # would make cv2.imwrite raise and abort the video.
                        if x2 > x1 and y2 > y1:
                            crop = frame[y1:y2, x1:x2]
                            cv2.imwrite(os.path.join(out_dir, f"{frame_idx:06d}.jpg"), crop)
                frame_idx += 1
    finally:
        # Release the capture even if detection or writing fails midway.
        cap.release()


In [6]:
# ---------- ALIGNMENT ----------
def align_audio_text(audio_path, text, out_path, language="eng"):
    """Force-align ``text`` to ``audio_path`` with aeneas and write a TSV sync map.

    Args:
        audio_path: Path of the audio file to align against.
        text: Transcript as plain text.
        out_path: Destination path of the TSV sync map. Its directory must
            already exist.
        language: aeneas language code placed in the task configuration.
            Defaults to ``"eng"``.
    """
    # Give each call its own temporary text file. A fixed "temp.txt" next to
    # out_path is shared by concurrent workers, which then overwrite each
    # other's text and fail when removing a file another worker already deleted.
    with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False) as f:
        f.write(text)
        tmp_text = f.name
    try:
        task = Task(config_string=f"task_language={language}|is_text_type=plain|os_task_file_format=tsv")
        task.audio_file_path_absolute = os.path.abspath(audio_path)
        task.text_file_path_absolute = tmp_text
        # aeneas reads the destination from sync_map_file_path_absolute. The
        # argument of output_sync_map_file() is a container root directory,
        # not the output path, so it is called without arguments here.
        task.sync_map_file_path_absolute = os.path.abspath(out_path)
        ExecuteTask(task).execute()
        task.output_sync_map_file()
    finally:
        # Remove the temporary text file even when alignment fails.
        os.remove(tmp_text)

In [None]:
# ---------- PROCESS ONE SAMPLE ----------
def process_sample(url, title):
    """Run the full pipeline for one video and write its metadata JSON.

    Steps: download the video, extract its audio, crop lip frames, align the
    audio with ``title``, then write ``DATA_ROOT/<id>.json`` describing the
    produced files.

    Args:
        url: Address of the video to download.
        title: Transcript text to align with the audio.

    Returns:
        The video id on success, or ``None`` if any step raised (the error is
        printed rather than propagated so other samples can continue).
    """
    try:
        video_path, vid_id = download_fb_video(url, os.path.join(DATA_ROOT, "videos"))
        audio_path = extract_audio(video_path, os.path.join(DATA_ROOT, "audio"))
        frame_dir = os.path.join(DATA_ROOT, "frames", vid_id)
        extract_lip_frames(video_path, frame_dir)
        align_path = os.path.join(DATA_ROOT, "alignments", f"{vid_id}.tsv")
        os.makedirs(os.path.dirname(align_path), exist_ok=True)
        align_audio_text(audio_path, title, align_path)
        meta = {
            "id": vid_id,
            "url": url,
            "text": title,
            "video": video_path,
            "audio": audio_path,
            "frames_dir": frame_dir,
            "alignment": align_path,
        }
        meta_path = os.path.join(DATA_ROOT, f"{vid_id}.json")
        # Use a context manager so the file is flushed and closed right away.
        # ensure_ascii=False keeps non-ASCII transcript text readable in the
        # UTF-8 file instead of \uXXXX escapes.
        with open(meta_path, "w", encoding="utf-8") as meta_file:
            json.dump(meta, meta_file, indent=2, ensure_ascii=False)
        return vid_id
    except Exception as e:
        # Best-effort: report the failure and let the caller see None.
        print(f"Error processing {url}: {e}")
        return None

: 

In [None]:
# ---------- MAIN ----------
if __name__ == "__main__":
    # List of (URL, title) pairs
    videos = [
        ("https://www.facebook.com/share/v/1CdiEnT1xw/", "विराटनगरको जिल्ला कारागार मोरङमा कैदीबन्दी जीवन बिताइरहेका एकजनाको आज दिउँसो मृत्यु भएको छ ।"),
        ("https://www.facebook.com/share/v/17YKEbmgt8/", "विराटनगरको वडानम्बर १६ बर्माटोलका बासिन्दा सिनो तथा फोहोरको दुर्गन्धका कारण पीडित भएका छन । फोहोरका कारण उत्सर्जन हुने हानिकारण ग्याँस र दुर्गन्धका कारण सो क्षेत्रका बासिन्दा घरकोठामैं मास्क लगाएर बस्नुपर्ने अवस्था सृजना भएको छ ।"),
    ]

    # Process samples concurrently. ex.map yields results in input order, so
    # they can be paired back with `videos` afterwards.
    with ThreadPoolExecutor(max_workers=2) as ex:
        results = list(tqdm(ex.map(lambda v: process_sample(*v), videos), total=len(videos)))

    # process_sample returns None on failure; surface those instead of
    # announcing success unconditionally.
    failed_urls = [url for (url, _title), vid_id in zip(videos, results) if vid_id is None]
    if failed_urls:
        print(f"Dataset build finished with {len(failed_urls)}/{len(videos)} failed samples:")
        for url in failed_urls:
            print(f"  - {url}")
    else:
        print("✅ Dataset build complete.")

  0%|          | 0/2 [00:00<?, ?it/s]

                             