# Module 1 - Extraction (Colab)

This notebook runs the Module 1 extraction pipeline to generate:
- `audio_16k.wav`
- `keyframes/`
- `scene_metadata.json`


In [None]:
from google.colab import drive
from pathlib import Path

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Make sure the /content root exists (no-op when it is already there).
Path('/content').mkdir(exist_ok=True, parents=True)


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg with apt-get unless an ffmpeg executable is available.

    Prints a short notice and returns early when ffmpeg is already present.

    Raises:
        subprocess.CalledProcessError: If ``apt-get update`` or
            ``apt-get install`` exits with a non-zero status.
    """
    # Local import: this cell does not import shutil at the top.
    import shutil

    # Look ffmpeg up on PATH instead of only at one hard-coded location, so an
    # ffmpeg installed elsewhere (e.g. /usr/local/bin) does not trigger a
    # needless apt-get run. The original fixed-path check is kept as fallback.
    if shutil.which('ffmpeg') or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    """Install the Python packages this notebook requires, if any are missing.

    Each required import name is probed with ``importlib.util.find_spec``; the
    pip distributions for those that cannot be found are installed in a single
    quiet ``pip install`` call run with the current interpreter.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # import name -> pip distribution name
    required = {'scenedetect': 'scenedetect', 'cv2': 'opencv-python-headless'}
    missing = [
        dist_name
        for module_name, dist_name in required.items()
        if importlib.util.find_spec(module_name) is None
    ]
    if not missing:
        print('python packages already satisfied')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
    # The packages were installed while this interpreter was already running;
    # clear the import finders' caches so they can be imported right away
    # without restarting the runtime.
    importlib.invalidate_caches()

# Run the environment setup steps in order: ffmpeg first, then Python packages.
for _setup_step in (_ensure_ffmpeg, _ensure_packages):
    _setup_step()


In [None]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Local checkout of the project repository.
REPO_DIR = Path('/content/video-summary')
# Branch to use; override by setting the VIDEO_SUMMARY_BRANCH environment variable.
BRANCH_NAME = os.environ.get('VIDEO_SUMMARY_BRANCH', '02-member-2-reasoning-nlp')

if not REPO_DIR.exists():
    # First run: clone only the requested branch.
    subprocess.check_call([
        'git', 'clone', '--single-branch', '--branch', BRANCH_NAME,
        'https://github.com/TCTri205/video-summary.git', str(REPO_DIR)
    ])
else:
    # The checkout already exists: update it in place.
    # Because the initial clone is --single-branch, origin is configured to
    # fetch only the branch that was cloned back then. Retarget origin at the
    # requested branch first, otherwise changing VIDEO_SUMMARY_BRANCH between
    # runs makes the checkout below fail on a branch that was never fetched.
    subprocess.check_call(
        ['git', 'remote', 'set-branches', 'origin', BRANCH_NAME], cwd=REPO_DIR
    )
    subprocess.check_call(['git', 'fetch', 'origin'], cwd=REPO_DIR)
    subprocess.check_call(['git', 'checkout', BRANCH_NAME], cwd=REPO_DIR)
    subprocess.check_call(['git', 'pull', 'origin', BRANCH_NAME], cwd=REPO_DIR)

# Work from the repository root for the rest of the notebook
# (presumably so the project package import in a later cell resolves).
os.chdir(REPO_DIR)

# Locations on the mounted Google Drive.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
INPUT_VIDEO_DRIVE = DRIVE_ROOT.joinpath('input', 'raw_video.mp4')
PROCESSED_DRIVE = DRIVE_ROOT.joinpath('processed')

# Local working locations under /content.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_INPUT_DIR = LOCAL_ROOT.joinpath('input')
LOCAL_PROCESSED = LOCAL_ROOT.joinpath('processed')
LOCAL_INPUT_VIDEO = LOCAL_INPUT_DIR.joinpath(INPUT_VIDEO_DRIVE.name)

# Create every directory the pipeline writes to.
for directory in (DRIVE_ROOT, PROCESSED_DRIVE, LOCAL_INPUT_DIR, LOCAL_PROCESSED):
    directory.mkdir(parents=True, exist_ok=True)

# The input video must already be on Drive.
if not INPUT_VIDEO_DRIVE.exists():
    raise FileNotFoundError(f'Missing input video: {INPUT_VIDEO_DRIVE}')

# Reuse the local copy only when it exists and has the same size as the
# Drive file; otherwise copy it over again.
if not (
    LOCAL_INPUT_VIDEO.exists()
    and LOCAL_INPUT_VIDEO.stat().st_size == INPUT_VIDEO_DRIVE.stat().st_size
):
    shutil.copy2(INPUT_VIDEO_DRIVE, LOCAL_INPUT_VIDEO)

# Plain-string values consumed by the following cells.
VIDEO_PATH = str(LOCAL_INPUT_VIDEO)
OUTPUT_ROOT = str(LOCAL_PROCESSED)
VIDEO_NAME = LOCAL_INPUT_VIDEO.stem
print('VIDEO_NAME =', VIDEO_NAME)


In [None]:
from extraction_perception.extraction.extraction import VideoPreprocessor

# Run the extraction steps on the locally staged video.
# NOTE(review): VideoPreprocessor comes from the cloned repository and is not
# visible in this notebook; the call order below is kept as-is in case the
# steps depend on each other. Presumably outputs are written under OUTPUT_ROOT.
processor = VideoPreprocessor(video_path=VIDEO_PATH, output_root=OUTPUT_ROOT)
timestamps = processor.detect_scenes()
audio_path = processor.extract_audio()
metadata = processor.extract_keyframes_and_metadata(timestamps)

# Short run summary. `metadata` is read with .get(), so it is presumably
# dict-like; a missing 'total_keyframes' entry is reported as 0.
print('Detected scenes:', len(timestamps))
print('Audio path:', audio_path)
print('Keyframes:', metadata.get('total_keyframes', 0))


In [None]:
import shutil
from pathlib import Path

# Copy this video's extraction outputs from the local work area to Drive.
local_extraction = Path(OUTPUT_ROOT) / VIDEO_NAME / 'extraction'
drive_extraction = PROCESSED_DRIVE / VIDEO_NAME / 'extraction'
drive_extraction.mkdir(parents=True, exist_ok=True)

# (path relative to local_extraction, path relative to drive_extraction)
sync_items = [
    ('scene_metadata.json', 'scene_metadata.json'),
    ('audio/audio_16k.wav', 'audio/audio_16k.wav'),
    ('keyframes', 'keyframes'),
]

missing_items = []
for src_rel, dst_rel in sync_items:
    src = local_extraction / src_rel
    dst = drive_extraction / dst_rel
    if not src.exists():
        # Still best-effort, but remember the skip so it can be reported:
        # the Drive-side checks below could otherwise show a stale file from
        # an earlier run as if it had just been synced.
        missing_items.append(src_rel)
        continue
    dst.parent.mkdir(parents=True, exist_ok=True)
    if src.is_dir():
        # Replace the whole directory so files removed locally do not linger.
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    else:
        shutil.copy2(src, dst)

if missing_items:
    print('WARNING: local outputs missing, not synced:', ', '.join(missing_items))
print('Synced extraction outputs to Drive:', drive_extraction)
print('scene_metadata exists:', (drive_extraction / 'scene_metadata.json').exists())
print('audio_16k exists:', (drive_extraction / 'audio' / 'audio_16k.wav').exists())
print('keyframes dir exists:', (drive_extraction / 'keyframes').exists())
