# Module 1 - Extraction (Colab)

This notebook runs the Module 1 extraction pipeline to generate:
- `audio_16k.wav`
- `keyframes/`
- `scene_metadata.json`


In [None]:
# Mount Google Drive; the input video and output paths used later in this
# notebook are located under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
!apt-get update -y
!apt-get install -y ffmpeg
!pip install scenedetect opencv-python-headless


In [None]:
import os
from pathlib import Path

REPO_DIR = '/content/video-summary'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/TCTri205/video-summary.git {REPO_DIR}
%cd {REPO_DIR}


In [None]:
from extraction_perception.extraction.extraction import VideoPreprocessor

# Input video and output root, both on the mounted Google Drive.
VIDEO_PATH = '/content/drive/MyDrive/video-summary/input/raw_video.mp4'
OUTPUT_ROOT = '/content/drive/MyDrive/video-summary/processed'

# Fail fast with an actionable message when the input is missing (e.g. Drive
# not mounted or the video not uploaded) instead of surfacing a less obvious
# error from inside the extraction pipeline.
if not Path(VIDEO_PATH).is_file():
    raise FileNotFoundError(
        f'Input video not found: {VIDEO_PATH}. '
        'Check that Google Drive is mounted and the file exists at this path.'
    )

# Run the three pipeline stages in order: scene detection, audio extraction,
# then keyframe/metadata extraction driven by the detected scene timestamps.
processor = VideoPreprocessor(video_path=VIDEO_PATH, output_root=OUTPUT_ROOT)
timestamps = processor.detect_scenes()
audio_path = processor.extract_audio()
metadata = processor.extract_keyframes_and_metadata(timestamps)

# Short run summary. `metadata` is read with .get(), so a missing
# 'total_keyframes' key is reported as 0.
print('Detected scenes:', len(timestamps))
print('Audio path:', audio_path)
print('Keyframes:', metadata.get('total_keyframes', 0))


In [None]:
from pathlib import Path

# Outputs are checked under <OUTPUT_ROOT>/<video file stem>/extraction.
video_name = Path(VIDEO_PATH).stem
base = Path(OUTPUT_ROOT) / video_name / 'extraction'

# (label, path) pairs for each expected artifact, in report order.
artifact_checks = (
    ('scene_metadata exists:', base / 'scene_metadata.json'),
    ('audio_16k exists:', base / 'audio' / 'audio_16k.wav'),
    ('keyframes dir exists:', base / 'keyframes'),
)

# Report presence only; this does not validate the contents of any artifact.
for label, artifact in artifact_checks:
    print(label, artifact.exists())
