# Module 2 - Perception (Colab)

This notebook runs automatic speech recognition (ASR) and visual captioning to generate:
- `audio_transcripts.json`
- `visual_captions.json`


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write paths under this mountpoint.
drive.mount('/content/drive')


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg with apt-get unless an ffmpeg binary is already available.

    The binary counts as available when it is found on ``PATH`` or at
    ``/usr/bin/ffmpeg``. Otherwise the apt package index is refreshed and the
    ``ffmpeg`` package is installed.

    Raises:
        subprocess.CalledProcessError: If an ``apt-get`` command exits non-zero.
    """
    # Local import: this cell's import list does not include shutil.
    import shutil

    # Check PATH first so an ffmpeg living outside /usr/bin (for example in
    # /usr/local/bin) does not trigger a needless reinstall.
    if shutil.which('ffmpeg') or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages(requirements=None):
    """Install any required Python packages that cannot be imported yet.

    Args:
        requirements: Optional mapping of importable module name to the pip
            distribution name that provides it. When omitted, the notebook's
            own requirements are used (faster-whisper, transformers, pillow,
            tqdm).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    if requirements is None:
        # Module name (as imported) -> distribution name (as installed).
        requirements = {
            'faster_whisper': 'faster-whisper',
            'transformers': 'transformers',
            'PIL': 'pillow',
            'tqdm': 'tqdm',
        }
    missing = [
        dist
        for module, dist in requirements.items()
        if importlib.util.find_spec(module) is None
    ]
    if not missing:
        print('python packages already satisfied')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
    # Packages installed after the interpreter started can be hidden by the
    # import system's cached directory listings; reset them so the freshly
    # installed modules are importable in this same session.
    importlib.invalidate_caches()

# Run both dependency checks when the cell executes; each one only prints a
# status message when its dependency is already present.
_ensure_ffmpeg()
_ensure_packages()


In [None]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Local checkout location; the branch can be overridden through the
# VIDEO_SUMMARY_BRANCH environment variable.
REPO_DIR = Path('/content/video-summary')
BRANCH_NAME = os.environ.get('VIDEO_SUMMARY_BRANCH', '02-member-2-reasoning-nlp')

if REPO_DIR.exists():
    # Existing checkout: refresh it in place and switch to the requested branch.
    os.chdir(REPO_DIR)
    for git_args in (
        ('fetch', 'origin'),
        ('checkout', BRANCH_NAME),
        ('pull', 'origin', BRANCH_NAME),
    ):
        subprocess.check_call(['git', *git_args])
else:
    # First run: clone only the requested branch.
    clone_cmd = [
        'git', 'clone', '--single-branch', '--branch', BRANCH_NAME,
        'https://github.com/TCTri205/video-summary.git', str(REPO_DIR),
    ]
    subprocess.check_call(clone_cmd)

# Continue from the repository root in either case.
os.chdir(REPO_DIR)

# Google Drive locations: the source video and the folder for processed outputs.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
INPUT_VIDEO_DRIVE = DRIVE_ROOT / 'input' / 'raw_video.mp4'
PROCESSED_DRIVE = DRIVE_ROOT / 'processed'

# Working copies on the Colab VM's local disk.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_INPUT_DIR = LOCAL_ROOT / 'input'
LOCAL_PROCESSED = LOCAL_ROOT / 'processed'
LOCAL_INPUT_VIDEO = LOCAL_INPUT_DIR / INPUT_VIDEO_DRIVE.name

# Fail before creating anything when Drive is not mounted. Otherwise
# mkdir(parents=True) below would build a plain local /content/drive/MyDrive/...
# tree, and google.colab's drive.mount() rejects a mountpoint that already
# contains files.
if not DRIVE_ROOT.parent.is_dir():
    raise FileNotFoundError(
        f'Google Drive does not appear to be mounted: {DRIVE_ROOT.parent} not found'
    )

for path in [DRIVE_ROOT, PROCESSED_DRIVE, LOCAL_INPUT_DIR, LOCAL_PROCESSED]:
    path.mkdir(parents=True, exist_ok=True)

if not INPUT_VIDEO_DRIVE.exists():
    raise FileNotFoundError(f'Missing input video: {INPUT_VIDEO_DRIVE}')

# Copy the video locally only when there is no local copy yet or its size
# differs from the Drive file (e.g. the source changed or a copy was cut short).
needs_copy = (
    not LOCAL_INPUT_VIDEO.exists()
    or LOCAL_INPUT_VIDEO.stat().st_size != INPUT_VIDEO_DRIVE.stat().st_size
)
if needs_copy:
    shutil.copy2(INPUT_VIDEO_DRIVE, LOCAL_INPUT_VIDEO)

# String forms of the local paths, plus the video's base name without extension.
VIDEO_PATH = str(LOCAL_INPUT_VIDEO)
OUTPUT_ROOT = str(LOCAL_PROCESSED)
VIDEO_NAME = Path(VIDEO_PATH).stem
print('VIDEO_NAME =', VIDEO_NAME)


In [None]:
import gc
import shutil
import torch
from extraction_perception.extraction.whisper_module import WhisperExtractor
from extraction_perception.perception.caption import VisualCaptioner

# NOTE(review): the names checked below are read by this cell but not assigned
# in it; they are presumably defined by an earlier cell — confirm. Checking up
# front gives a clear error before any model is loaded.
_REQUIRED_NAMES = (
    'AUDIO_PATH', 'PROCESSED_ROOT', 'VIDEO_NAME', 'EXTRACTION_DIR',
    'METADATA_PATH', 'CAPTIONS_PATH', 'CAPTION_BATCH_SIZE', 'DRIVE_EXTRACTION',
)
_missing_names = [name for name in _REQUIRED_NAMES if name not in globals()]
if _missing_names:
    raise NameError(f'Run the earlier setup cells first; undefined: {_missing_names}')


def _release_memory():
    """Run garbage collection and, when CUDA is available, empty the CUDA cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# Use the GPU with float16 when one is available; otherwise CPU with int8.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
compute_type = 'float16' if device == 'cuda' else 'int8'

# Step 1: speech recognition (Vietnamese) on the audio input.
asr = WhisperExtractor(model_size='base', device=device, compute_type=compute_type)
asr.transcribe(
    input_path=str(AUDIO_PATH),
    language='vi',
    output_root=PROCESSED_ROOT,
    output_name=VIDEO_NAME,
)
# The ASR model is not used again, so drop it before the captioner is created;
# this keeps both models from being resident at the same time.
del asr
_release_memory()

# Presumably written by asr.transcribe above — confirm the output location.
audio_transcripts_local = EXTRACTION_DIR / 'audio_transcripts.json'

# Step 2: visual captioning driven by the metadata file.
captioner = VisualCaptioner()
captioner.caption_from_metadata(
    metadata_path=str(METADATA_PATH),
    output_path=str(CAPTIONS_PATH),
    batch_size=CAPTION_BATCH_SIZE,
)
del captioner
_release_memory()

# Step 3: copy both result files to the Drive extraction folder and report.
DRIVE_EXTRACTION.mkdir(parents=True, exist_ok=True)
shutil.copy2(audio_transcripts_local, DRIVE_EXTRACTION / 'audio_transcripts.json')
shutil.copy2(CAPTIONS_PATH, DRIVE_EXTRACTION / 'visual_captions.json')

print('audio_transcripts:', (DRIVE_EXTRACTION / 'audio_transcripts.json').exists())
print('visual_captions:', (DRIVE_EXTRACTION / 'visual_captions.json').exists())
