# Module 2 - Perception (Colab)

This notebook runs automatic speech recognition (ASR) and visual captioning to generate:
- `audio_transcripts.json`
- `visual_captions.json`


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the Drive paths used later in this
# notebook (under /content/drive/MyDrive/video-summary) live beneath this mount.
drive.mount('/content/drive')


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg through apt-get unless an ffmpeg binary is already available.

    The binary is looked up on ``PATH`` first, so an ffmpeg that lives outside
    ``/usr/bin`` (for example under ``/usr/local/bin``) is detected as well and
    no redundant ``apt-get`` run is triggered. The fixed ``/usr/bin/ffmpeg``
    location is still checked as a fallback.

    Raises:
        subprocess.CalledProcessError: If an ``apt-get`` command exits with a
            non-zero status.
    """
    # Local import: the import list of this cell does not include shutil.
    import shutil

    if shutil.which('ffmpeg') is not None or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    """Pip-install whichever required Python packages cannot currently be imported.

    Each importable module name is mapped to the pip distribution that
    provides it. Only distributions whose module has no import spec are
    installed, using the running interpreter's own pip.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    # import name -> pip distribution name
    required = {
        'faster_whisper': 'faster-whisper',
        'transformers': 'transformers',
        'PIL': 'pillow',
        'tqdm': 'tqdm',
    }
    missing = [
        dist
        for module, dist in required.items()
        if importlib.util.find_spec(module) is None
    ]
    if not missing:
        print('python packages already satisfied')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
    # Packages installed while the interpreter is running can be hidden by the
    # import system's cached directory listings; clear those caches so the
    # newly installed packages are importable without restarting the runtime.
    importlib.invalidate_caches()

# Run both environment checks: the ffmpeg binary first, then the Python packages.
_ensure_ffmpeg()
_ensure_packages()


In [None]:
import os
import shutil
import subprocess
import torch
from pathlib import Path

# Report which accelerator PyTorch can see.
cuda_available = torch.cuda.is_available()
print('CUDA available:', cuda_available)
gpu_name = torch.cuda.get_device_name(0) if cuda_available else 'CPU'
print('GPU:', gpu_name)

# Runtime profile: no CUDA -> 'CPU'; a device name containing "L4" -> 'L4';
# every other GPU is treated as 'T4'.
if cuda_available:
    RUNTIME_PROFILE = 'L4' if 'L4' in gpu_name.upper() else 'T4'
else:
    RUNTIME_PROFILE = 'CPU'

# Caption batch size for each runtime profile.
CAPTION_BATCH_BY_PROFILE = {'CPU': 1, 'T4': 4, 'L4': 8}
CAPTION_BATCH_SIZE = CAPTION_BATCH_BY_PROFILE[RUNTIME_PROFILE]

print('RUNTIME_PROFILE =', RUNTIME_PROFILE)
print('CAPTION_BATCH_SIZE =', CAPTION_BATCH_SIZE)

# Clone the project repository on first use, then make it the working directory.
REPO_DIR = Path('/content/video-summary')
if not REPO_DIR.exists():
    clone_cmd = ['git', 'clone', 'https://github.com/TCTri205/video-summary.git', str(REPO_DIR)]
    subprocess.check_call(clone_cmd)
os.chdir(REPO_DIR)

# Drive-side layout: the raw input video and the root for processed artifacts.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
RAW_VIDEO_DRIVE = DRIVE_ROOT.joinpath('input', 'raw_video.mp4')
PROCESSED_DRIVE = DRIVE_ROOT.joinpath('processed')

# Stop immediately when the input video is not on Drive.
if not RAW_VIDEO_DRIVE.exists():
    raise FileNotFoundError(f'Missing input video: {RAW_VIDEO_DRIVE}')

# Artifacts are grouped per video, keyed by the video file name without extension.
VIDEO_NAME = RAW_VIDEO_DRIVE.stem
DRIVE_EXTRACTION = PROCESSED_DRIVE.joinpath(VIDEO_NAME, 'extraction')

# Local working tree that mirrors the Drive "processed/<video>/extraction" layout.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_EXTRACTION = LOCAL_ROOT.joinpath('processed', VIDEO_NAME, 'extraction')
LOCAL_EXTRACTION.mkdir(parents=True, exist_ok=True)

# Extraction artifacts to stage locally, as (path relative to the Drive
# extraction directory, path relative to the local extraction directory).
required_inputs = [
    (rel, rel)
    for rel in ('audio/audio_16k.wav', 'scene_metadata.json', 'keyframes')
]
for src_rel, dst_rel in required_inputs:
    src, dst = DRIVE_EXTRACTION / src_rel, LOCAL_EXTRACTION / dst_rel
    if not src.exists():
        raise FileNotFoundError(f'Missing extraction artifact: {src}')
    dst.parent.mkdir(parents=True, exist_ok=True)
    if not src.is_dir():
        # Single-file artifact: copy it together with its metadata.
        shutil.copy2(src, dst)
        continue
    # Directory artifact: remove any earlier local copy first, because
    # copytree is called here without permission to reuse an existing target.
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

# Local paths passed to the ASR and captioning calls later in the notebook.
EXTRACTION_DIR = LOCAL_EXTRACTION
PROCESSED_ROOT = str(LOCAL_ROOT.joinpath('processed'))
AUDIO_PATH = EXTRACTION_DIR.joinpath('audio', 'audio_16k.wav')
METADATA_PATH = EXTRACTION_DIR.joinpath('scene_metadata.json')
CAPTIONS_PATH = EXTRACTION_DIR.joinpath('visual_captions.json')


In [None]:
import gc
import shutil
import torch
from extraction_perception.extraction.whisper_module import WhisperExtractor
from extraction_perception.perception.caption import VisualCaptioner

# Use the GPU with float16 when CUDA is available; otherwise run on CPU with int8.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
compute_type = 'float16' if device == 'cuda' else 'int8'

# --- Speech recognition on the staged audio track ---
asr = WhisperExtractor(model_size='base', device=device, compute_type=compute_type)
asr.transcribe(
    input_path=str(AUDIO_PATH),
    language='vi',
    output_root=PROCESSED_ROOT,
    output_name=VIDEO_NAME,
)

# The ASR model is not used past this point. Release it before the captioner
# is created so the two models do not have to be resident at the same time.
del asr
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# NOTE(review): assumes transcribe() writes its result to
# <output_root>/<output_name>/extraction/audio_transcripts.json - confirm
# against WhisperExtractor.
audio_transcripts_local = EXTRACTION_DIR / 'audio_transcripts.json'
if not audio_transcripts_local.exists():
    # Fail before the captioning pass rather than at the final copy step.
    raise FileNotFoundError(f'Missing ASR output: {audio_transcripts_local}')

# --- Visual captioning driven by the scene metadata ---
captioner = VisualCaptioner()
captioner.caption_from_metadata(
    metadata_path=str(METADATA_PATH),
    output_path=str(CAPTIONS_PATH),
    batch_size=CAPTION_BATCH_SIZE,
)

# --- Copy both result files back to the Drive extraction directory ---
DRIVE_EXTRACTION.mkdir(parents=True, exist_ok=True)
shutil.copy2(audio_transcripts_local, DRIVE_EXTRACTION / 'audio_transcripts.json')
shutil.copy2(CAPTIONS_PATH, DRIVE_EXTRACTION / 'visual_captions.json')

print('audio_transcripts:', (DRIVE_EXTRACTION / 'audio_transcripts.json').exists())
print('visual_captions:', (DRIVE_EXTRACTION / 'visual_captions.json').exists())

# Release the captioner and hand cached GPU memory back to the driver.
del captioner
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
