# Module 2 - Perception (Colab)

This notebook runs automatic speech recognition (ASR) and visual captioning to generate:
- `audio_transcripts.json`
- `visual_captions.json`


In [None]:
# Mount Google Drive at /content/drive. The perception cell further down reads
# its inputs from, and writes its captions to, paths under
# /content/drive/MyDrive/video-summary/processed, so this must run first.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!apt-get update -y
!apt-get install -y ffmpeg
!pip install faster-whisper transformers pillow tqdm


In [None]:
import os
import torch

print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))

REPO_DIR = '/content/video-summary'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/TCTri205/video-summary.git {REPO_DIR}
%cd {REPO_DIR}


In [None]:
from pathlib import Path
from extraction_perception.extraction.whisper_module import WhisperExtractor
from extraction_perception.perception.caption import VisualCaptioner

# --- Configuration ---------------------------------------------------------
# VIDEO_NAME selects the per-video sub-folder under PROCESSED_ROOT.
VIDEO_NAME = 'video1'
PROCESSED_ROOT = '/content/drive/MyDrive/video-summary/processed'
EXTRACTION_DIR = Path(PROCESSED_ROOT) / VIDEO_NAME / 'extraction'
AUDIO_PATH = EXTRACTION_DIR / 'audio' / 'audio_16k.wav'        # ASR input
METADATA_PATH = EXTRACTION_DIR / 'scene_metadata.json'         # captioning input
CAPTIONS_PATH = EXTRACTION_DIR / 'visual_captions.json'        # captioning output

# --- Input validation ------------------------------------------------------
# Fail fast, before any model is loaded, when an input file is absent
# (for example Drive is not mounted or the extraction step has not been run).
missing_inputs = [str(path) for path in (AUDIO_PATH, METADATA_PATH) if not path.is_file()]
if missing_inputs:
    raise FileNotFoundError(
        'Missing perception input file(s): ' + ', '.join(missing_inputs)
        + '. Check that Google Drive is mounted and that extraction has been run for '
        + repr(VIDEO_NAME) + '.'
    )

# --- Speech recognition ----------------------------------------------------
# Use the GPU with float16 when CUDA is available; otherwise CPU with int8.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
compute_type = 'float16' if device == 'cuda' else 'int8'

asr = WhisperExtractor(model_size='base', device=device, compute_type=compute_type)
asr.transcribe(
    input_path=str(AUDIO_PATH),
    language='vi',
    output_root=PROCESSED_ROOT,
    output_name=VIDEO_NAME,
)

# --- Visual captioning -----------------------------------------------------
captioner = VisualCaptioner()
captioner.caption_from_metadata(metadata_path=str(METADATA_PATH), output_path=str(CAPTIONS_PATH))

# --- Output check ----------------------------------------------------------
# Print True/False for each expected output file.
# NOTE(review): assumes transcribe() writes audio_transcripts.json into
# EXTRACTION_DIR given output_root/output_name — confirm against WhisperExtractor.
print('audio_transcripts:', (EXTRACTION_DIR / 'audio_transcripts.json').exists())
print('visual_captions:', CAPTIONS_PATH.exists())
