# Full Pipeline (Module 1 -> 2 -> 3) on Colab

This notebook runs the complete system:
1. Module 1 Extraction
2. Module 2 Perception (ASR + captions)
3. Module 3 Reasoning NLP (G1->G8)


In [None]:
# Mount Google Drive: the input video and every pipeline output in this
# notebook are addressed through paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


In [None]:
!apt-get update -y
!apt-get install -y ffmpeg
!pip install scenedetect opencv-python-headless faster-whisper transformers pillow tqdm jsonschema


In [None]:
import os
import uuid
import torch
from pathlib import Path

REPO_DIR = '/content/video-summary'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/TCTri205/video-summary.git {REPO_DIR}
%cd {REPO_DIR}

print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))


In [None]:
from extraction_perception.extraction.extraction import VideoPreprocessor
from extraction_perception.extraction.whisper_module import WhisperExtractor
from extraction_perception.perception.caption import VisualCaptioner

# Pipeline configuration.
# DRIVE_ROOT is the single place to change when the project folder on Google
# Drive moves; the input, processed and artifact locations are derived from it.
DRIVE_ROOT = '/content/drive/MyDrive/video-summary'
RAW_VIDEO = f'{DRIVE_ROOT}/input/raw_video.mp4'
PROCESSED_ROOT = f'{DRIVE_ROOT}/processed'
ARTIFACTS_ROOT = f'{DRIVE_ROOT}/artifacts'

# Name of the video without its extension; used below to locate the
# per-video output directory.
VIDEO_NAME = Path(RAW_VIDEO).stem
# A fresh random suffix per execution keeps the artifacts of separate runs apart.
RUN_ID = f'colab_full_{uuid.uuid4().hex[:8]}'

print('VIDEO_NAME =', VIDEO_NAME)
print('RUN_ID =', RUN_ID)


## Step 1 - Module 1 Extraction

In [None]:
# Module 1: run the extraction stage on the raw input video.
# PROCESSED_ROOT is handed over as the output root; presumably every file
# produced here is written below it -- TODO confirm against VideoPreprocessor.
processor = VideoPreprocessor(video_path=RAW_VIDEO, output_root=PROCESSED_ROOT)
# NOTE(review): the calls are kept in this exact order; whether the later steps
# rely on side effects of the earlier ones is not visible from this notebook.
timestamps = processor.detect_scenes()
audio_path = processor.extract_audio()
# Keyframe/metadata extraction takes the scene timestamps detected above.
metadata = processor.extract_keyframes_and_metadata(timestamps)
# Short summary of what the stage produced.
print('Scenes:', len(timestamps))
print('Audio:', audio_path)
# Falls back to 0 when the metadata has no 'total_keyframes' entry.
print('Keyframes:', metadata.get('total_keyframes', 0))


## Step 2 - Module 2 Perception

In [None]:
# File locations for this video inside the processed tree. They are rebuilt
# from the configuration rather than taken from the Step 1 return values.
EXTRACTION_DIR = Path(PROCESSED_ROOT, VIDEO_NAME, 'extraction')
AUDIO_PATH = EXTRACTION_DIR.joinpath('audio', 'audio_16k.wav')
METADATA_PATH = EXTRACTION_DIR.joinpath('scene_metadata.json')
CAPTIONS_PATH = EXTRACTION_DIR.joinpath('visual_captions.json')

# Pick the inference device and a matching precision:
# float16 on GPU, int8 on CPU.
if torch.cuda.is_available():
    device, compute_type = 'cuda', 'float16'
else:
    device, compute_type = 'cpu', 'int8'

# Speech recognition on the extracted audio track (language fixed to 'vi').
asr = WhisperExtractor(
    model_size='base',
    device=device,
    compute_type=compute_type,
)
asr.transcribe(
    input_path=str(AUDIO_PATH),
    language='vi',
    output_root=PROCESSED_ROOT,
    output_name=VIDEO_NAME,
)

# Visual captioning driven by the scene metadata file.
captioner = VisualCaptioner()
captioner.caption_from_metadata(
    metadata_path=str(METADATA_PATH),
    output_path=str(CAPTIONS_PATH),
)

# Report whether both files expected by the next stage are present.
print('audio_transcripts:', EXTRACTION_DIR.joinpath('audio_transcripts.json').exists())
print('visual_captions:', CAPTIONS_PATH.exists())


## Step 3 - Module 3 Reasoning (G1->G8)

In [None]:
AUDIO_TRANSCRIPTS = EXTRACTION_DIR / 'audio_transcripts.json'
VISUAL_CAPTIONS = EXTRACTION_DIR / 'visual_captions.json'

!python -m reasoning_nlp.pipeline_runner \
  --audio-transcripts "{AUDIO_TRANSCRIPTS}" \
  --visual-captions "{VISUAL_CAPTIONS}" \
  --raw-video "{RAW_VIDEO}" \
  --stage g8 \
  --run-id "{RUN_ID}" \
  --artifacts-root "{ARTIFACTS_ROOT}"


In [None]:
RUN_DIR = Path(ARTIFACTS_ROOT) / RUN_ID
ALIGNMENT = RUN_DIR / 'g2_align' / 'alignment_result.json'
SCRIPT = RUN_DIR / 'g5_segment' / 'summary_script.json'
MANIFEST = RUN_DIR / 'g5_segment' / 'summary_video_manifest.json'
REPORT = RUN_DIR / 'g8_qc' / 'quality_report.json'

!python docs/Reasoning-NLP/schema/validate_artifacts.py \
  --alignment "{ALIGNMENT}" \
  --script "{SCRIPT}" \
  --manifest "{MANIFEST}" \
  --report "{REPORT}" \
  --contracts-dir contracts/v1/template


In [None]:
from IPython.display import Video

# Final summary video produced by the assemble stage of this run.
OUTPUT_VIDEO = RUN_DIR / 'g7_assemble' / 'summary_video.mp4'
print('Output video:', OUTPUT_VIDEO)

# Fail with a clear message when the file is missing. Passing a non-existent
# path positionally would make `Video` treat the path string as raw video data
# and render a broken player instead of reporting the problem.
if not OUTPUT_VIDEO.is_file():
    raise FileNotFoundError(f'Summary video not found: {OUTPUT_VIDEO}')

# embed=True inlines the video bytes into the cell output, so the player does
# not depend on the file being reachable through a URL.
Video(filename=str(OUTPUT_VIDEO), embed=True)
