# Module 3 - Reasoning NLP (Colab)

This notebook runs the Reasoning-NLP pipeline (stages G1 -> G8) and validates the resulting artifacts.


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the input and artifact paths used by
# later cells are all located under this mount point.
drive.mount('/content/drive')


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg with apt-get unless an ffmpeg executable is already present.

    The executable is first looked up on PATH, so an ffmpeg installed outside
    /usr/bin (for example /usr/local/bin or a conda environment) is detected
    and no redundant apt-get run is triggered. The fixed /usr/bin/ffmpeg
    location is still checked as a fallback.

    Raises:
        subprocess.CalledProcessError: If an apt-get command exits non-zero.
    """
    # Function-scope import keeps this cell self-contained.
    import shutil

    if shutil.which('ffmpeg') or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    # Refresh the package index before installing.
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    """Install, via pip, every required package whose module cannot be found.

    Prints a short notice and does nothing else when all required modules
    are already importable.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    # Maps importable module name -> name passed to `pip install`.
    required = {'jsonschema': 'jsonschema'}

    missing = []
    for module_name, dist_name in required.items():
        if importlib.util.find_spec(module_name) is None:
            missing.append(dist_name)

    if missing:
        # Use the running interpreter's pip so packages land in this environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
        return
    print('python packages already satisfied')

# Run the environment bootstrap: the ffmpeg binary first, then Python packages.
_ensure_ffmpeg()
_ensure_packages()


In [None]:
import os
import time
import subprocess
import torch
from pathlib import Path

# Clone the project repository the first time this cell runs (an existing
# checkout is reused as-is), then make it the working directory so that
# repo-relative paths and `python -m` module invocations resolve against it.
REPO_DIR = Path('/content/video-summary')
if not REPO_DIR.exists():
    subprocess.check_call(['git', 'clone', 'https://github.com/TCTri205/video-summary.git', str(REPO_DIR)])
os.chdir(REPO_DIR)

# Choose a runtime profile from the available accelerator:
#   no CUDA device              -> 'CPU'
#   device name containing 'L4' -> 'L4'
#   any other CUDA device       -> 'T4'
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    RUNTIME_PROFILE = 'L4' if 'L4' in gpu_name.upper() else 'T4'
else:
    gpu_name = 'CPU'
    RUNTIME_PROFILE = 'CPU'

# Value passed to the pipeline as --summarize-max-new-tokens, per profile.
SUMMARIZE_TOKENS_BY_PROFILE = {'CPU': 256, 'T4': 384, 'L4': 512}
SUMMARIZE_MAX_NEW_TOKENS = SUMMARIZE_TOKENS_BY_PROFILE[RUNTIME_PROFILE]

# Locations on the mounted Drive, plus a local working area for artifacts.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
PROCESSED_ROOT = DRIVE_ROOT.joinpath('processed')
RAW_VIDEO = DRIVE_ROOT.joinpath('input', 'raw_video.mp4')
ARTIFACTS_DRIVE = DRIVE_ROOT.joinpath('artifacts')
ARTIFACTS_LOCAL = Path('/content/video-summary-work/artifacts')

# Extraction inputs are stored per video, keyed by the raw video's file stem.
VIDEO_NAME = RAW_VIDEO.stem
EXTRACTION_DIR = PROCESSED_ROOT.joinpath(VIDEO_NAME, 'extraction')
AUDIO_TRANSCRIPTS = EXTRACTION_DIR.joinpath('audio_transcripts.json')
VISUAL_CAPTIONS = EXTRACTION_DIR.joinpath('visual_captions.json')

# Fail fast if any pipeline input is missing. Inputs are checked in order
# (raw video, audio transcripts, visual captions) and the first missing one
# is reported.
_required_inputs = (
    (RAW_VIDEO, f'Missing raw video: {RAW_VIDEO}'),
    (AUDIO_TRANSCRIPTS, f'Missing audio transcripts: {AUDIO_TRANSCRIPTS}'),
    (VISUAL_CAPTIONS, f'Missing visual captions: {VISUAL_CAPTIONS}'),
)
for _input_path, _missing_message in _required_inputs:
    if not _input_path.exists():
        raise FileNotFoundError(_missing_message)

# Run configuration flags.
REPLAY_MODE = False
KEEP_LAST_RUNS = 2
CLEAN_OLD_RUNS = True

# Take the run id from the VIDEO_SUMMARY_RUN_ID environment variable; when it
# is unset or blank, build one from the video name and the current Unix time.
RUN_ID = (
    os.environ.get('VIDEO_SUMMARY_RUN_ID', '').strip()
    or f'colab_rnlp_{VIDEO_NAME}_{int(time.time())}'
)

# Both artifact roots must exist before the pipeline and sync cells run.
ARTIFACTS_DRIVE.mkdir(parents=True, exist_ok=True)
ARTIFACTS_LOCAL.mkdir(parents=True, exist_ok=True)

# Echo the effective settings for this session.
print('RUNTIME_PROFILE =', RUNTIME_PROFILE)
print('SUMMARIZE_MAX_NEW_TOKENS =', SUMMARIZE_MAX_NEW_TOKENS)
print('RUN_ID =', RUN_ID)
print('REPLAY_MODE =', REPLAY_MODE)


In [None]:
import subprocess
import sys

# Replay mode points the runner at the Drive artifacts root; otherwise the
# local artifacts root is used.
if REPLAY_MODE:
    artifacts_root = ARTIFACTS_DRIVE
else:
    artifacts_root = ARTIFACTS_LOCAL

# Assemble the pipeline runner command line one option at a time.
cmd = [sys.executable, '-m', 'reasoning_nlp.pipeline_runner']
cmd += ['--audio-transcripts', str(AUDIO_TRANSCRIPTS)]
cmd += ['--visual-captions', str(VISUAL_CAPTIONS)]
cmd += ['--raw-video', str(RAW_VIDEO)]
cmd += ['--stage', 'g8']
cmd += ['--run-id', RUN_ID]
cmd += ['--artifacts-root', str(artifacts_root)]
cmd += ['--summarize-backend', 'local']
cmd += ['--summarize-fallback-backend', 'local']
cmd += ['--summarize-max-new-tokens', str(SUMMARIZE_MAX_NEW_TOKENS)]
if REPLAY_MODE:
    cmd += ['--replay']

# check_call raises CalledProcessError if the runner exits non-zero, so the
# completion message is only printed on success.
subprocess.check_call(cmd)
print('Pipeline completed')


In [None]:
import shutil
from pathlib import Path

# Destination of the sync is always the run's folder on Drive. The source is
# that same folder in replay mode, and the local run folder otherwise.
drive_run_dir = ARTIFACTS_DRIVE / RUN_ID
source_run_dir = drive_run_dir if REPLAY_MODE else ARTIFACTS_LOCAL / RUN_ID

def _copy_item(src: Path, dst: Path) -> None:
    """Copy a file or directory tree from *src* to *dst*.

    A missing *src* is silently skipped. Parent directories of *dst* are
    created as needed. An existing destination directory is removed first so
    the copy is an exact replacement rather than a merge.

    When *src* and *dst* refer to the same filesystem object the call is a
    no-op. Without this guard a file copy raises ``shutil.SameFileError`` and
    a directory copy deletes the source before trying to copy it.

    Args:
        src: Path of the file or directory to copy.
        dst: Path the copy should be written to.
    """
    if not src.exists():
        return
    # Nothing to do (and unsafe to proceed) when both paths are the same item.
    if dst.exists() and src.samefile(dst):
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    if src.is_dir():
        # copytree refuses to write into an existing directory by default.
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    else:
        # copy2 also preserves file metadata such as modification time.
        shutil.copy2(src, dst)

# Artifacts (relative to the run directory) that are copied to Drive.
keep_rel_paths = [
    'run_meta.json',
    'g1_validate/normalized_input.json',
    'g2_align/alignment_result.json',
    'g3_context/context_blocks.json',
    'g4_summarize/parse_meta.json',
    'g4_summarize/summary_script.internal.json',
    'g5_segment/summary_script.json',
    'g5_segment/summary_video_manifest.json',
    'g6_manifest/manifest_validation.json',
    'g7_assemble/render_meta.json',
    'g7_assemble/summary_video.mp4',
    'g8_qc/quality_report.json',
]
for rel_path in keep_rel_paths:
    _copy_item(source_run_dir / rel_path, drive_run_dir / rel_path)

# Optional retention: keep only the newest KEEP_LAST_RUNS 'colab_rnlp_*' run
# folders on Drive (ordered by directory mtime) and never delete the current run.
if CLEAN_OLD_RUNS and KEEP_LAST_RUNS > 0:
    all_runs = sorted(
        (
            entry
            for entry in ARTIFACTS_DRIVE.iterdir()
            if entry.is_dir() and entry.name.startswith('colab_rnlp_')
        ),
        key=lambda run_dir: run_dir.stat().st_mtime,
        reverse=True,
    )
    for stale_run in all_runs[KEEP_LAST_RUNS:]:
        if stale_run.name == RUN_ID:
            continue
        # Best-effort removal: errors while deleting are ignored.
        shutil.rmtree(stale_run, ignore_errors=True)

print('Synced balanced artifacts to Drive:', drive_run_dir)


In [None]:
from pathlib import Path
from IPython.display import Video
import subprocess
import sys

# Locate the synced artifacts of this run on Drive.
RUN_DIR = ARTIFACTS_DRIVE.joinpath(RUN_ID)
ALIGNMENT = RUN_DIR.joinpath('g2_align', 'alignment_result.json')
SCRIPT = RUN_DIR.joinpath('g5_segment', 'summary_script.json')
MANIFEST = RUN_DIR.joinpath('g5_segment', 'summary_video_manifest.json')
REPORT = RUN_DIR.joinpath('g8_qc', 'quality_report.json')

# Run the repository's artifact validator. The script and contracts paths are
# relative, so this relies on the working directory being the repo checkout.
# check_call raises CalledProcessError if the validator exits non-zero.
subprocess.check_call(
    [
        sys.executable,
        'docs/Reasoning-NLP/schema/validate_artifacts.py',
        '--alignment', str(ALIGNMENT),
        '--script', str(SCRIPT),
        '--manifest', str(MANIFEST),
        '--report', str(REPORT),
        '--contracts-dir', 'contracts/v1/template',
    ]
)

# Keep the Video(...) expression last so the notebook renders the player.
OUTPUT_VIDEO = RUN_DIR.joinpath('g7_assemble', 'summary_video.mp4')
print('Output video:', OUTPUT_VIDEO)
Video(str(OUTPUT_VIDEO), embed=True)
