# Module 3 - Reasoning NLP (Colab)

This notebook runs the Reasoning-NLP pipeline (stages G1 -> G8) and validates the resulting artifacts.


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read inputs from and
# write results to paths under this mount point.
drive.mount('/content/drive')


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg via apt-get unless an ffmpeg executable is already available.

    ffmpeg counts as available when it can be found on PATH or when
    /usr/bin/ffmpeg exists. Checking PATH avoids a needless (and, without
    apt or root access, failing) install when ffmpeg lives somewhere other
    than /usr/bin.

    Raises:
        subprocess.CalledProcessError: if an apt-get command exits non-zero.
    """
    # Local import: this cell's import list does not include shutil.
    import shutil

    if shutil.which('ffmpeg') or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    # Refresh the package index first so the install resolves current versions.
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    """Install required Python packages whose modules cannot be found.

    Each entry maps an importable module name to the pip distribution that
    provides it. Only distributions whose module is missing are installed.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    # importable module name -> pip distribution name
    required = {'jsonschema': 'jsonschema'}
    missing = []
    for module_name, pip_name in required.items():
        if importlib.util.find_spec(module_name) is None:
            missing.append(pip_name)
    if missing:
        # Use the running interpreter's pip so packages land in this environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
        return
    print('python packages already satisfied')

# Run the environment checks defined above; each one installs only what is missing.
_ensure_ffmpeg()
_ensure_packages()


In [None]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Where the repository is checked out, and which branch to run.
# The branch can be overridden through the VIDEO_SUMMARY_BRANCH environment variable.
REPO_DIR = Path('/content/video-summary')
BRANCH_NAME = os.environ.get('VIDEO_SUMMARY_BRANCH', '02-member-2-reasoning-nlp')

if REPO_DIR.exists():
    # Existing checkout: update it in place from origin.
    os.chdir(REPO_DIR)
    for git_args in (
        ['fetch', 'origin'],
        ['checkout', BRANCH_NAME],
        ['pull', 'origin', BRANCH_NAME],
    ):
        subprocess.check_call(['git', *git_args])
else:
    # First run: clone only the requested branch.
    subprocess.check_call([
        'git', 'clone', '--single-branch', '--branch', BRANCH_NAME,
        'https://github.com/TCTri205/video-summary.git', str(REPO_DIR),
    ])

# Later cells use repo-relative paths, so work from the repo root.
os.chdir(REPO_DIR)

# Locations on the mounted Drive.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
INPUT_VIDEO_DRIVE = DRIVE_ROOT / 'input' / 'raw_video.mp4'
PROCESSED_DRIVE = DRIVE_ROOT / 'processed'

# Working locations on the Colab VM's local disk.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_INPUT_DIR = LOCAL_ROOT / 'input'
LOCAL_PROCESSED = LOCAL_ROOT / 'processed'
LOCAL_INPUT_VIDEO = LOCAL_INPUT_DIR / INPUT_VIDEO_DRIVE.name

for directory in (DRIVE_ROOT, PROCESSED_DRIVE, LOCAL_INPUT_DIR, LOCAL_PROCESSED):
    directory.mkdir(parents=True, exist_ok=True)

if not INPUT_VIDEO_DRIVE.exists():
    raise FileNotFoundError(f'Missing input video: {INPUT_VIDEO_DRIVE}')

# Copy the video locally unless a local copy with the same byte size already exists.
if not (
    LOCAL_INPUT_VIDEO.exists()
    and LOCAL_INPUT_VIDEO.stat().st_size == INPUT_VIDEO_DRIVE.stat().st_size
):
    shutil.copy2(INPUT_VIDEO_DRIVE, LOCAL_INPUT_VIDEO)

VIDEO_PATH = str(LOCAL_INPUT_VIDEO)
OUTPUT_ROOT = str(LOCAL_PROCESSED)
VIDEO_NAME = LOCAL_INPUT_VIDEO.stem
print('VIDEO_NAME =', VIDEO_NAME)


In [None]:
from datetime import datetime

# --- Run switches ---
REPLAY_MODE = False
SUMMARIZE_MAX_NEW_TOKENS = 512
CLEAN_OLD_RUNS = True
KEEP_LAST_RUNS = 3

# --- Artifact and deliverable roots (local working copy and Drive copy) ---
ARTIFACTS_LOCAL = LOCAL_PROCESSED / 'artifacts'
ARTIFACTS_DRIVE = DRIVE_ROOT / 'artifacts'
DELIVERABLES_LOCAL = LOCAL_PROCESSED / 'deliverables'
DELIVERABLES_DRIVE = DRIVE_ROOT / 'deliverables'
for path in (ARTIFACTS_LOCAL, ARTIFACTS_DRIVE, DELIVERABLES_LOCAL, DELIVERABLES_DRIVE):
    path.mkdir(parents=True, exist_ok=True)

# Timestamp-based identifier; the 'colab_rnlp_' prefix is what the cleanup step matches on.
RUN_ID = datetime.now().strftime('colab_rnlp_%Y%m%d_%H%M%S')

# --- Pipeline inputs: extraction outputs stored on Drive, plus the raw video ---
_extraction_dir = PROCESSED_DRIVE / VIDEO_NAME / 'extraction'
AUDIO_TRANSCRIPTS = _extraction_dir / 'audio_transcripts.json'
VISUAL_CAPTIONS = _extraction_dir / 'visual_captions.json'
RAW_VIDEO = INPUT_VIDEO_DRIVE

# Both extraction files must exist before the pipeline can run.
if not (AUDIO_TRANSCRIPTS.exists() and VISUAL_CAPTIONS.exists()):
    raise FileNotFoundError('Missing perception outputs on Drive. Please run module1 + module2 first.')

print('REPLAY_MODE =', REPLAY_MODE)
print('RUN_ID =', RUN_ID)


In [None]:
import subprocess
import sys

# Replay mode points the pipeline at the Drive roots; otherwise the local roots are used.
if REPLAY_MODE:
    artifacts_root, deliverables_root = ARTIFACTS_DRIVE, DELIVERABLES_DRIVE
else:
    artifacts_root, deliverables_root = ARTIFACTS_LOCAL, DELIVERABLES_LOCAL
print('artifacts_root =', artifacts_root)
print('deliverables_root =', deliverables_root)

# Option/value pairs in the order they appear on the command line.
pipeline_options = [
    ('--audio-transcripts', AUDIO_TRANSCRIPTS),
    ('--visual-captions', VISUAL_CAPTIONS),
    ('--raw-video', RAW_VIDEO),
    ('--stage', 'g8'),
    ('--run-id', RUN_ID),
    ('--artifacts-root', artifacts_root),
    ('--deliverables-root', deliverables_root),
    ('--summarize-backend', 'heuristic'),
    ('--summarize-fallback-backend', 'heuristic'),
    ('--summarize-max-new-tokens', SUMMARIZE_MAX_NEW_TOKENS),
]

# Run the pipeline module with the same interpreter that runs this notebook.
cmd = [sys.executable, '-m', 'reasoning_nlp.pipeline_runner']
for option, value in pipeline_options:
    cmd.extend([option, str(value)])
if REPLAY_MODE:
    cmd.append('--replay')

# check_call raises CalledProcessError if the pipeline exits non-zero.
subprocess.check_call(cmd)
print('Pipeline completed')


In [None]:
import shutil
from pathlib import Path

# The source side is whichever root the pipeline was pointed at for this run;
# the destination side is always on Drive.
if REPLAY_MODE:
    source_run_dir = ARTIFACTS_DRIVE / RUN_ID
    source_deliverable_dir = DELIVERABLES_DRIVE / RUN_ID
else:
    source_run_dir = ARTIFACTS_LOCAL / RUN_ID
    source_deliverable_dir = DELIVERABLES_LOCAL / RUN_ID
drive_run_dir = ARTIFACTS_DRIVE / RUN_ID
drive_deliverable_dir = DELIVERABLES_DRIVE / RUN_ID

def _copy_item(src: Path, dst: Path) -> None:
    """Copy a file or directory from ``src`` to ``dst``, replacing any existing ``dst``.

    Does nothing when ``src`` does not exist, or when ``src`` and ``dst``
    resolve to the same location. Missing parent directories of ``dst`` are
    created.

    Args:
        src: File or directory to copy.
        dst: Destination path. An existing directory at ``dst`` is removed
            before a directory copy.
    """
    if not src.exists():
        return
    # When REPLAY_MODE is on, the source and Drive directories are the same.
    # Copying a file onto itself raises shutil.SameFileError, and for a
    # directory the rmtree below would delete the source before it is copied.
    if src.resolve() == dst.resolve():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    if src.is_dir():
        # copytree requires that the destination does not exist yet.
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    else:
        shutil.copy2(src, dst)

# Run-relative artifact files that are copied to Drive.
keep_rel_paths = [
    'run_meta.json',
    'g1_validate/normalized_input.json',
    'g2_align/alignment_result.json',
    'g3_context/context_blocks.json',
    'g4_summarize/parse_meta.json',
    'g4_summarize/summary_script.internal.json',
    'g5_segment/summary_script.json',
    'g5_segment/summary_video_manifest.json',
    'g6_manifest/manifest_validation.json',
    'g7_assemble/render_meta.json',
    'g7_assemble/summary_video.mp4',
    'g8_qc/quality_report.json',
]
for rel in keep_rel_paths:
    _copy_item(source_run_dir / rel, drive_run_dir / rel)

# Final deliverables are synced separately from the per-stage artifacts.
for rel in ('summary_video.mp4', 'summary_text.txt'):
    _copy_item(source_deliverable_dir / rel, drive_deliverable_dir / rel)

if CLEAN_OLD_RUNS and KEEP_LAST_RUNS > 0:
    # Rank this notebook's run directories on Drive, most recently modified first.
    all_runs = sorted(
        (p for p in ARTIFACTS_DRIVE.iterdir() if p.is_dir() and p.name.startswith('colab_rnlp_')),
        key=lambda run: run.stat().st_mtime,
        reverse=True,
    )
    # Delete everything past the retention window, but never the current run.
    stale_runs = [run for run in all_runs[KEEP_LAST_RUNS:] if run.name != RUN_ID]
    for old in stale_runs:
        shutil.rmtree(old, ignore_errors=True)

print('Synced balanced artifacts to Drive:', drive_run_dir)
print('Synced final deliverables to Drive:', drive_deliverable_dir)


In [None]:
from pathlib import Path
from IPython.display import Video
import subprocess
import sys

# Validate the copies that were synced to Drive for this run.
RUN_DIR = ARTIFACTS_DRIVE / RUN_ID
FINAL_DIR = DELIVERABLES_DRIVE / RUN_ID
ALIGNMENT = RUN_DIR.joinpath('g2_align', 'alignment_result.json')
SCRIPT = RUN_DIR.joinpath('g5_segment', 'summary_script.json')
MANIFEST = RUN_DIR.joinpath('g5_segment', 'summary_video_manifest.json')
REPORT = RUN_DIR.joinpath('g8_qc', 'quality_report.json')

# The validator script and contracts directory are relative paths, so this
# relies on the working directory set by the earlier checkout cell.
validator_cmd = [sys.executable, 'docs/Reasoning-NLP/schema/validate_artifacts.py']
for option, artifact in (
    ('--alignment', ALIGNMENT),
    ('--script', SCRIPT),
    ('--manifest', MANIFEST),
    ('--report', REPORT),
):
    validator_cmd.extend([option, str(artifact)])
validator_cmd.extend(['--contracts-dir', 'contracts/v1/template'])
# check_call raises CalledProcessError if the validator exits non-zero.
subprocess.check_call(validator_cmd)

OUTPUT_VIDEO = FINAL_DIR / 'summary_video.mp4'
OUTPUT_TEXT = FINAL_DIR / 'summary_text.txt'
print('Output video:', OUTPUT_VIDEO)
print('Output text:', OUTPUT_TEXT)
print('--- Summary text preview ---')
summary_preview = OUTPUT_TEXT.read_text(encoding='utf-8')
print(summary_preview)
# Keep this as the cell's last expression so the notebook renders the video.
Video(str(OUTPUT_VIDEO), embed=True)
