# Module 3 - Reasoning NLP (Colab)

This notebook runs the Reasoning-NLP pipeline (stages G1 -> G8) on Google Colab and validates the artifacts it produces.


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read inputs from and write
# results to paths under /content/drive/MyDrive.
drive.mount('/content/drive')


In [None]:
import importlib.util
import subprocess
import sys
from pathlib import Path

def _ensure_ffmpeg():
    if Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    req = {'jsonschema': 'jsonschema'}
    miss = [pip for mod,pip in req.items() if importlib.util.find_spec(mod) is None]
    if not miss:
        print('python packages already satisfied')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *miss])

# Run both environment checks; each one installs only when its dependency is missing.
_ensure_ffmpeg()
_ensure_packages()


In [None]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Local checkout of the project and the branch to use; the branch can be
# overridden through the VIDEO_SUMMARY_BRANCH environment variable.
REPO_DIR = Path('/content/video-summary')
BRANCH_NAME = os.environ.get('VIDEO_SUMMARY_BRANCH', '02-member-2-reasoning-nlp')

if not REPO_DIR.exists():
    subprocess.check_call([
        'git', 'clone', '--single-branch', '--branch', BRANCH_NAME,
        'https://github.com/TCTri205/video-summary.git', str(REPO_DIR)
    ])
else:
    # The clone above is --single-branch, so the remote only tracks the branch it
    # was created with. Re-point it at BRANCH_NAME first; otherwise fetch/checkout
    # fail when BRANCH_NAME differs from the originally cloned branch.
    # Git runs with cwd=REPO_DIR so no chdir is needed before the commands.
    subprocess.check_call(['git', 'remote', 'set-branches', 'origin', BRANCH_NAME], cwd=REPO_DIR)
    subprocess.check_call(['git', 'fetch', 'origin'], cwd=REPO_DIR)
    subprocess.check_call(['git', 'checkout', BRANCH_NAME], cwd=REPO_DIR)
    subprocess.check_call(['git', 'pull', 'origin', BRANCH_NAME], cwd=REPO_DIR)

# Later cells use repository-relative paths and `python -m reasoning_nlp...`,
# so the working directory must be the checkout.
os.chdir(REPO_DIR)

# Locations on the mounted Google Drive.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
INPUT_VIDEO_DRIVE = DRIVE_ROOT / 'input' / 'raw_video.mp4'
PROCESSED_DRIVE = DRIVE_ROOT / 'processed'

# Working directories under /content, outside the Drive mount.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_INPUT_DIR = LOCAL_ROOT / 'input'
LOCAL_PROCESSED = LOCAL_ROOT / 'processed'
LOCAL_INPUT_VIDEO = LOCAL_INPUT_DIR / INPUT_VIDEO_DRIVE.name

# Create every directory used below; existing ones are left untouched.
for path in [DRIVE_ROOT, PROCESSED_DRIVE, LOCAL_INPUT_DIR, LOCAL_PROCESSED]:
    path.mkdir(parents=True, exist_ok=True)

# Fail early when the input video has not been placed on Drive.
if not INPUT_VIDEO_DRIVE.exists():
    raise FileNotFoundError(f'Missing input video: {INPUT_VIDEO_DRIVE}')

# Copy the video into the working directory unless a local copy with the same
# byte size already exists (size is the only staleness check performed).
if (not LOCAL_INPUT_VIDEO.exists()) or (LOCAL_INPUT_VIDEO.stat().st_size != INPUT_VIDEO_DRIVE.stat().st_size):
    shutil.copy2(INPUT_VIDEO_DRIVE, LOCAL_INPUT_VIDEO)

# String forms of the local paths, plus the video's file stem, which later
# cells use as the per-video directory name under PROCESSED_DRIVE.
VIDEO_PATH = str(LOCAL_INPUT_VIDEO)
OUTPUT_ROOT = str(LOCAL_PROCESSED)
VIDEO_NAME = Path(VIDEO_PATH).stem
print('VIDEO_NAME =', VIDEO_NAME)


In [None]:
from datetime import datetime
import importlib.util

# When True, the pipeline cell adds --replay and points the artifact and
# deliverable roots at Drive instead of the local working directory.
REPLAY_MODE = False
# Passed to the pipeline as --summarize-max-new-tokens.
SUMMARIZE_MAX_NEW_TOKENS = 512
# Backend preference order (primary, fallback, last resort). Each value comes
# from an environment variable and is normalized by stripping and lower-casing.
SUMMARIZE_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_BACKEND', 'local').strip().lower()
SUMMARIZE_FALLBACK_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_FALLBACK_BACKEND', 'api').strip().lower()
SUMMARIZE_LAST_RESORT_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_LAST_RESORT_BACKEND', 'api').strip().lower()
LOCAL_MODEL_VERSION = os.environ.get('VIDEO_SUMMARY_LOCAL_MODEL_VERSION', 'Qwen/Qwen2.5-3B-Instruct').strip()
# Enabled only when the variable is exactly '1' after stripping; any other value disables it.
QC_ENFORCE_THRESHOLDS = os.environ.get('VIDEO_SUMMARY_QC_ENFORCE_THRESHOLDS', '1').strip() == '1'
# Only these two names may be configured here. The pipeline cell can still fall
# back to 'heuristic' on its own when neither configured backend is ready.
_valid_backends = {'api', 'local'}
if (
    SUMMARIZE_BACKEND not in _valid_backends
    or SUMMARIZE_FALLBACK_BACKEND not in _valid_backends
    or SUMMARIZE_LAST_RESORT_BACKEND not in _valid_backends
):
    raise ValueError('Invalid summarize backend. Supported: api, local')
if not LOCAL_MODEL_VERSION:
    raise ValueError('VIDEO_SUMMARY_LOCAL_MODEL_VERSION must not be empty')

def _preflight_backend_support() -> dict:
    has_transformers = importlib.util.find_spec('transformers') is not None
    has_api_base = bool(os.environ.get('OPENAI_BASE_URL', '').strip())
    has_api_key = bool(os.environ.get('OPENAI_API_KEY', '').strip())
    api_ready = has_api_base and has_api_key
    local_ready = has_transformers
    if not os.environ.get('OPENAI_MODEL', '').strip():
        os.environ['OPENAI_MODEL'] = LOCAL_MODEL_VERSION
    return {
        'local_ready': local_ready,
        'api_ready': api_ready,
        'has_api_base': has_api_base,
        'has_api_key': has_api_key,
    }

# Evaluate backend prerequisites once; the pipeline cell uses the result to
# filter the backend chain.
BACKEND_PREFLIGHT = _preflight_backend_support()

# Artifact and deliverable roots: a local working copy and a Drive copy of each.
ARTIFACTS_LOCAL = LOCAL_PROCESSED / 'artifacts'
ARTIFACTS_DRIVE = DRIVE_ROOT / 'artifacts'
DELIVERABLES_LOCAL = LOCAL_PROCESSED / 'deliverables'
DELIVERABLES_DRIVE = DRIVE_ROOT / 'deliverables'
for path in [ARTIFACTS_LOCAL, ARTIFACTS_DRIVE, DELIVERABLES_LOCAL, DELIVERABLES_DRIVE]:
    path.mkdir(parents=True, exist_ok=True)

# Timestamped identifier for this run; it names the per-run sub-directories.
RUN_ID = datetime.now().strftime('colab_rnlp_%Y%m%d_%H%M%S')

# Retention settings consumed by the Drive sync cell when pruning old run directories.
CLEAN_OLD_RUNS = True
KEEP_LAST_RUNS = 3

# Inputs to the pipeline, read from Drive.
AUDIO_TRANSCRIPTS = PROCESSED_DRIVE / VIDEO_NAME / 'extraction' / 'audio_transcripts.json'
VISUAL_CAPTIONS = PROCESSED_DRIVE / VIDEO_NAME / 'extraction' / 'visual_captions.json'
# NOTE(review): the pipeline is given the Drive copy of the video, not the
# LOCAL_INPUT_VIDEO staged by the earlier cell — confirm this is intended.
RAW_VIDEO = INPUT_VIDEO_DRIVE

# Both extraction files must already exist on Drive before the pipeline can run.
if not AUDIO_TRANSCRIPTS.exists() or not VISUAL_CAPTIONS.exists():
    raise FileNotFoundError('Missing perception outputs on Drive. Please run module1 + module2 first.')

# Echo the effective configuration so it is recorded in the cell output.
print('REPLAY_MODE =', REPLAY_MODE)
print('RUN_ID =', RUN_ID)
print('SUMMARIZE_BACKEND =', SUMMARIZE_BACKEND)
print('SUMMARIZE_FALLBACK_BACKEND =', SUMMARIZE_FALLBACK_BACKEND)
print('SUMMARIZE_LAST_RESORT_BACKEND =', SUMMARIZE_LAST_RESORT_BACKEND)
print('LOCAL_MODEL_VERSION =', LOCAL_MODEL_VERSION)
print('BACKEND_PREFLIGHT =', BACKEND_PREFLIGHT)
print('QC_ENFORCE_THRESHOLDS =', QC_ENFORCE_THRESHOLDS)


In [None]:
import subprocess
import sys

def _build_backend_chain(primary: str, fallback: str, last_resort: str) -> list[str]:
    chain = []
    for item in [primary, fallback, last_resort]:
        if item and item not in chain:
            chain.append(item)
    return chain

def _select_chain_by_preflight(chain: list[str], preflight: dict) -> list[str]:
    ready = []
    for b in chain:
        if b == 'local':
            if preflight.get('local_ready', False):
                ready.append(b)
        elif b == 'api':
            if preflight.get('api_ready', False):
                ready.append(b)
        else:
            ready.append(b)
    return ready or ['heuristic']

def _run_pipeline_with_backend_chain(base_cmd: list[str], backend_chain: list[str]) -> None:
    collected = []
    max_tail_lines = 400
    for idx, backend in enumerate(backend_chain):
        next_backend = backend_chain[idx + 1] if idx + 1 < len(backend_chain) else backend_chain[-1]
        cmd_try = list(base_cmd) + ['--summarize-backend', backend, '--summarize-fallback-backend', next_backend]
        print(f'Attempt {idx+1}/{len(backend_chain)} with backend={backend}, fallback={next_backend}')

        proc = subprocess.Popen(
            cmd_try,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )

        tail_lines = []
        if proc.stdout is not None:
            for line in proc.stdout:
                print(line, end='')
                tail_lines.append(line)
                if len(tail_lines) > max_tail_lines:
                    tail_lines = tail_lines[-max_tail_lines:]

        return_code = proc.wait()
        tail_text = ''.join(tail_lines)

        if return_code == 0:
            print(f'Pipeline success with backend={backend}')
            return

        collected.append((backend, return_code, tail_text))
        print(f'Pipeline failed with backend={backend}, returncode={return_code}')
        if tail_text:
            print('--- combined output tail ---')
            print(tail_text[-5000:])

    lines = ['All backend attempts failed:']
    for backend, code, tail_text in collected:
        last_line = tail_text.strip().splitlines()[-1] if tail_text.strip() else f'returncode={code}'
        lines.append(f'- {backend}: {last_line}')
    raise RuntimeError('\n'.join(lines))

# In replay mode the pipeline works directly on the Drive directories;
# otherwise it writes to the local working directories.
artifacts_root = ARTIFACTS_DRIVE if REPLAY_MODE else ARTIFACTS_LOCAL
deliverables_root = DELIVERABLES_DRIVE if REPLAY_MODE else DELIVERABLES_LOCAL
print('artifacts_root =', artifacts_root)
print('deliverables_root =', deliverables_root)
# Configured priority order first, then reduced to the backends that passed preflight.
backend_plan = _build_backend_chain(SUMMARIZE_BACKEND, SUMMARIZE_FALLBACK_BACKEND, SUMMARIZE_LAST_RESORT_BACKEND)
selected_chain = _select_chain_by_preflight(backend_plan, BACKEND_PREFLIGHT)
print('Backend priority plan =', backend_plan)
print('Backend chain after preflight =', selected_chain)

# Base command shared by every attempt; the backend-specific flags are appended
# per attempt by _run_pipeline_with_backend_chain. The module is run with -m,
# so it is resolved relative to the current working directory.
cmd = [
    sys.executable,
    '-m',
    'reasoning_nlp.pipeline_runner',
    '--audio-transcripts', str(AUDIO_TRANSCRIPTS),
    '--visual-captions', str(VISUAL_CAPTIONS),
    '--raw-video', str(RAW_VIDEO),
    '--stage', 'g8',
    '--run-id', RUN_ID,
    '--artifacts-root', str(artifacts_root),
    '--deliverables-root', str(deliverables_root),
    '--summarize-max-new-tokens', str(SUMMARIZE_MAX_NEW_TOKENS),
]
# Optional flags controlled by the configuration cell.
if QC_ENFORCE_THRESHOLDS:
    cmd.append('--qc-enforce-thresholds')
if REPLAY_MODE:
    cmd.append('--replay')

# Raises RuntimeError when every backend attempt fails, so the line below only
# prints after a successful run.
_run_pipeline_with_backend_chain(cmd, selected_chain)
print('Pipeline completed')


In [None]:
import shutil
from pathlib import Path

# Where this run's outputs were written (Drive in replay mode, local otherwise)
# and where they should end up on Drive. In replay mode each source directory
# is the same path as its Drive destination.
source_run_dir = (ARTIFACTS_DRIVE if REPLAY_MODE else ARTIFACTS_LOCAL) / RUN_ID
drive_run_dir = ARTIFACTS_DRIVE / RUN_ID
source_deliverable_dir = (DELIVERABLES_DRIVE if REPLAY_MODE else DELIVERABLES_LOCAL) / RUN_ID
drive_deliverable_dir = DELIVERABLES_DRIVE / RUN_ID

def _copy_item(src: Path, dst: Path) -> None:
    if not src.exists():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    if src.is_dir():
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    else:
        shutil.copy2(src, dst)

# Paths, relative to a run directory, of the artifacts that are copied to Drive.
# Anything not listed here stays only in the source run directory.
keep_rel_paths = [
    'run_meta.json',
    'g1_validate/normalized_input.json',
    'g2_align/alignment_result.json',
    'g3_context/context_blocks.json',
    'g4_summarize/parse_meta.json',
    'g4_summarize/summary_script.internal.json',
    'g5_segment/summary_script.json',
    'g5_segment/summary_video_manifest.json',
    'g6_manifest/manifest_validation.json',
    'g7_assemble/render_meta.json',
    'g7_assemble/summary_video.mp4',
    'g8_qc/quality_report.json',
]
# _copy_item skips sources that do not exist, so a missing artifact is not an error here.
for rel in keep_rel_paths:
    _copy_item(source_run_dir / rel, drive_run_dir / rel)
for rel in ['summary_video.mp4', 'summary_text.txt']:
    _copy_item(source_deliverable_dir / rel, drive_deliverable_dir / rel)

# Prune old run directories on Drive: keep the KEEP_LAST_RUNS most recently
# modified 'colab_rnlp_*' directories and never delete the current run.
# Only ARTIFACTS_DRIVE is pruned; DELIVERABLES_DRIVE run directories are left in place.
if CLEAN_OLD_RUNS and KEEP_LAST_RUNS > 0:
    all_runs = [p for p in ARTIFACTS_DRIVE.iterdir() if p.is_dir() and p.name.startswith('colab_rnlp_')]
    all_runs.sort(key=lambda x: x.stat().st_mtime, reverse=True)
    for old in all_runs[KEEP_LAST_RUNS:]:
        if old.name != RUN_ID:
            # Deletion errors are ignored (best-effort cleanup).
            shutil.rmtree(old, ignore_errors=True)

print('Synced balanced artifacts to Drive:', drive_run_dir)
print('Synced final deliverables to Drive:', drive_deliverable_dir)


In [None]:
import json
import re
from collections import Counter

# The diagnostics read the Drive copies of this run's outputs.
RUN_DIR = ARTIFACTS_DRIVE / RUN_ID
FINAL_DIR = DELIVERABLES_DRIVE / RUN_ID
OUTPUT_TEXT = FINAL_DIR / 'summary_text.txt'
INTERNAL_SUMMARY = RUN_DIR / 'g4_summarize' / 'summary_script.internal.json'
ALIGNMENT = RUN_DIR / 'g2_align' / 'alignment_result.json'
REPORT = RUN_DIR / 'g8_qc' / 'quality_report.json'

def _safe_json(path):
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text(encoding='utf-8'))
    except Exception:
        return None

def _split_sentences(text):
    chunks = [x.strip() for x in re.split(r'[\n\r]+|(?<=[.!?])\s+', text) if x.strip()]
    return chunks

def _asciiish_ratio(text):
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
    if not words:
        return 0.0
    asciiish = sum(1 for w in words if all(ord(c) < 128 for c in w))
    return asciiish / len(words)

def _garble_ratio(lines):
    if not lines:
        return 0.0
    bad = 0
    for line in lines:
        words = re.findall(r'\w+', line, flags=re.UNICODE)
        if len(words) < 4:
            continue
        short_ratio = sum(1 for w in words if len(w) <= 2) / max(1, len(words))
        digit_ratio = sum(ch.isdigit() for ch in line) / max(1, len(line))
        if short_ratio > 0.45 or digit_ratio > 0.35:
            bad += 1
    return bad / len(lines)

def _template_hits(text):
    templates = [
        'Noi dung cho thay dien bien theo thu tu thoi gian',
        'Thong diep duoc rut ra tu cac su kien da xuat hien trong video',
        'Khong du du lieu de tao tom tat chi tiet',
        'Khong du du lieu de tom tat chi tiet',
    ]
    lowered = text.lower()
    return [t for t in templates if t.lower() in lowered]

# --- Load this run's outputs; anything missing degrades to an empty value ---
summary_text = OUTPUT_TEXT.read_text(encoding='utf-8') if OUTPUT_TEXT.exists() else ''
# _safe_json can return any JSON type; only a dict supports the .get() lookups
# below, so a non-dict payload is treated like a missing file.
internal = _safe_json(INTERNAL_SUMMARY)
if not isinstance(internal, dict):
    internal = {}
alignment = _safe_json(ALIGNMENT) or {}
quality = _safe_json(REPORT) or {}
# With no summary text there is nothing to evaluate: every text-based penalty
# would be zero and the run would otherwise be reported as a perfect score.
summary_missing = not summary_text.strip()

# --- Text-based signals ---
lines = _split_sentences(summary_text)
generation_meta = internal.get('generation_meta')
backend = str(generation_meta.get('backend', 'unknown')) if isinstance(generation_meta, dict) else 'unknown'
plot = str(internal.get('plot_summary', ''))
moral = str(internal.get('moral_lesson', ''))
template_matches = _template_hits('\n'.join([plot, moral, summary_text]))
ascii_ratio = _asciiish_ratio(summary_text)
garble_ratio = _garble_ratio(lines)

# --- Alignment signal: share of blocks whose fallback_type is 'no_match' ---
blocks = alignment.get('blocks', []) if isinstance(alignment, dict) else []
fallback_counter = Counter(str(x.get('fallback_type', '')) for x in blocks if isinstance(x, dict))
no_match_rate = 0.0
if blocks:
    no_match_rate = fallback_counter.get('no_match', 0) / len(blocks)

# --- Score: start at 100 and subtract capped penalties per signal ---
score = 100
if backend == 'heuristic':
    score -= 35
score -= min(25, int(100 * no_match_rate * 0.4))
score -= min(20, int(100 * garble_ratio * 0.4))
# Only the part of the ASCII ratio above 0.55 is penalized.
score -= min(20, int(100 * max(0.0, ascii_ratio - 0.55) * 0.5))
score -= 10 * len(template_matches)
score = max(0, score)
if summary_missing:
    score = 0

severity = 'good' if score >= 80 else ('warning' if score >= 60 else 'poor')

print('--- Auto diagnose summary quality ---')
print('Run ID:', RUN_ID)
print('Backend:', backend)
print('Score:', score, f'({severity})')
print('No-match rate:', f'{no_match_rate:.3f}')
print('Garble ratio:', f'{garble_ratio:.3f}')
print('ASCII-ish ratio:', f'{ascii_ratio:.3f}')
print('Template hits:', template_matches if template_matches else 'none')
if isinstance(quality, dict):
    print('QC overall_status:', quality.get('overall_status', 'unknown'))

# --- Human-readable findings (messages are user-facing output) ---
issues = []
if summary_missing:
    issues.append('Khong tim thay noi dung summary_text.txt; khong the danh gia chat luong tom tat.')
if backend == 'heuristic':
    issues.append('Dang dung heuristic summary; noi dung de bi template hoa.')
if no_match_rate > 0.30:
    issues.append('Alignment no_match_rate cao, de gay ghiep sai thoai-canh.')
if garble_ratio > 0.25:
    issues.append('Nhieu cau co dau hieu nhieu ASR noise/vo nghia.')
if ascii_ratio > 0.65:
    issues.append('Ty le tu ASCII cao, co kha nang tron caption/thoai tieng Anh.')
if template_matches:
    issues.append('Phat hien cau mau co dinh trong tom tat.')

print('Detected issues:')
if issues:
    for idx, item in enumerate(issues, start=1):
        print(f'{idx}. {item}')
else:
    print('1. Khong phat hien dau hieu bat thuong ro rang theo rule hien tai.')


In [None]:
from pathlib import Path
from IPython.display import Video
import subprocess
import sys

# Validate and preview the Drive copies of this run's outputs.
RUN_DIR = ARTIFACTS_DRIVE / RUN_ID
FINAL_DIR = DELIVERABLES_DRIVE / RUN_ID
ALIGNMENT = RUN_DIR / 'g2_align' / 'alignment_result.json'
SCRIPT = RUN_DIR / 'g5_segment' / 'summary_script.json'
MANIFEST = RUN_DIR / 'g5_segment' / 'summary_video_manifest.json'
REPORT = RUN_DIR / 'g8_qc' / 'quality_report.json'

# Run the repository's artifact validator. The script and contracts paths are
# relative, so this relies on the working directory being the repository
# checkout (set by the clone cell). A non-zero exit raises CalledProcessError.
subprocess.check_call([
    sys.executable,
    'docs/Reasoning-NLP/schema/validate_artifacts.py',
    '--alignment', str(ALIGNMENT),
    '--script', str(SCRIPT),
    '--manifest', str(MANIFEST),
    '--report', str(REPORT),
    '--contracts-dir', 'contracts/v1/template',
])

OUTPUT_VIDEO = FINAL_DIR / 'summary_video.mp4'
OUTPUT_TEXT = FINAL_DIR / 'summary_text.txt'
print('Output video:', OUTPUT_VIDEO)
print('Output text:', OUTPUT_TEXT)
print('--- Summary text preview ---')
# read_text raises FileNotFoundError if the summary text was not synced to Drive.
print(OUTPUT_TEXT.read_text(encoding='utf-8'))
# Last expression of the cell: renders the video inline, embedded in the notebook output.
Video(str(OUTPUT_VIDEO), embed=True)
