# Full Pipeline (Module 1 -> 2 -> 3) on Colab

This notebook runs the complete system:
1. Module 1 Extraction
2. Module 2 Perception (ASR + captions)
3. Module 3 Reasoning NLP (G1->G8)


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read inputs from and write
# caches and outputs to paths under this mount point.
drive.mount('/content/drive')



In [None]:
import os
from pathlib import Path

# Root folder on the mounted Drive that holds every model cache configured below
# (presumably so downloads survive Colab runtime resets -- confirm).
DRIVE_CACHE_ROOT = Path('/content/drive/MyDrive/video-summary/model_cache')
HF_HOME_DIR = DRIVE_CACHE_ROOT / 'huggingface'
XDG_CACHE_DIR = DRIVE_CACHE_ROOT / 'xdg'
TORCH_HOME_DIR = DRIVE_CACHE_ROOT / 'torch'

# Create the cache folders up front so the variables below point at existing paths.
for path in [HF_HOME_DIR, XDG_CACHE_DIR, TORCH_HOME_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# Cache locations are assigned unconditionally: any value inherited from the
# environment is replaced.
os.environ['HF_HOME'] = str(HF_HOME_DIR)
os.environ['HF_HUB_CACHE'] = str(HF_HOME_DIR / 'hub')
os.environ['TRANSFORMERS_CACHE'] = str(HF_HOME_DIR / 'transformers')
os.environ['XDG_CACHE_HOME'] = str(XDG_CACHE_DIR)
os.environ['TORCH_HOME'] = str(TORCH_HOME_DIR)
# Tuning flags use setdefault, so a value already present in the environment wins.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')
# NOTE(review): VIDEO_SUMMARY_LOCAL_4BIT is only set and printed in this notebook;
# its consumer is not visible here.
os.environ.setdefault('VIDEO_SUMMARY_LOCAL_4BIT', '1')

# The hub/transformers sub-folders are derived paths, so create them as well.
Path(os.environ['HF_HUB_CACHE']).mkdir(parents=True, exist_ok=True)
Path(os.environ['TRANSFORMERS_CACHE']).mkdir(parents=True, exist_ok=True)

# Echo the effective configuration for the notebook log.
print('Model cache root:', DRIVE_CACHE_ROOT)
print('HF_HOME =', os.environ['HF_HOME'])
print('HF_HUB_CACHE =', os.environ['HF_HUB_CACHE'])
print('TRANSFORMERS_CACHE =', os.environ['TRANSFORMERS_CACHE'])
print('XDG_CACHE_HOME =', os.environ['XDG_CACHE_HOME'])
print('TORCH_HOME =', os.environ['TORCH_HOME'])


In [None]:
import importlib.util
import subprocess
import sys

def _build_backend_chain(primary: str, fallback: str, last_resort: str) -> list[str]:
    """Return the backends in priority order, without empty values or duplicates.

    Args:
        primary: Preferred backend name.
        fallback: Backend to try second.
        last_resort: Backend to try last.

    Returns:
        The non-empty names in the order given, each appearing once.
    """
    candidates = (primary, fallback, last_resort)
    # dict.fromkeys keeps the first occurrence of each name and preserves order.
    return list(dict.fromkeys(name for name in candidates if name))

def _select_chain_by_preflight(chain: list[str], preflight: dict) -> list[str]:
    """Keep only the backends from ``chain`` that the preflight result allows.

    ``'local'`` is kept when ``preflight['local_ready']`` is truthy and ``'api'``
    when ``preflight['api_ready']`` is truthy; a missing key counts as not ready.
    Any other backend name is kept unconditionally. Order is preserved.

    Args:
        chain: Backend names in priority order.
        preflight: Readiness flags keyed by ``local_ready`` / ``api_ready``.

    Returns:
        The filtered list of backend names.
    """
    readiness_key = {'local': 'local_ready', 'api': 'api_ready'}
    selected = []
    for backend in chain:
        key = readiness_key.get(backend)
        # Backends without a readiness flag are never filtered out.
        if key is None or preflight.get(key, False):
            selected.append(backend)
    return selected

def _run_pipeline_with_backend_chain(base_cmd: list[str], backend_chain: list[str]) -> None:
    """Run the pipeline command once per backend until one attempt exits with 0.

    For each backend, ``--summarize-backend <backend>`` is appended to a copy of
    ``base_cmd``; when another backend follows in the chain it is also passed as
    ``--summarize-fallback-backend``. The child's stdout and stderr are merged,
    echoed line by line, and the last ``max_tail_lines`` lines are kept so a
    failure can be categorised.

    Args:
        base_cmd: Command (argv list) shared by every attempt; it is not mutated.
        backend_chain: Backend names to try, in priority order.

    Raises:
        RuntimeError: If ``backend_chain`` is empty, or if every attempt exits
            with a non-zero return code (the message lists each backend with its
            last output line and failure category).
    """
    def _diagnose_failure(text: str) -> str:
        """Map captured output to a coarse failure category via substring checks."""
        lowered = (text or '').lower()
        if 'no space left on device' in lowered or 'insufficient free drive space' in lowered:
            return 'storage_full'
        if 'cuda out of memory' in lowered or 'cublas' in lowered:
            return 'gpu_oom'
        if 'connection' in lowered or 'timeout' in lowered or 'read timed out' in lowered:
            return 'download_or_network'
        if 'local backend requested but preflight failed' in lowered:
            return 'preflight_blocked'
        if 'response is not a valid json object' in lowered:
            return 'json_parse_failure'
        if '401' in lowered or '403' in lowered:
            return 'auth_or_access'
        return 'unknown'

    # Without this guard an empty chain would fall through to the generic
    # "All backend attempts failed:" error even though nothing was attempted.
    if not backend_chain:
        raise RuntimeError(
            'No summarize backend available: the backend chain is empty. '
            'Check the backend configuration and readiness.'
        )

    collected = []
    max_tail_lines = 400
    total = len(backend_chain)
    for idx, backend in enumerate(backend_chain):
        next_backend = backend_chain[idx + 1] if idx + 1 < total else ''
        cmd_try = list(base_cmd) + ['--summarize-backend', backend]
        if next_backend:
            cmd_try += ['--summarize-fallback-backend', next_backend]
        fallback_label = next_backend if next_backend else 'none'
        print(f'Attempt {idx+1}/{total} with backend={backend}, fallback={fallback_label}')

        tail_lines = []
        # The context manager closes the stdout pipe and reaps the child even if
        # echoing the output is interrupted.
        with subprocess.Popen(
            cmd_try,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        ) as proc:
            if proc.stdout is not None:
                for line in proc.stdout:
                    print(line, end='')
                    tail_lines.append(line)
                    if len(tail_lines) > max_tail_lines:
                        tail_lines = tail_lines[-max_tail_lines:]
            return_code = proc.wait()

        tail_text = ''.join(tail_lines)

        if return_code == 0:
            print(f'Pipeline success with backend={backend}')
            return

        collected.append((backend, return_code, tail_text))
        print('Diagnosed failure category =', _diagnose_failure(tail_text))
        print(f'Pipeline failed with backend={backend}, returncode={return_code}')
        if tail_text:
            print('--- combined output tail ---')
            print(tail_text[-5000:])

    lines = ['All backend attempts failed:']
    for backend, code, tail_text in collected:
        stripped = tail_text.strip()
        # Fall back to the return code when the child printed nothing.
        last_line = stripped.splitlines()[-1] if stripped else f'returncode={code}'
        lines.append(f'- {backend}: {last_line} [category={_diagnose_failure(tail_text)}]')
    raise RuntimeError('\n'.join(lines))
from pathlib import Path

def _ensure_ffmpeg():
    """Install ffmpeg with apt-get unless an ffmpeg executable is already available.

    ffmpeg counts as available when it is found on ``PATH`` or exists at
    ``/usr/bin/ffmpeg``.

    Raises:
        subprocess.CalledProcessError: If an apt-get command exits non-zero.
    """
    import shutil
    from pathlib import Path

    # Search PATH first so an ffmpeg installed outside /usr/bin is not reinstalled.
    if shutil.which('ffmpeg') or Path('/usr/bin/ffmpeg').exists():
        print('ffmpeg already installed')
        return
    subprocess.check_call(['apt-get', 'update', '-y'])
    subprocess.check_call(['apt-get', 'install', '-y', 'ffmpeg'])

def _ensure_packages():
    """Install any missing Python dependency, then upgrade the critical set.

    A dependency counts as missing when its import name cannot be located with
    ``importlib.util.find_spec``. The critical packages are always passed to
    ``pip install --upgrade``, whether or not anything was missing.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    # import name -> pip requirement
    module_to_requirement = {
        'scenedetect': 'scenedetect',
        'cv2': 'opencv-python-headless',
        'faster_whisper': 'faster-whisper',
        'transformers': 'transformers>=4.45.0',
        'accelerate': 'accelerate>=0.34.0',
        'sentencepiece': 'sentencepiece>=0.2.0',
        'bitsandbytes': 'bitsandbytes>=0.43.0',
        'PIL': 'pillow',
        'tqdm': 'tqdm',
        'jsonschema': 'jsonschema',
    }

    def _pip_install(*args):
        """Run a quiet ``pip install`` with the current interpreter."""
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *args])

    missing = []
    for module_name, requirement in module_to_requirement.items():
        if importlib.util.find_spec(module_name) is None:
            missing.append(requirement)
    if missing:
        _pip_install(*missing)

    _pip_install(
        '--upgrade',
        'transformers>=4.45.0',
        'accelerate>=0.34.0',
        'sentencepiece>=0.2.0',
        'safetensors>=0.4.3',
        'huggingface_hub>=0.24.0',
        'bitsandbytes>=0.43.0',
    )

def _print_runtime_versions():
    """Print ``name=version`` for each key runtime package.

    A package with no installed distribution is printed as ``name=<missing>``;
    any other metadata lookup failure is printed as ``name=<error: ExcType>``
    so it is not mistaken for a missing package.
    """
    # Imported here because importing importlib.util alone does not guarantee
    # that the importlib.metadata submodule is loaded.
    import importlib.metadata

    pkg_names = ['torch', 'transformers', 'accelerate', 'sentencepiece', 'huggingface_hub', 'safetensors']
    for name in pkg_names:
        try:
            ver = importlib.metadata.version(name)
        except importlib.metadata.PackageNotFoundError:
            print(f'{name}=<missing>')
        except Exception as exc:
            # Best-effort report: never abort the setup cell over a version probe.
            print(f'{name}=<error: {type(exc).__name__}>')
        else:
            print(f'{name}={ver}')

# Prepare the runtime: the ffmpeg binary first, then the Python packages, and
# finally print the installed versions for the notebook log.
_ensure_ffmpeg()
_ensure_packages()
_print_runtime_versions()



In [None]:
import os
import random
import shutil
import subprocess
import sys
from pathlib import Path

# Local checkout of the project; the branch can be overridden via VIDEO_SUMMARY_BRANCH.
REPO_DIR = Path('/content/video-summary')
BRANCH_NAME = os.environ.get('VIDEO_SUMMARY_BRANCH', '02-member-2-reasoning-nlp')

# First run: clone only the requested branch. Later runs: update the existing checkout.
if not REPO_DIR.exists():
    subprocess.check_call([
        'git', 'clone', '--single-branch', '--branch', BRANCH_NAME,
        'https://github.com/TCTri205/video-summary.git', str(REPO_DIR)
    ])
else:
    os.chdir(REPO_DIR)
    subprocess.check_call(['git', 'fetch', 'origin'])
    subprocess.check_call(['git', 'checkout', BRANCH_NAME])
    subprocess.check_call(['git', 'pull', 'origin', BRANCH_NAME])

# Work from the repo root: later cells import project packages and use
# repo-relative paths.
os.chdir(REPO_DIR)

# Locations on the mounted Drive.
DRIVE_ROOT = Path('/content/drive/MyDrive/video-summary')
INPUT_VIDEO_DRIVE = DRIVE_ROOT / 'input' / 'raw_video.mp4'
PROCESSED_DRIVE = DRIVE_ROOT / 'processed'

# Working copies on the Colab VM disk; the local input keeps the Drive file name.
LOCAL_ROOT = Path('/content/video-summary-work')
LOCAL_INPUT_DIR = LOCAL_ROOT / 'input'
LOCAL_PROCESSED = LOCAL_ROOT / 'processed'
LOCAL_INPUT_VIDEO = LOCAL_INPUT_DIR / INPUT_VIDEO_DRIVE.name

for path in [DRIVE_ROOT, PROCESSED_DRIVE, LOCAL_INPUT_DIR, LOCAL_PROCESSED]:
    path.mkdir(parents=True, exist_ok=True)

# Input selection policy (default: fixed, which makes debugging and comparison easier).
SOURCE_VIDEO_DIR = Path('/content/drive/MyDrive/Final-Project-AI-Sgroup/Video')
INPUT_SELECTION_MODE = os.environ.get('VIDEO_SUMMARY_INPUT_MODE', 'fixed').strip().lower()  # fixed|first|random
# In fixed mode the source defaults to the pipeline input file itself.
FIXED_SOURCE_VIDEO = Path(os.environ.get('VIDEO_SUMMARY_FIXED_SOURCE', str(INPUT_VIDEO_DRIVE)))

def _video_files(folder: Path):
    """Recursively list the video files under ``folder``.

    A file qualifies when its suffix, compared case-insensitively, is one of
    the known video extensions.

    Args:
        folder: Directory to scan.

    Returns:
        A list of matching paths, or an empty list if ``folder`` does not exist.
    """
    if not folder.exists():
        return []
    video_suffixes = {'.mp4', '.mov', '.mkv', '.avi', '.webm', '.m4v'}
    found = []
    for entry in folder.rglob('*'):
        if entry.is_file() and entry.suffix.lower() in video_suffixes:
            found.append(entry)
    return found

# Choose the source video according to INPUT_SELECTION_MODE:
#   fixed  -> FIXED_SOURCE_VIDEO, which must already exist
#   first  -> first path (sorted order) found under SOURCE_VIDEO_DIR
#   random -> a random path found under SOURCE_VIDEO_DIR
selected = None
if INPUT_SELECTION_MODE == 'fixed':
    selected = FIXED_SOURCE_VIDEO
    if not selected.exists():
        raise FileNotFoundError(
            f'Khong tim thay fixed source video: {selected}. '
            'Dat VIDEO_SUMMARY_FIXED_SOURCE hoac doi INPUT_SELECTION_MODE sang first/random.'
        )
elif INPUT_SELECTION_MODE in {'first', 'random'}:
    # Sorting makes 'first' deterministic across runs.
    candidates = sorted(_video_files(SOURCE_VIDEO_DIR))
    if not candidates:
        raise FileNotFoundError(
            f'Khong tim thay video trong thu muc: {SOURCE_VIDEO_DIR}. '
            'Hay kiem tra lai duong dan va upload it nhat 1 file video hop le.'
        )
    selected = candidates[0] if INPUT_SELECTION_MODE == 'first' else random.choice(candidates)
else:
    raise ValueError(f'INPUT_SELECTION_MODE khong hop le: {INPUT_SELECTION_MODE}')
INPUT_VIDEO_DRIVE.parent.mkdir(parents=True, exist_ok=True)
# Stage the selection as the canonical pipeline input on Drive, unless the
# selection already is that file.
if selected.resolve() != INPUT_VIDEO_DRIVE.resolve():
    shutil.copy2(selected, INPUT_VIDEO_DRIVE)
print(f'Input selection mode: {INPUT_SELECTION_MODE}')
print(f'Selected source video: {selected}')
# NOTE(review): this line is printed even when the copy above was skipped.
print(f'Copied to pipeline input: {INPUT_VIDEO_DRIVE}')

# Refresh the local working copy when it is missing or its size differs from
# the Drive file (size is the only staleness check performed here).
if (not LOCAL_INPUT_VIDEO.exists()) or (LOCAL_INPUT_VIDEO.stat().st_size != INPUT_VIDEO_DRIVE.stat().st_size):
    shutil.copy2(INPUT_VIDEO_DRIVE, LOCAL_INPUT_VIDEO)

VIDEO_PATH = str(LOCAL_INPUT_VIDEO)
OUTPUT_ROOT = str(LOCAL_PROCESSED)
# The file stem names the per-video output folder used by later cells.
VIDEO_NAME = Path(VIDEO_PATH).stem
print('VIDEO_NAME =', VIDEO_NAME)


In [None]:
from datetime import datetime
import importlib.metadata
import importlib.util

# Runtime config
# REPLAY_MODE=True makes later cells skip Modules 1-2 and use Drive-side
# artifact folders for the reasoning run.
REPLAY_MODE = False
CAPTION_BATCH_SIZE = 4
SUMMARIZE_MAX_NEW_TOKENS = 384
# The remaining settings can be overridden through VIDEO_SUMMARY_* environment
# variables; boolean flags are enabled only by the exact value '1'.
SUMMARIZE_PROMPT_MAX_CHARS = int(os.environ.get('VIDEO_SUMMARY_SUMMARIZE_PROMPT_MAX_CHARS', '9000'))
RUN_LOCAL_SMOKE_TEST = os.environ.get('VIDEO_SUMMARY_RUN_LOCAL_SMOKE_TEST', '0').strip() == '1'
SUMMARIZE_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_BACKEND', 'local').strip().lower()
SUMMARIZE_FALLBACK_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_FALLBACK_BACKEND', 'api').strip().lower()
SUMMARIZE_LAST_RESORT_BACKEND = os.environ.get('VIDEO_SUMMARY_SUMMARIZE_LAST_RESORT_BACKEND', 'api').strip().lower()
LOCAL_MODEL_VERSION = os.environ.get('VIDEO_SUMMARY_LOCAL_MODEL_VERSION', 'Qwen/Qwen2.5-3B-Instruct').strip()
QC_ENFORCE_THRESHOLDS = os.environ.get('VIDEO_SUMMARY_QC_ENFORCE_THRESHOLDS', '1').strip() == '1'
# Fail fast on a mistyped backend name instead of failing inside the pipeline run.
_valid_backends = {'api', 'local'}
if (
    SUMMARIZE_BACKEND not in _valid_backends
    or SUMMARIZE_FALLBACK_BACKEND not in _valid_backends
    or SUMMARIZE_LAST_RESORT_BACKEND not in _valid_backends
):
    raise ValueError('Invalid summarize backend. Supported: api, local')
if not LOCAL_MODEL_VERSION:
    raise ValueError('VIDEO_SUMMARY_LOCAL_MODEL_VERSION must not be empty')

def _preflight_backend_support() -> dict:
    """Probe whether the local and API summarize backends look usable.

    Local backend: torch, transformers, accelerate, sentencepiece and
    bitsandbytes must all be importable, CUDA must be available, and the
    tokenizer for ``LOCAL_MODEL_VERSION`` must load. CUDA is checked before
    the tokenizer so a CPU-only runtime does not trigger a tokenizer download.

    API backend: both ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY`` must be set to
    non-blank values.

    Side effect: when ``OPENAI_MODEL`` is unset or blank, it is set to
    ``LOCAL_MODEL_VERSION``.

    Returns:
        A dict with the keys ``local_ready``, ``api_ready``, ``has_api_base``,
        ``has_api_key`` and ``local_probe_error`` (empty string when the local
        probe succeeded).
    """
    local_deps = ('torch', 'transformers', 'accelerate', 'sentencepiece', 'bitsandbytes')
    missing_deps = [name for name in local_deps if importlib.util.find_spec(name) is None]

    has_api_base = bool(os.environ.get('OPENAI_BASE_URL', '').strip())
    has_api_key = bool(os.environ.get('OPENAI_API_KEY', '').strip())
    api_ready = has_api_base and has_api_key

    local_ready = not missing_deps
    local_probe_error = ''
    if missing_deps:
        # Name the packages that are actually missing rather than the full list.
        local_probe_error = 'missing deps: ' + '/'.join(missing_deps)
    else:
        try:
            import torch
            if not torch.cuda.is_available():
                local_ready = False
                local_probe_error = 'CUDA not available on current runtime'
            else:
                from transformers import AutoTokenizer
                # Loading the tokenizer verifies that the model repo is reachable.
                AutoTokenizer.from_pretrained(LOCAL_MODEL_VERSION)
        except Exception as exc:
            # Any import or download failure means the local backend cannot be used.
            local_ready = False
            local_probe_error = str(exc)

    if not os.environ.get('OPENAI_MODEL', '').strip():
        os.environ['OPENAI_MODEL'] = LOCAL_MODEL_VERSION
    return {
        'local_ready': local_ready,
        'api_ready': api_ready,
        'has_api_base': has_api_base,
        'has_api_key': has_api_key,
        'local_probe_error': local_probe_error,
    }

# Probe backend readiness once; later cells reuse this result.
BACKEND_PREFLIGHT = _preflight_backend_support()

RAW_VIDEO_LOCAL = str(LOCAL_INPUT_VIDEO)
RAW_VIDEO_DRIVE = str(INPUT_VIDEO_DRIVE)

# Artifact and deliverable folders exist both on the VM disk and on Drive.
LOCAL_ARTIFACTS = LOCAL_PROCESSED / 'artifacts'
ARTIFACTS_DRIVE = DRIVE_ROOT / 'artifacts'
LOCAL_DELIVERABLES = LOCAL_PROCESSED / 'deliverables'
DELIVERABLES_DRIVE = DRIVE_ROOT / 'deliverables'
LOCAL_ARTIFACTS.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DRIVE.mkdir(parents=True, exist_ok=True)
LOCAL_DELIVERABLES.mkdir(parents=True, exist_ok=True)
DELIVERABLES_DRIVE.mkdir(parents=True, exist_ok=True)

# Timestamp-based run id; the 'colab_full_' prefix is what the old-run cleanup
# in the sync cell matches on.
RUN_ID = datetime.now().strftime('colab_full_%Y%m%d_%H%M%S')

# Cleanup switches consumed by the Drive sync cell.
CLEAN_HEAVY_EXTRACTION = False
CLEAN_OLD_RUNS = True
KEEP_LAST_RUNS = 3

print('REPLAY_MODE =', REPLAY_MODE)
print('RUN_ID =', RUN_ID)
print('SUMMARIZE_BACKEND =', SUMMARIZE_BACKEND)
print('SUMMARIZE_FALLBACK_BACKEND =', SUMMARIZE_FALLBACK_BACKEND)
print('SUMMARIZE_LAST_RESORT_BACKEND =', SUMMARIZE_LAST_RESORT_BACKEND)
print('LOCAL_MODEL_VERSION =', LOCAL_MODEL_VERSION)
print('SUMMARIZE_PROMPT_MAX_CHARS =', SUMMARIZE_PROMPT_MAX_CHARS)
print('RUN_LOCAL_SMOKE_TEST =', RUN_LOCAL_SMOKE_TEST)
print('VIDEO_SUMMARY_LOCAL_4BIT =', os.environ.get('VIDEO_SUMMARY_LOCAL_4BIT', '1'))
print('BACKEND_PREFLIGHT =', BACKEND_PREFLIGHT)
# A primary backend of 'local' is a hard requirement: stop here instead of
# silently continuing with a fallback.
if SUMMARIZE_BACKEND == 'local' and not BACKEND_PREFLIGHT.get('local_ready', False):
    raise RuntimeError(f"Local backend requested but preflight failed: {BACKEND_PREFLIGHT.get('local_probe_error', 'unknown')}")
if SUMMARIZE_BACKEND == 'local':
    import shutil
    # The model cache lives on Drive, so free Drive space is what matters here.
    usage = shutil.disk_usage('/content/drive')
    free_gb = usage.free / (1024**3)
    print('Drive free space (GB)=', round(free_gb, 2))
    try:
        import torch
        if torch.cuda.is_available():
            free_vram, total_vram = torch.cuda.mem_get_info()
            print('GPU VRAM free/total (GB)=', round(free_vram / (1024**3), 2), '/', round(total_vram / (1024**3), 2))
    except Exception as _exc:
        # The VRAM figure is informational only; never fail the cell because of it.
        print('VRAM probe skipped:', _exc)
    if free_gb < 15:
        raise RuntimeError('Insufficient free Drive space for local Qwen cache (need >= 15GB).')
print('QC_ENFORCE_THRESHOLDS =', QC_ENFORCE_THRESHOLDS)


In [None]:
import importlib.metadata
import shutil
import torch
from pathlib import Path

def _quick_diagnose_status() -> None:
    """Print a status table (backend, readiness, disk, cache, CUDA) and package versions.

    Reads the notebook globals ``SUMMARIZE_BACKEND``, ``LOCAL_MODEL_VERSION``
    and ``BACKEND_PREFLIGHT``. The Hugging Face cache size is reported as
    ``unknown`` when walking the cache directory fails.
    """
    gib = 1024**3
    free_gb = shutil.disk_usage('/content/drive').free / gib

    hf_cache = Path(os.environ.get('HF_HUB_CACHE', '/content/.cache/huggingface/hub'))
    cache_size_gb = 0.0
    try:
        if hf_cache.exists():
            total_bytes = 0
            for entry in hf_cache.rglob('*'):
                if entry.is_file():
                    total_bytes += entry.stat().st_size
            cache_size_gb = total_bytes / gib
    except Exception:
        # -1.0 is the sentinel rendered as 'unknown' below.
        cache_size_gb = -1.0

    def _version_of(pkg):
        """Return the installed version of ``pkg`` or '<missing>' on any failure."""
        try:
            return importlib.metadata.version(pkg)
        except Exception:
            return '<missing>'

    tracked = ['torch', 'transformers', 'accelerate', 'sentencepiece', 'huggingface_hub', 'safetensors', 'bitsandbytes']
    versions = {pkg: _version_of(pkg) for pkg in tracked}

    if cache_size_gb >= 0:
        cache_size_label = f'{cache_size_gb:.2f}'
    else:
        cache_size_label = 'unknown'

    rows = [
        ('backend', SUMMARIZE_BACKEND),
        ('local_model', LOCAL_MODEL_VERSION),
        ('local_ready', str(BACKEND_PREFLIGHT.get('local_ready'))),
        ('local_probe_error', str(BACKEND_PREFLIGHT.get('local_probe_error', '')) or '-'),
        ('api_ready', str(BACKEND_PREFLIGHT.get('api_ready'))),
        ('drive_free_gb', f'{free_gb:.2f}'),
        ('hf_cache_exists', str(hf_cache.exists())),
        ('hf_cache_size_gb', cache_size_label),
        ('torch_cuda', str(torch.cuda.is_available())),
    ]

    print('--- Quick Diagnose ---')
    for label, value in rows:
        print(f'{label:>18}: {value}')
    print('--- Runtime Versions ---')
    for pkg_name, pkg_version in versions.items():
        print(f'{pkg_name:>18}: {pkg_version}')

# Print the status table once, before any heavy stage starts.
_quick_diagnose_status()


In [None]:
import torch

def _run_local_qwen_smoke_test(model_name: str) -> None:
    """Run one short completion through the project's local transformers client.

    Does nothing except print a notice when ``SUMMARIZE_BACKEND`` is not
    ``'local'``. Otherwise prints CUDA/GPU information, then the latency, token
    count and payload keys returned by the completion call.

    Args:
        model_name: Model identifier passed to the local completion helper.
    """
    if SUMMARIZE_BACKEND != 'local':
        print('Skip local smoke test because summarize backend is not local')
        return

    print('--- Local Qwen smoke test ---')
    cuda_ok = torch.cuda.is_available()
    print('CUDA available =', cuda_ok)
    if cuda_ok:
        print('GPU =', torch.cuda.get_device_name(0))

    # Imported lazily so the project module is only loaded when the test runs.
    from reasoning_nlp.summarizer.llm_client import _local_transformers_completion

    completion = _local_transformers_completion(
        prompt='[00:00:01.000] Nhan vat mo cua buoc vao phong va noi chuyen ngan gon voi ban cua minh.',
        model_name=model_name,
        timeout_ms=30000,
        max_new_tokens=160,
        temperature=0.0,
        do_sample=False,
    )
    payload, latency_ms, token_count = completion
    print('Smoke test latency_ms =', latency_ms)
    print('Smoke test token_count =', token_count)
    print('Smoke test keys =', sorted(payload.keys()))

# The smoke test is opt-in (VIDEO_SUMMARY_RUN_LOCAL_SMOKE_TEST=1).
if RUN_LOCAL_SMOKE_TEST:
    _run_local_qwen_smoke_test(LOCAL_MODEL_VERSION)
else:
    print('Skip local smoke test by default to avoid duplicate GPU residency before subprocess run')

from extraction_perception.extraction.extraction import VideoPreprocessor

# Module 1: scene detection, audio extraction and keyframe/metadata export,
# written under LOCAL_PROCESSED. Skipped entirely in replay mode.
if REPLAY_MODE:
    print('Replay mode enabled: skip Module 1 extraction')
else:
    processor = VideoPreprocessor(video_path=RAW_VIDEO_LOCAL, output_root=str(LOCAL_PROCESSED))
    timestamps = processor.detect_scenes()
    audio_path = processor.extract_audio()
    metadata = processor.extract_keyframes_and_metadata(timestamps)
    print('Scenes:', len(timestamps))
    print('Audio:', audio_path)
    # NOTE(review): assumes metadata is a dict that may carry 'total_keyframes' -- confirm.
    print('Keyframes:', metadata.get('total_keyframes', 0))



## Step 1 - Module 1 Extraction

In [None]:
import gc
import torch
from extraction_perception.extraction.whisper_module import WhisperExtractor
from extraction_perception.perception.caption import VisualCaptioner

# Module 1 outputs for this video; the perception stage reads the audio and the
# scene metadata from here and writes the captions next to them.
LOCAL_EXTRACTION_DIR = LOCAL_PROCESSED / VIDEO_NAME / 'extraction'
AUDIO_PATH = LOCAL_EXTRACTION_DIR / 'audio' / 'audio_16k.wav'
METADATA_PATH = LOCAL_EXTRACTION_DIR / 'scene_metadata.json'
CAPTIONS_PATH = LOCAL_EXTRACTION_DIR / 'visual_captions.json'

if REPLAY_MODE:
    print('Replay mode enabled: skip Module 2 perception')
else:
    # float16 on GPU, int8 on CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    compute_type = 'float16' if device == 'cuda' else 'int8'

    # ASR over the extracted audio; the language is fixed to Vietnamese ('vi').
    asr = WhisperExtractor(model_size='base', device=device, compute_type=compute_type)
    asr.transcribe(
        input_path=str(AUDIO_PATH),
        language='vi',
        output_root=str(LOCAL_PROCESSED),
        output_name=VIDEO_NAME,
    )

    # Caption the keyframes referenced by the scene metadata.
    captioner = VisualCaptioner()
    captioner.caption_from_metadata(
        metadata_path=str(METADATA_PATH),
        output_path=str(CAPTIONS_PATH),
        batch_size=CAPTION_BATCH_SIZE,
    )

    # Drop both models and release cached GPU memory before the reasoning stage.
    del asr
    del captioner
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


## Step 2 - Module 2 Perception

In [None]:
import shutil

# Drive-side mirror of the extraction folder for this video.
DRIVE_EXTRACTION_DIR = PROCESSED_DRIVE / VIDEO_NAME / 'extraction'
DRIVE_EXTRACTION_DIR.mkdir(parents=True, exist_ok=True)

# After a fresh run, copy the JSON outputs to Drive; in replay mode the files
# already on Drive are used as they are.
if not REPLAY_MODE:
    # (path relative to the local extraction dir, path relative to the Drive extraction dir)
    sync_to_drive = [
        ('scene_metadata.json', 'scene_metadata.json'),
        ('audio_transcripts.json', 'audio_transcripts.json'),
        ('visual_captions.json', 'visual_captions.json'),
    ]
    for src_rel, dst_rel in sync_to_drive:
        src = LOCAL_EXTRACTION_DIR / src_rel
        dst = DRIVE_EXTRACTION_DIR / dst_rel
        # A missing source is skipped here; the check below reports what is required.
        if src.exists():
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst)

# The reasoning stage always reads its inputs from Drive.
AUDIO_TRANSCRIPTS = DRIVE_EXTRACTION_DIR / 'audio_transcripts.json'
VISUAL_CAPTIONS = DRIVE_EXTRACTION_DIR / 'visual_captions.json'

if not AUDIO_TRANSCRIPTS.exists() or not VISUAL_CAPTIONS.exists():
    raise FileNotFoundError('Missing perception outputs on Drive for reasoning stage')

print('Drive transcripts/captions ready')



## Step 3 - Module 3 Reasoning (G1->G8)

In [None]:
import subprocess
import sys

# Replay mode points the runner at the Drive folders; a fresh run uses the
# local folders, which a later cell syncs to Drive.
artifacts_root = ARTIFACTS_DRIVE if REPLAY_MODE else LOCAL_ARTIFACTS
deliverables_root = DELIVERABLES_DRIVE if REPLAY_MODE else LOCAL_DELIVERABLES
print('artifacts_root =', artifacts_root)
print('deliverables_root =', deliverables_root)
# Ordered, de-duplicated backend plan, then reduced to the backends that
# passed the preflight probe.
backend_plan = _build_backend_chain(SUMMARIZE_BACKEND, SUMMARIZE_FALLBACK_BACKEND, SUMMARIZE_LAST_RESORT_BACKEND)
selected_chain = _select_chain_by_preflight(backend_plan, BACKEND_PREFLIGHT)
print('Backend priority plan =', backend_plan)
print('Backend chain after preflight =', selected_chain)

# Base command; the backend flags are appended per attempt by
# _run_pipeline_with_backend_chain.
# NOTE(review): --raw-video receives the Drive path even though a local copy
# exists -- confirm this is intended.
cmd = [
    sys.executable,
    '-m',
    'reasoning_nlp.pipeline_runner',
    '--audio-transcripts', str(AUDIO_TRANSCRIPTS),
    '--visual-captions', str(VISUAL_CAPTIONS),
    '--raw-video', RAW_VIDEO_DRIVE,
    '--stage', 'g8',
    '--run-id', RUN_ID,
    '--artifacts-root', str(artifacts_root),
    '--deliverables-root', str(deliverables_root),
    '--model-version', LOCAL_MODEL_VERSION,
    '--summarize-max-new-tokens', str(SUMMARIZE_MAX_NEW_TOKENS),
    '--summarize-prompt-max-chars', str(SUMMARIZE_PROMPT_MAX_CHARS),
]
if QC_ENFORCE_THRESHOLDS:
    cmd.append('--qc-enforce-thresholds')
if REPLAY_MODE:
    cmd.append('--replay')

# Raises RuntimeError when every backend attempt fails.
_run_pipeline_with_backend_chain(cmd, selected_chain)
print('Reasoning pipeline completed')


In [None]:
import shutil
from pathlib import Path

# Where this run's outputs were written (Drive in replay mode, local disk
# otherwise) and where they must end up on Drive.
source_run_dir = (ARTIFACTS_DRIVE if REPLAY_MODE else LOCAL_ARTIFACTS) / RUN_ID
drive_run_dir = ARTIFACTS_DRIVE / RUN_ID
source_deliverable_dir = (DELIVERABLES_DRIVE if REPLAY_MODE else LOCAL_DELIVERABLES) / RUN_ID
drive_deliverable_dir = DELIVERABLES_DRIVE / RUN_ID

def _copy_item(src: Path, dst: Path) -> None:
    """Copy a file or directory from ``src`` to ``dst``.

    Does nothing when ``src`` does not exist. Missing parent folders of ``dst``
    are created. An existing destination directory is removed first, so the
    copy is an exact replica of the source tree.

    Args:
        src: Source file or directory.
        dst: Destination path.
    """
    if not src.exists():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    if not src.is_dir():
        shutil.copy2(src, dst)
        return
    # Replace any previous copy so stale files do not linger in the destination.
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

# Only these per-stage files are mirrored to Drive; everything else stays in
# the source run directory.
keep_rel_paths = [
    'run_meta.json',
    'g1_validate/normalized_input.json',
    'g2_align/alignment_result.json',
    'g3_context/context_blocks.json',
    'g4_summarize/parse_meta.json',
    'g4_summarize/summary_script.internal.json',
    'g5_segment/summary_script.json',
    'g5_segment/summary_video_manifest.json',
    'g6_manifest/manifest_validation.json',
    'g7_assemble/render_meta.json',
    'g7_assemble/summary_video.mp4',
    'g8_qc/quality_report.json',
]
# _copy_item skips sources that do not exist, so a partial run still syncs
# whatever was produced.
for rel in keep_rel_paths:
    _copy_item(source_run_dir / rel, drive_run_dir / rel)
for rel in ['summary_video.mp4', 'summary_text.txt']:
    _copy_item(source_deliverable_dir / rel, drive_deliverable_dir / rel)

# Optional: delete the large extraction by-products (keyframes, 16 kHz audio) from Drive.
if CLEAN_HEAVY_EXTRACTION and DRIVE_EXTRACTION_DIR.exists():
    heavy_items = [
        DRIVE_EXTRACTION_DIR / 'keyframes',
        DRIVE_EXTRACTION_DIR / 'audio' / 'audio_16k.wav',
    ]
    for item in heavy_items:
        if item.is_dir():
            shutil.rmtree(item, ignore_errors=True)
        elif item.exists():
            item.unlink()

# Optional: keep only the KEEP_LAST_RUNS most recently modified 'colab_full_*'
# run folders on Drive; the current run is never deleted.
if CLEAN_OLD_RUNS and KEEP_LAST_RUNS > 0:
    all_runs = [p for p in ARTIFACTS_DRIVE.iterdir() if p.is_dir() and p.name.startswith('colab_full_')]
    all_runs.sort(key=lambda x: x.stat().st_mtime, reverse=True)
    for old in all_runs[KEEP_LAST_RUNS:]:
        if old.name != RUN_ID:
            shutil.rmtree(old, ignore_errors=True)

print('Synced balanced artifacts to Drive:', drive_run_dir)
print('Synced final deliverables to Drive:', drive_deliverable_dir)



In [None]:
import json
import re
from collections import Counter

# Drive-side files of the current run that the quality diagnosis below inspects.
RUN_DIR = ARTIFACTS_DRIVE / RUN_ID
FINAL_DIR = DELIVERABLES_DRIVE / RUN_ID
OUTPUT_TEXT = FINAL_DIR / 'summary_text.txt'
INTERNAL_SUMMARY = RUN_DIR / 'g4_summarize' / 'summary_script.internal.json'
ALIGNMENT = RUN_DIR / 'g2_align' / 'alignment_result.json'
REPORT = RUN_DIR / 'g8_qc' / 'quality_report.json'

def _safe_json(path):
    """Load JSON from ``path``; return None if the file is missing or cannot be parsed.

    Args:
        path: A ``pathlib.Path``-like object pointing at a UTF-8 JSON file.

    Returns:
        The decoded JSON value, or None on a missing file or any read/parse error.
    """
    if not path.exists():
        return None
    try:
        raw_text = path.read_text(encoding='utf-8')
        parsed = json.loads(raw_text)
    except Exception:
        # Best-effort loader: callers treat None as "no data available".
        parsed = None
    return parsed

def _split_sentences(text):
    """Split ``text`` into trimmed, non-empty sentence-like chunks.

    A split happens at every run of line breaks and at whitespace that follows
    ``.``, ``!`` or ``?``.

    Args:
        text: Text to split.

    Returns:
        A list of chunks in their original order.
    """
    pieces = re.split(r'[\n\r]+|(?<=[.!?])\s+', text)
    sentences = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def _asciiish_ratio(text):
    """Return the share of alphabetic words (3+ letters) written only in ASCII letters.

    Words are runs of at least three letters of any script, delimited by word
    boundaries. A word containing an accented or other non-ASCII letter counts
    as non-ASCII.

    Args:
        text: Text to analyse.

    Returns:
        A float in [0.0, 1.0]; 0.0 when the text has no word of 3+ letters.
    """
    import unicodedata

    # Compose base letter + combining mark into one letter, so decomposed input
    # is not split into fragments at the combining mark.
    normalized = unicodedata.normalize('NFC', text)
    # [^\W\d_] matches a letter of any script. Matching only [a-zA-Z] here would
    # make every counted word ASCII and pin the ratio at 1.0.
    words = re.findall(r'\b[^\W\d_]{3,}\b', normalized)
    if not words:
        return 0.0
    ascii_words = sum(1 for word in words if word.isascii())
    return ascii_words / len(words)

def _garble_ratio(lines):
    """Return the fraction of ``lines`` that look garbled.

    A line with at least four word tokens is flagged when more than 45% of its
    tokens are one or two characters long, or when more than 35% of its
    characters are digits. Shorter lines are never flagged but still count in
    the denominator.

    Args:
        lines: Sequence of text lines.

    Returns:
        A float in [0.0, 1.0]; 0.0 for an empty sequence.
    """
    if not lines:
        return 0.0

    def _looks_garbled(line):
        """Apply the short-token and digit-density heuristics to one line."""
        tokens = re.findall(r'\w+', line, flags=re.UNICODE)
        if len(tokens) < 4:
            return False
        short_share = sum(1 for tok in tokens if len(tok) <= 2) / max(1, len(tokens))
        digit_share = sum(ch.isdigit() for ch in line) / max(1, len(line))
        return short_share > 0.45 or digit_share > 0.35

    flagged = sum(1 for line in lines if _looks_garbled(line))
    return flagged / len(lines)

def _template_hits(text):
    """Return the boilerplate phrases that occur in ``text``, ignoring case.

    Args:
        text: Text to scan.

    Returns:
        The matching phrases with their original capitalisation, in the order
        they are listed here.
    """
    known_phrases = (
        'Noi dung cho thay dien bien theo thu tu thoi gian',
        'Thong diep duoc rut ra tu cac su kien da xuat hien trong video',
        'Khong du du lieu de tao tom tat chi tiet',
        'Khong du du lieu de tom tat chi tiet',
    )
    haystack = text.lower()
    hits = []
    for phrase in known_phrases:
        if phrase.lower() in haystack:
            hits.append(phrase)
    return hits

# Load whatever is available; a missing or unreadable file degrades to an
# empty value instead of failing the cell.
summary_text = OUTPUT_TEXT.read_text(encoding='utf-8') if OUTPUT_TEXT.exists() else ''
internal = _safe_json(INTERNAL_SUMMARY) or {}
alignment = _safe_json(ALIGNMENT) or {}
quality = _safe_json(REPORT) or {}

# Signals derived from the summary text and the internal summary script.
lines = _split_sentences(summary_text)
backend = str((internal.get('generation_meta') or {}).get('backend', 'unknown'))
plot = str(internal.get('plot_summary', ''))
moral = str(internal.get('moral_lesson', ''))
template_matches = _template_hits('\n'.join([plot, moral, summary_text]))
ascii_ratio = _asciiish_ratio(summary_text)
garble_ratio = _garble_ratio(lines)

# Share of alignment blocks whose fallback_type is 'no_match'.
blocks = alignment.get('blocks', []) if isinstance(alignment, dict) else []
fallback_counter = Counter(str(x.get('fallback_type', '')) for x in blocks if isinstance(x, dict))
no_match_rate = 0.0
if blocks:
    no_match_rate = fallback_counter.get('no_match', 0) / len(blocks)

# Heuristic score: start at 100 and subtract capped penalties.
#   heuristic backend -> -35
#   no-match rate     -> up to -25
#   garble ratio      -> up to -20
#   ASCII ratio > 0.55 -> up to -20
#   each template hit -> -10
score = 100
if backend == 'heuristic':
    score -= 35
score -= min(25, int(100 * no_match_rate * 0.4))
score -= min(20, int(100 * garble_ratio * 0.4))
score -= min(20, int(100 * max(0.0, ascii_ratio - 0.55) * 0.5))
score -= 10 * len(template_matches)
score = max(0, score)

# >= 80 is 'good', 60-79 is 'warning', below 60 is 'poor'.
severity = 'good' if score >= 80 else ('warning' if score >= 60 else 'poor')

print('--- Auto diagnose summary quality ---')
print('Run ID:', RUN_ID)
print('Backend:', backend)
print('Score:', score, f'({severity})')
print('No-match rate:', f'{no_match_rate:.3f}')
print('Garble ratio:', f'{garble_ratio:.3f}')
print('ASCII-ish ratio:', f'{ascii_ratio:.3f}')
print('Template hits:', template_matches if template_matches else 'none')
if isinstance(quality, dict):
    print('QC overall_status:', quality.get('overall_status', 'unknown'))

# Turn the signals into human-readable findings (messages are in Vietnamese).
issues = []
if backend == 'heuristic':
    issues.append('Dang dung heuristic summary; noi dung de bi template hoa.')
if no_match_rate > 0.30:
    issues.append('Alignment no_match_rate cao, de gay ghiep sai thoai-canh.')
if garble_ratio > 0.25:
    issues.append('Nhieu cau co dau hieu nhieu ASR noise/vo nghia.')
if ascii_ratio > 0.65:
    issues.append('Ty le tu ASCII cao, co kha nang tron caption/thoai tieng Anh.')
if template_matches:
    issues.append('Phat hien cau mau co dinh trong tom tat.')

print('Detected issues:')
if issues:
    for idx, item in enumerate(issues, start=1):
        print(f'{idx}. {item}')
else:
    print('1. Khong phat hien dau hieu bat thuong ro rang theo rule hien tai.')


In [None]:
from IPython.display import Video
import subprocess
import sys

# Drive-side artifacts of the current run that get validated and previewed.
RUN_DIR = ARTIFACTS_DRIVE / RUN_ID
FINAL_DIR = DELIVERABLES_DRIVE / RUN_ID
ALIGNMENT = RUN_DIR / 'g2_align' / 'alignment_result.json'
SCRIPT = RUN_DIR / 'g5_segment' / 'summary_script.json'
MANIFEST = RUN_DIR / 'g5_segment' / 'summary_video_manifest.json'
REPORT = RUN_DIR / 'g8_qc' / 'quality_report.json'

# Run the repository's artifact validator; the script and contracts paths are
# relative to the repo root, which is the current working directory.
# check_call raises CalledProcessError when the validator exits non-zero.
subprocess.check_call([
    sys.executable,
    'docs/Reasoning-NLP/schema/validate_artifacts.py',
    '--alignment', str(ALIGNMENT),
    '--script', str(SCRIPT),
    '--manifest', str(MANIFEST),
    '--report', str(REPORT),
    '--contracts-dir', 'contracts/v1/template',
])

OUTPUT_VIDEO = FINAL_DIR / 'summary_video.mp4'
OUTPUT_TEXT = FINAL_DIR / 'summary_text.txt'
print('Output video:', OUTPUT_VIDEO)
print('Output text:', OUTPUT_TEXT)
print('--- Summary text preview ---')
print(OUTPUT_TEXT.read_text(encoding='utf-8'))
# Last expression of the cell: the notebook renders the embedded video player.
Video(str(OUTPUT_VIDEO), embed=True)

