# Sanity Check Pipeline (Colab, T4)

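This notebook runs the independent `sanity_check` pipeline, using the `vblagoje/cc_news` dataset as the text pool.
Samples are drawn at random by shuffling the dataset rather than taking the first N rows; the seed is chosen randomly on each run and printed by the configuration cell.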

In [None]:
# Optional setup for a fresh Colab runtime: uncomment the two lines below to
# clone the repository and change into it.
# !git clone https://github.com/TMKempton/predicting_tempnorm.git
# %cd predicting_tempnorm

# Confirm the working directory is the repository root. The following cells
# use paths relative to it (e.g. sanity_check/requirements.txt) and run the
# pipeline steps with `python -m sanity_check.<module>`.
!pwd
!ls -la

In [None]:
# Install deps (Colab T4 is fine for OPT-125m)
!pip -q install -r sanity_check/requirements.txt

In [None]:
import random
import torch

# --- Model and text pool -----------------------------------------------------
MODEL = 'facebook/opt-125m'
DATASET_NAME = 'vblagoje/cc_news'
DATASET_SPLIT = 'train'
DATASET_CONFIG = ''  # keep empty unless a specific config is needed

# --- Sample sizes ------------------------------------------------------------
# REF_COUNT is passed as --sample-count to the Phase A command and GEN_COUNT as
# --sample-count to the Phase B command.
REF_COUNT = 2000
GEN_COUNT = 5000

# --- Seed --------------------------------------------------------------------
# A new seed is drawn every time this cell runs; it is printed below so the
# value used for a given run is visible in the output. Phase A receives SEED
# and Phase B receives SEED + 1.
SEED = random.randint(1, 10_000_000)

# --- Environment report ------------------------------------------------------
cuda_ok = torch.cuda.is_available()
print('Seed:', SEED)
print('CUDA available:', cuda_ok)
if cuda_ok:
    print('GPU:', torch.cuda.get_device_name(0))

In [None]:
# Quick check: show random examples (shuffled order) from cc_news
from datasets import load_dataset

# Load the configured split. The config name is passed to load_dataset only
# when DATASET_CONFIG is non-empty.
load_args = (DATASET_NAME, DATASET_CONFIG) if DATASET_CONFIG else (DATASET_NAME,)
ds = load_dataset(*load_args, split=DATASET_SPLIT)

# Shuffle with the run seed so the preview is drawn from shuffled order
# instead of the first rows of the split.
shuffled = ds.shuffle(seed=SEED)

# Candidate text columns, tried in this order; the first non-empty value is
# shown, and an empty string is used when none of them has content.
TEXT_FIELDS = ('text', 'article', 'content', 'description')
PREVIEW_CHARS = 400

for idx in range(3):
    record = shuffled[idx]
    snippet = next((value for value in map(record.get, TEXT_FIELDS) if value), '')
    print(f'--- sample {idx} ---')
    # Truncate long texts and mark the cut with an ellipsis.
    if len(snippet) > PREVIEW_CHARS:
        print(snippet[:PREVIEW_CHARS] + '...')
    else:
        print(snippet)

In [None]:
# Phase A: build PCA + KMeans reference space
cmd = f"python -m sanity_check.build_reference_space --output-dir sanity_check/artifacts --model {MODEL} --dataset-name {DATASET_NAME} --dataset-split {DATASET_SPLIT} --sample-count {REF_COUNT} --token-index 50 --pca-components 30 --num-clusters 50 --seed {SEED}"
if DATASET_CONFIG:
    cmd += f" --dataset-config {DATASET_CONFIG}"
print(cmd)
!$cmd

In [None]:
# Phase B: generate clustered tempnorm data
cmd = f"python -m sanity_check.generate_clustered_tempnorm_data --artifacts-dir sanity_check/artifacts --output-file sanity_check/data/clustered_tempnorm.jsonl --model {MODEL} --dataset-name {DATASET_NAME} --dataset-split {DATASET_SPLIT} --sample-count {GEN_COUNT} --prefix-len 30 --wander-len 20 --gen-number 8 --gen-length 50 --alpha 4 --eval-lengths 1 2 5 10 20 50 --seed {SEED + 1}"
if DATASET_CONFIG:
    cmd += f" --dataset-config {DATASET_CONFIG}"
print(cmd)
!$cmd

In [None]:
# Phase C: analysis
# Runs the analysis module on the JSONL file that the Phase B command was told
# to write, with sanity_check/artifacts/analysis as the output directory.
!python -m sanity_check.analyze_clustered_tempnorm --data-file sanity_check/data/clustered_tempnorm.jsonl --output-dir sanity_check/artifacts/analysis

In [None]:
# Show summary outputs
!cat sanity_check/artifacts/analysis/analysis_summary.txt

import json
from pathlib import Path
report = json.loads(Path('sanity_check/artifacts/analysis/analysis_report.json').read_text())
report.keys()