# TIUR — AdamW Deep Dive: Time-to-Loss + LR Sweep

This notebook is designed to answer the practical question:

**Does a TIUR-motivated training trick (e.g., loss-mixed sampling) improve _time-to-loss_ under AdamW?**

It runs a small **AdamW LR sweep** for:
- `baseline` (IID)
- `lossmixed` (loss-mixed sampling)

Then it computes:
- **steps-to-target** (for several loss thresholds)
- **loss AUC** (area under the loss curve)
- TIUR summaries (final efficiency, final churn fraction, churn and directed integrals)

All artifacts are written to **Google Drive** so you don't lose them if the instance dies.


In [None]:
#@title 1) Mount Google Drive (outputs persist here)
import datetime
import os

from google.colab import drive

# Attach Google Drive; everything written below /content/drive/MyDrive lands on Drive.
drive.mount('/content/drive')

# Every execution of this cell gets its own timestamped folder under OUT_ROOT.
OUT_ROOT = '/content/drive/MyDrive/tiur_tricks_results_adam'
started_at = datetime.datetime.now()
run_id = started_at.strftime('run_%Y%m%d_%H%M%S')
out_dir = os.path.join(OUT_ROOT, run_id)

# exist_ok=True keeps the cell re-runnable; intermediate folders are created as needed.
os.makedirs(out_dir, exist_ok=True)
print('Outputs will be saved to:', out_dir)


In [None]:
#@title 2) Install deps (avoid reinstalling torch/torchvision in Colab)
# If you previously ran a cell that installed torchvision, restart runtime first.
# Only the lightweight helpers are installed here; torch is deliberately absent from
# the pip list so the build already present in the Colab runtime is the one imported below.
!pip install -q tqdm pandas matplotlib

# Report the torch build and whether it can see a CUDA device.
import torch
print('torch:', torch.__version__)
print('cuda available:', torch.cuda.is_available())
# `nvidia-smi -L` lists the GPUs attached to this runtime.
!nvidia-smi -L


In [None]:
#@title 3) Unzip repo (set the zip path if needed)
import os

# Option A: keep the zip on Drive for convenience
REPO_ZIP = '/content/drive/MyDrive/tiur_tricks_colab_v4.zip'  # <- change if your zip has a different name
# Folder the archive is expected to produce when extracted into /content/.
REPO_DIR = '/content/tiur_tricks_colab'

# Extract only once per runtime: an already-present REPO_DIR is reused as-is.
if not os.path.isdir(REPO_DIR):
    assert os.path.exists(REPO_ZIP), f'Could not find {REPO_ZIP}. Upload the zip to Drive or change REPO_ZIP.'
    # IPython substitutes the Python variable REPO_ZIP into the shell command.
    !unzip -q "$REPO_ZIP" -d /content/

# Sanity check: the `tiur_tricks` package must sit directly inside REPO_DIR.
assert os.path.exists(os.path.join(REPO_DIR, 'tiur_tricks', '__init__.py')), 'Repo did not unzip correctly.'
print('Repo ready at:', REPO_DIR)


In [None]:
#@title 4) Robust imports (ensure we import *your* tiur_tricks)
import sys

# Put the unzipped repo at the front of sys.path so its `tiur_tricks` package is
# found before any other copy that may be importable.
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

# Clear stale imports if you re-ran cells.
# Removing only the top-level 'tiur_tricks' entry is not enough: its submodules
# ('tiur_tricks.<something>') stay cached in sys.modules, so the re-import below
# would re-run the package __init__ but still bind the old submodule objects.
# The names are collected first because sys.modules must not change size while
# it is being iterated.
stale_modules = [
    mod_name
    for mod_name in sys.modules
    if mod_name == 'tiur_tricks' or mod_name.startswith('tiur_tricks.')
]
for mod_name in stale_modules:
    del sys.modules[mod_name]

import tiur_tricks
# Print the file the package was loaded from so a wrong copy is easy to spot.
print('tiur_tricks loaded from:', getattr(tiur_tricks, '__file__', None))

from tiur_tricks import RunConfig, run_experiment_suite


In [None]:
#@title 5) Experiment config (small enough for Colab, meaningful enough for curves)
import math

# ---- Hardware, data, model -------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DATASET = 'cifar10'
MODEL = 'small_cnn'  # try 'resnet18' later

# ---- Run length ------------------------------------------------------------
# Long enough that time-to-target is meaningful, small enough for Colab.
EPOCHS = 10
BATCH_SIZE = 128
SUBSET_TRAIN = 10_000
SUBSET_EVAL = 2_000

# ---- TIUR estimation -------------------------------------------------------
# More replicates give more stable TIUR estimates at the price of runtime.
NUM_REPLICATES = 5

# ---- Logging cadence -------------------------------------------------------
# Checkpoint frequency trades plot resolution against overhead.
CHECKPOINT_EVERY = 100
EVAL_BATCHES = 20

# ---- Optimizer sweep -------------------------------------------------------
# One baseline run and one lossmixed run are created per learning rate,
# so keep this list short on Colab.
LR_LIST = [1e-3, 3e-3, 1e-2]
WEIGHT_DECAY = 0.0  # handed to RunConfig as `weight_decay`

# ---- Optional gradient-noise runs -------------------------------------------
# When INCLUDE_NOISE is True, one extra run is added per (LR, noise std) pair.
INCLUDE_NOISE = False
NOISE_STDS = [1e-3]

print('DEVICE:', DEVICE)
print('LR_LIST:', LR_LIST)


In [None]:
#@title 6) Build the AdamW suite: baseline vs lossmixed (plus optional noise)
from dataclasses import replace

# Template config: every run in the suite is a copy of this one with only the
# name, sampler, learning rate (and optionally gradient noise) overridden.
base = RunConfig(
    name='baseline',
    # where / what to train
    device=DEVICE,
    dataset=DATASET,
    model=MODEL,
    subset_train=SUBSET_TRAIN,
    subset_eval=SUBSET_EVAL,
    # optimisation
    optimizer='adamw',
    sampler='iid',
    lr=LR_LIST[0],
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    # measurement
    num_replicates=NUM_REPLICATES,
    checkpoint_every=CHECKPOINT_EVERY,
    eval_batches=EVAL_BATCHES,
)

# For each learning rate: the IID baseline first, then its loss-mixed counterpart.
suite = [
    cfg
    for lr in LR_LIST
    for cfg in (
        replace(base, name=f'baseline_lr{lr:g}', sampler='iid', lr=lr),
        replace(base, name=f'lossmixed_lr{lr:g}', sampler='loss_mixed', lr=lr),
    )
]

# Optional: IID runs with gradient noise, one per (learning rate, noise std) pair.
if INCLUDE_NOISE:
    suite.extend(
        replace(base, name=f'noise{ns:g}_lr{lr:g}', sampler='iid', lr=lr, grad_noise_std=ns)
        for lr in LR_LIST
        for ns in NOISE_STDS
    )

print('Number of runs in suite:', len(suite))
print('Runs:')
for cfg in suite:
    print(' -', cfg.name)


In [None]:
#@title 7) Run the suite (saves everything to Drive)
import time

# Wall-clock timestamp taken immediately before the suite is launched.
started = time.time()

# Plots are written to out_dir instead of being shown inline, and checkpoints
# are persisted, so the results live on Drive.
logs_df, summary_df = run_experiment_suite(
    suite,
    out_dir=out_dir,
    persist_checkpoints=True,
    save_plots=True,
    show_plots=False,
)

elapsed_min = (time.time() - started) / 60
print('Total wall time (min):', elapsed_min)

# Show the per-run summary with the lowest final loss first.
ranked = summary_df.sort_values('final_loss')
display(ranked)


In [None]:
#@title 8) Time-to-loss + AUC analysis + plots
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Ensure we have the full logs (mean curve per run).
# Prefer the in-memory frames from the run cell; fall back to the CSVs in out_dir
# when they are missing or empty. Looking the names up through globals() keeps the
# fallback reachable after a kernel restart, where `logs_df` / `summary_df` are not
# defined at all and a plain `logs_df is None` test would raise NameError.
# NOTE(review): assumes run_experiment_suite writes all_logs.csv and summary.csv
# into out_dir - confirm against the package.
logs_df = globals().get('logs_df')
if logs_df is None or len(logs_df) == 0:
    logs_df = pd.read_csv(os.path.join(out_dir, 'all_logs.csv'))
summary_df = globals().get('summary_df')
if summary_df is None or len(summary_df) == 0:
    summary_df = pd.read_csv(os.path.join(out_dir, 'summary.csv'))

# Order each run's curve by step; work on a copy of the summary so the frame
# returned by the run cell is not modified.
logs_df = logs_df.sort_values(['name', 'step'])
summary_df = summary_df.copy()

# Final loss of every baseline run, keyed by run name. Collected for reference
# only: the steps-to-target analysis in this cell uses just the absolute
# thresholds in abs_targets.
baseline_finals = summary_df[summary_df['name'].str.startswith('baseline_lr')].set_index('name')['final_loss'].to_dict()

abs_targets = [1.90, 1.87, 1.85, 1.83]  # adjust as you like

def steps_to_target(run_df, target):
    """Return the step of the first row whose mean loss is at or below ``target``.

    Rows are examined in the order they appear in ``run_df``, so pass the curve
    sorted by ``step`` to get the earliest hit.

    Args:
        run_df: DataFrame with ``step`` and ``loss_mean`` columns for one run.
        target: Loss threshold; a row counts as a hit when ``loss_mean <= target``.

    Returns:
        The hit step as a float, or ``np.inf`` when no row reaches the target.
    """
    reached = run_df.loc[run_df['loss_mean'] <= target, 'step']
    if reached.empty:
        return np.inf
    return float(reached.iloc[0])

def loss_auc(run_df):
    """Return the step-normalized area under one run's mean-loss curve.

    The area is the trapezoidal integral of ``loss_mean`` over ``step``, divided
    by the step span (last step minus first step), i.e. an average loss over the
    logged interval. Rows are used in the order given, so pass the curve sorted
    by ``step``.

    The trapezoid sum is computed directly rather than through ``np.trapz``,
    which is deprecated since NumPy 2.0, so the result does not depend on the
    installed NumPy version.

    Args:
        run_df: DataFrame with ``step`` and ``loss_mean`` columns for one run.

    Returns:
        The normalized AUC as a float, or ``np.nan`` when there are fewer than
        two points or the step span is zero (the normalization is undefined).
    """
    x = run_df['step'].to_numpy(dtype=float)
    y = run_df['loss_mean'].to_numpy(dtype=float)
    if len(x) < 2:
        return np.nan
    span = x[-1] - x[0]
    if span == 0:
        # All logged points share one step: avoid a 0/0 division.
        return np.nan
    # Trapezoid rule: sum over segments of width * mean height.
    area = np.sum(np.diff(x) * (y[1:] + y[:-1])) / 2.0
    return float(area / span)  # normalized AUC

# One record per run: its normalized loss AUC plus the step at which each
# absolute threshold in abs_targets is first reached.
rows = []
for run_name, run_logs in logs_df.groupby('name'):
    curve = run_logs.sort_values('step')
    threshold_hits = {
        f'steps_to_{tgt:.2f}': steps_to_target(curve, tgt)
        for tgt in abs_targets
    }
    rows.append({'name': run_name, 'loss_auc': loss_auc(curve), **threshold_hits})
extra = pd.DataFrame(rows)

# Left join keeps every summary row, including runs without a logged curve.
merged = pd.merge(summary_df, extra, on='name', how='left')

# Pick best LR per method family by final_loss and by AUC
def _family_of(run_name):
    """Classify a run as 'lossmixed', 'baseline' or 'other' from its name prefix."""
    if run_name.startswith('lossmixed_'):
        return 'lossmixed'
    if run_name.startswith('baseline_'):
        return 'baseline'
    return 'other'


merged['family'] = merged['name'].apply(_family_of)

# Only the two compared families take part in the ranking.
contenders = merged[merged['family'].isin(['baseline', 'lossmixed'])]

# After sorting ascending, the first row of each family is its best run.
best_by_final = contenders.sort_values('final_loss').groupby('family').head(1)
best_by_auc = contenders.sort_values('loss_auc').groupby('family').head(1)

report_cols = [
    'name',
    'final_loss',
    'final_efficiency',
    'final_churn_frac',
    'churn_integral',
    'directed_integral',
    'loss_auc',
]

print('Best by final_loss:')
display(best_by_final[report_cols])

print('Best by loss_auc (lower is better):')
display(best_by_auc[report_cols])

# Save merged table
merged_path = os.path.join(out_dir, 'adam_time_to_loss_summary.csv')
merged.to_csv(merged_path, index=False)
print('Saved:', merged_path)

# Plot loss curves for the best-by-final-loss runs (one line per selected run)
curves_fig, curves_ax = plt.subplots(figsize=(7, 4))
for name in best_by_final['name']:
    g = logs_df[logs_df['name'] == name].sort_values('step')
    curves_ax.plot(g['step'], g['loss_mean'], label=name)
curves_ax.set_xlabel('step')
curves_ax.set_ylabel('eval loss (mean over replicates)')
curves_ax.set_title('Best runs (by final loss) — loss vs step')
curves_ax.legend()
curves_ax.grid(True, alpha=0.3)

# Write the figure to out_dir before showing it.
fig1 = os.path.join(out_dir, 'best_final_loss_curves.png')
curves_fig.savefig(fig1, dpi=160, bbox_inches='tight')
plt.show()
print('Saved plot:', fig1)

# Plot steps-to-target comparison at a chosen absolute target (if reached)
target = 1.85
col = f'steps_to_{target:.2f}'

# Baseline and lossmixed runs only, ordered by family then name. Runs that never
# hit the target carry inf, which is turned into NaN here.
in_scope = merged['family'].isin(['baseline', 'lossmixed'])
sub = (
    merged.loc[in_scope]
    .sort_values(['family', 'name'])
    .loc[:, ['name', 'family', col, 'final_loss', 'loss_auc']]
    .replace([np.inf], np.nan)
)

bar_fig, bar_ax = plt.subplots(figsize=(7, 4))
x = np.arange(len(sub))
# Never-reached runs (NaN) are drawn as zero-height bars, as the axis label states.
bar_ax.bar(x, sub[col].fillna(0.0))
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(sub['name'], rotation=45, ha='right')
bar_ax.set_ylabel(f'steps to reach loss <= {target:.2f} (0 if never reached)')
bar_ax.set_title('Time-to-target (steps)')
bar_ax.grid(True, axis='y', alpha=0.3)

fig2 = os.path.join(out_dir, f'steps_to_{target:.2f}.png')
bar_fig.savefig(fig2, dpi=160, bbox_inches='tight')
plt.show()
print('Saved plot:', fig2)


## Next step if you still see “no time-to-loss benefit”

If lossmixed ends lower but doesn’t reach intermediate thresholds faster, the most common reasons are:

1. **Baseline LR is already optimal in this tiny regime**, so lossmixed can’t show a speedup without changing the stability boundary.
2. **Lossmixed mostly helps late training** (better asymptote), not early convergence.

Two practical follow-ups:
- **Increase the LR grid upward** (e.g., `[1e-3, 3e-3, 1e-2, 2e-2]`) and see if lossmixed stays stable at higher LR.
- **Try a larger model** (`resnet18`) and slightly more steps. Many “training tricks” don’t show clean speedups at very small scale.

This notebook already supports that: just change `LR_LIST`, `MODEL`, and/or `EPOCHS`.
