# SigilDERG Rust-QLoRA Phase 1 — Colab (A100/L4/T4)

This notebook:

- Mounts **Google Drive** and writes training logs to a Drive `logs` folder  
- Clones **SigilDERG-Finetuner** and installs dependencies  
- Logs into **Hugging Face** and **Weights & Biases**  
- Downloads your Rust-QLoRA adapter `checkpoint-9000` from Hugging Face  
- Generates a **GPU-aware** Phase-1 config tuned for A100/L4/T4 on Colab  
- Continues training from `checkpoint-9000`  
- Uploads **model checkpoints to W&B** as a model artifact

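You can adjust paths and run lengths by setting environment variables (for example `SIGILDERG_LOG_DIR`, `SIGILDERG_OUTPUT_DIR`, `SIGILDERG_NUM_STEPS`, `SIGILDERG_SAVE_EVERY`, `SIGILDERG_MAX_SEQ_LEN`) before running the config cell.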

In [None]:
import os
from google.colab import drive

# The allocator option has to be in the environment before anything imports CUDA.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Make Google Drive available under /content/drive.
drive.mount("/content/drive")

# Log directory used when SIGILDERG_LOG_DIR has not been set already.
default_log_dir = "/content/drive/MyDrive/SigilDERG/logs/phase1_a100_run1"

# Each variable keeps a value that is already present in the environment and
# only falls back to the default listed here. SIGILDERG_OUTPUT_DIR is the
# *local* (ephemeral) checkpoint location; checkpoints also go to W&B.
for env_name, fallback in (
    ("SIGILDERG_LOG_DIR", default_log_dir),
    ("SIGILDERG_OUTPUT_DIR", "/content/sigilderg-out/phase1"),
):
    os.environ.setdefault(env_name, fallback)
    print(f"{env_name} =", os.environ[env_name])

Mounted at /content/drive
SIGILDERG_LOG_DIR = /content/drive/MyDrive/SigilDERG/logs/phase1_a100_run1
SIGILDERG_OUTPUT_DIR = /content/sigilderg-out/phase1


In [None]:
import os
import subprocess
import pathlib

print("Python version:")
!python --version

print("\nDetected GPU via nvidia-smi:")
try:
    display_str = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
        encoding="utf-8",
    )
    print(display_str)
except Exception as e:
    print("Could not query GPU:", e)

# Set a base HF cache dir for the session
os.environ.setdefault("HF_HOME", "/content/hf-home")
pathlib.Path(os.environ["HF_HOME"]).mkdir(parents=True, exist_ok=True)

print("\nHF_HOME:", os.environ["HF_HOME"])

Python version:
Python 3.12.12

Detected GPU via nvidia-smi:
NVIDIA A100-SXM4-40GB, 40960 MiB


HF_HOME: /content/hf-home


In [None]:
import os
import subprocess
import multiprocessing

# --- SigilDERG-Finetuner repository setup ---
REPO_DIR = "/content/SigilDERG-Finetuner"
REPO_URL = "https://github.com/Superuser666-Sigil/SigilDERG-Finetuner"

# Clone or update the repository
if not os.path.exists(REPO_DIR):
    print(f"Cloning {REPO_URL} into {REPO_DIR}...")
    subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
else:
    print(f"Repository already exists at {REPO_DIR}, fetching latest changes...")
    # Explicitly fetch origin to ensure we have the latest commits
    subprocess.run(["git", "-C", REPO_DIR, "fetch", "origin"], check=True)
    # Reset local to match the fresh origin/main
    subprocess.run(["git", "-C", REPO_DIR, "reset", "--hard", "origin/main"], check=True)

# Change current directory to the repository for dependency installation
os.chdir(REPO_DIR)
print(f"Current working directory changed to: {os.getcwd()}")

# --- Dependency Installation ---

# Set MAX_JOBS to use all cores
os.environ["MAX_JOBS"] = str(multiprocessing.cpu_count())
print(f"\nSetting build parallelism: MAX_JOBS={os.environ['MAX_JOBS']}")

# Install ninja for build speed
print("\nInstalling ninja...")
!pip install ninja

# SKIP flash-attn to prevent build failures (config is already updated to disable it)
print("\nSkipping flash-attn installation...")

# Install dependencies from requirements.txt directly.
# Removed the manual numpy/protobuf upgrades to prevent conflicts with requirements.txt constraints.
print("\nInstalling SigilDERG-Finetuner dependencies and the package itself...")
!pip install -r requirements.txt
!pip install -e .

print("\nSigilDERG-Finetuner installed successfully.")

Repository already exists at /content/SigilDERG-Finetuner, fetching latest changes...
Current working directory changed to: /content/SigilDERG-Finetuner

Setting build parallelism: MAX_JOBS=12

Installing ninja...

Skipping flash-attn installation...

Installing SigilDERG-Finetuner dependencies and the package itself...
Obtaining file:///content/SigilDERG-Finetuner
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: sigilderg-finetuner
  Building editable for sigilderg-finetuner (pyproject.toml) ... [?25l[?25hdone
  Created wheel for sigilderg-finetuner: filename=sigilderg_finetuner-3.0.0-0.editable-py3-none-any.whl size=15470 sha256=b3c234226a9c12071d29b77eafd21fc908e4d0226b808b6a5116076836f4f7d2
  Stored in directory: /tmp/pip-ephem-wheel-cac

In [None]:
import os, getpass
from huggingface_hub import login as hf_login
import wandb

# ----- Hugging Face login -----
# A non-blank HF_TOKEN from the environment is used as-is; otherwise the token
# is requested through a hidden prompt.
env_hf_token = os.environ.get("HF_TOKEN", "")
if env_hf_token.strip():
    print("Using HF_TOKEN from environment")
    hf_login(token=env_hf_token)
else:
    hf_token = getpass.getpass("Enter your Hugging Face token (with write access to models): ")
    hf_login(token=hf_token)

# ----- Weights & Biases login -----
# Same precedence: an environment key if present, otherwise wandb.login()
# is called without a key.
env_wandb_key = os.environ.get("WANDB_API_KEY", "")
if env_wandb_key.strip():
    print("Using WANDB_API_KEY from environment")
    wandb.login(key=env_wandb_key)
else:
    wandb.login()

# Basic W&B metadata; values already present in the environment are kept.
wandb_defaults = {
    "WANDB_PROJECT": "rust-qlora-phase1-colab",
    "WANDB_NOTEBOOK_NAME": "SigilDERG_Phase1_Colab",
    "WANDB_LOG_MODEL": "checkpoint",
}
for var_name, default_value in wandb_defaults.items():
    os.environ.setdefault(var_name, default_value)

print("W&B project:", os.environ["WANDB_PROJECT"])

Enter your Hugging Face token (with write access to models): ··········


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdavetmire85[0m ([33mdavetmire85-southern-new-hampshire-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B project: rust-qlora-phase1-colab


In [None]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

adapter_repo = "Superuser666-Sigil/Llama-3.1-8B-Instruct-Rust-QLora"
local_adapter_root = "/content/rust-qlora-adapter"
# The only checkpoint this notebook resumes from.
checkpoint_name = "checkpoint-9000"

print(f"Downloading adapter repo: {adapter_repo}")
# Restrict the snapshot to the checkpoint that is actually used; without a
# filter every checkpoint in the repository is downloaded as well.
local_repo = snapshot_download(
    repo_id=adapter_repo,
    local_dir=local_adapter_root,
    repo_type="model",
    allow_patterns=[f"{checkpoint_name}/*"],
)

# Fail early if the repository layout is not what the notebook expects.
checkpoint_dir = os.path.join(local_repo, checkpoint_name)
if not os.path.isdir(checkpoint_dir):
    raise RuntimeError(f"Expected checkpoint directory not found: {checkpoint_dir}")

print("Local adapter checkpoint path:", checkpoint_dir)
# The config cell reads this variable to set misc.load_from.
os.environ["SIGILDERG_LOAD_FROM"] = checkpoint_dir


Downloading adapter repo: Superuser666-Sigil/Llama-3.1-8B-Instruct-Rust-QLora


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 236 files:   0%|          | 0/236 [00:00<?, ?it/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-1000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-1000/checkpoint-1000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-1000/checkpoint-1000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-1000/checkpoint-1000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-1000/checkpoint-1000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-1000/checkpoint-1000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-1000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-1000/checkpoint-1000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-1000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

checkpoint-1000/checkpoint-1000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-1000/checkpoint-1000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-1000/checkpoint-1000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

checkpoint-1000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-1000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

checkpoint-1000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

checkpoint-1000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-1000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-1000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-2000/checkpoint-2000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-2000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-2000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-2000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-2000/checkpoint-2000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

checkpoint-2000/checkpoint-2000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

checkpoint-4000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-4000/checkpoint-4000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-2000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-4000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-4000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-2000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-4000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-5000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-4000/checkpoint-4000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-4000/checkpoint-4000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-5000/checkpoint-5000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-5000/checkpoint-5000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

checkpoint-4000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-5000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-4000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

checkpoint-5000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-5000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

checkpoint-5000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-5000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

checkpoint-5000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

checkpoint-6000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-5000/checkpoint-5000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-6000/checkpoint-6000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-6000/checkpoint-6000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-6000/checkpoint-6000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/checkpoint-6000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/checkpoint-6000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-6000/checkpoint-6000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

checkpoint-5000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-6000/checkpoint-6000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-5000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-6000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-6000/checkpoint-6000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-6000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-6000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-6000/checkpoint-6000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/adapter_(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/optimize(…):   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-6000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-7000/checkpoint-7000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-7000/checkpoint-7000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-7000/checkpoint-7000/tokenize(…):   0%|          | 0.00/17.2M [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/training(…):   0%|          | 0.00/6.22k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

metrics.jsonl:   0%|          | 0.00/653 [00:00<?, ?B/s]

checkpoint-7000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-7000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-7000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-7000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/rng_stat(…):   0%|          | 0.00/15.4k [00:00<?, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-7000/checkpoint-7000/schedule(…):   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-7000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-7000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-8000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

checkpoint-8000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-7000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-7000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

checkpoint-8000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-8000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-8000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-8000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

checkpoint-8000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

checkpoint-7000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

metrics.jsonl: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

errors.jsonl: 0.00B [00:00, ?B/s]

checkpoint-8000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-8000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-9000/optimizer.pt:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

checkpoint-9000/rng_state_0.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

checkpoint-9000/rng_state_2.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

checkpoint-9000/rng_state_3.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

samples.jsonl: 0.00B [00:00, ?B/s]

checkpoint-9000/scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

checkpoint-9000/adapter_model.safetensor(…):   0%|          | 0.00/168M [00:00<?, ?B/s]

checkpoint-9000/tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

metrics.jsonl:   0%|          | 0.00/569 [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

checkpoint-9000/rng_state_1.pth:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

checkpoint-9000/training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

Local adapter checkpoint path: /content/rust-qlora-adapter/checkpoint-9000


In [None]:
import os
import subprocess
from pathlib import Path
import yaml

def detect_gpu():
    """Return ``(name, memory_gb)`` for the first GPU listed by ``nvidia-smi``.

    The memory figure is the reported ``memory.total`` value divided by 1024
    and truncated to an integer. Any failure (command missing, unexpected
    output, ...) is reported with a warning and ``("unknown", 0)`` is returned.
    """
    query = ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"]
    try:
        raw_output = subprocess.check_output(query, encoding="utf-8")
        # Only the first reported GPU is considered.
        first_gpu = raw_output.strip().splitlines()[0]
        fields = [field.strip() for field in first_gpu.split(",")]
        name_raw, mem_raw = fields
        # mem_raw looks like "40960 MiB": keep the leading number.
        mem_value = int(mem_raw.split()[0])
        return name_raw, int(mem_value / 1024)
    except Exception as e:
        print("WARNING: could not detect GPU via nvidia-smi:", e)
        return "unknown", 0

# Query the GPU once; its name and memory size select the tuning tier below.
gpu_name, gpu_mem_gb = detect_gpu()
print(f"Detected GPU: {gpu_name} ({gpu_mem_gb} GB)")

# The Colab config is derived from the repository's Phase-1 base config and
# written next to it under a different file name.
CFG_DIR = Path("/content/SigilDERG-Finetuner/rust-qlora/configs")
base_cfg = CFG_DIR / "llama8b-phase1.yml"
colab_cfg = CFG_DIR / "llama8b-phase1-colab.yml"

if not base_cfg.exists():
    raise FileNotFoundError(f"Base config not found: {base_cfg}")

with open(base_cfg, "r") as f:
    cfg = yaml.safe_load(f)

# Dataset caching is switched off; cache_dir becomes None unless
# SIGILDERG_CACHE_DIR is set.
cfg["dataset"]["use_cache"] = False
cfg["dataset"]["cache_dir"] = os.environ.get("SIGILDERG_CACHE_DIR")

# Checkpoints go to SIGILDERG_OUTPUT_DIR; logs go to SIGILDERG_LOG_DIR when it
# is set, otherwise to a "logs" folder inside the output directory.
output_dir = os.environ.get("SIGILDERG_OUTPUT_DIR", "/content/sigilderg-out/phase1")
log_dir_env = os.environ.get("SIGILDERG_LOG_DIR")
logging_dir = log_dir_env if log_dir_env else os.path.join(output_dir, "logs")

os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

if os.environ.get("SIGILDERG_MAX_SEQ_LEN"): # Manual override via env vars
    max_seq_len = int(os.environ["SIGILDERG_MAX_SEQ_LEN"])
    micro_batch = int(os.environ.get("SIGILDERG_MICRO_BATCH", 1))
    grad_accum = int(os.environ.get("SIGILDERG_GRAD_ACCUM", 8))
    grad_checkpointing = True
    use_flash_attn = False
    # Keep whatever the base config says about bf16 (defaulting to True).
    bf16 = cfg["train"].get("bf16", True)
else: # Auto-tune by GPU memory
    # A failed detection reports 0 GB and therefore lands in the smallest tier.
    if "T4" in gpu_name or gpu_mem_gb <= 16: # Tiny GPUs
        max_seq_len = 1536
        micro_batch = 1
        grad_accum = 8
        grad_checkpointing = True
        use_flash_attn = False
        bf16 = False
    elif gpu_mem_gb <= 40: # 24–40 GB (including your A100 40GB)
        max_seq_len = 2048
        micro_batch = 1
        grad_accum = 8 # effective batch 8
        grad_checkpointing = True
        use_flash_attn = False # keep it simple, we can turn this back on later
        bf16 = True
    else: # True big iron (80GB+)
        max_seq_len = 4096
        micro_batch = 4
        grad_accum = 4
        grad_checkpointing = True
        use_flash_attn = True
        bf16 = True

cfg["max_seq_len"] = max_seq_len
# Every numeric training knob can still be overridden through its own
# SIGILDERG_* environment variable; the values chosen above are the fallbacks.
cfg["train"].update({
    "micro_batch_size": int(os.environ.get("SIGILDERG_MICRO_BATCH", micro_batch)),
    "gradient_accumulation": int(os.environ.get("SIGILDERG_GRAD_ACCUM", grad_accum)),
    "num_steps": int(os.environ.get("SIGILDERG_NUM_STEPS", 4000)),
    "logging_steps": int(os.environ.get("SIGILDERG_LOGGING_STEPS", 20)),
    "save_every": int(os.environ.get("SIGILDERG_SAVE_EVERY", 500)),
    "log_backend": "wandb",
    "grad_checkpointing": grad_checkpointing,
    "bf16": bf16,
    "dataloader_num_workers": int(os.environ.get("SIGILDERG_DATALOADER_WORKERS", 0)),
    "dataloader_prefetch_factor": int(os.environ.get("SIGILDERG_PREFETCH_FACTOR", 1)),
    "clear_cache_every_n_steps": int(os.environ.get("SIGILDERG_CLEAR_CACHE_STEPS", 50)),
})
cfg["train"]["use_flash_attention"] = use_flash_attn

# Make sure a "misc" mapping exists before writing to it. Previously it was
# only created when SIGILDERG_LOAD_FROM was set, so a base config without a
# usable "misc" section failed on the output_dir assignment below.
if not isinstance(cfg.get("misc"), dict):
    cfg["misc"] = {}

load_from = os.environ.get("SIGILDERG_LOAD_FROM")
if load_from:
    cfg["misc"]["load_from"] = load_from
    print("Configured misc.load_from =", load_from)

cfg["misc"]["output_dir"] = output_dir
cfg["misc"]["logging_dir"] = logging_dir

with open(colab_cfg, "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)

# Summary of the values that were actually written.
eff_batch = cfg["train"]["micro_batch_size"] * cfg["train"]["gradient_accumulation"]
print("Wrote Colab config to", colab_cfg)
print("Effective batch size:", eff_batch)
print("max_seq_len:", cfg["max_seq_len"])
print("bf16:", cfg["train"]["bf16"], "use_flash_attention:", cfg["train"]["use_flash_attention"])
print("Outputs (checkpoints) will be stored in:", output_dir)
print("Logs will be stored in:", logging_dir)

Detected GPU: NVIDIA A100-SXM4-40GB (40 GB)
Configured misc.load_from = /content/rust-qlora-adapter/checkpoint-9000
Wrote Colab config to /content/SigilDERG-Finetuner/rust-qlora/configs/llama8b-phase1-colab.yml
Effective batch size: 16
max_seq_len: 3072
bf16: True use_flash_attention: False
Outputs (checkpoints) will be stored in: /content/sigilderg-out/phase1
Logs will be stored in: /content/sigilderg-out/phase1/logs


In [None]:
# Setup: Clear CUDA cache and reset memory stats before training
import torch
# Only touch the CUDA allocator when a device is visible to this kernel, and
# do not report readiness when there is none.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print("CUDA cache cleared and memory stats reset. Ready for training.")
else:
    print("WARNING: no CUDA device is available in this runtime; skipping CUDA cache reset.")

In [None]:
import os

# Ensure we are in the repo directory
os.chdir("/content/SigilDERG-Finetuner")

cfg_path = "rust-qlora/configs/llama8b-phase1-colab.yml"

print(f"Running training command for config: {cfg_path}")
print("Streamed output should appear below:")

# Run with ! to stream stdout/stderr to the notebook cell
!python -m rust_qlora.train --cfg {cfg_path}

Running training command for config: rust-qlora/configs/llama8b-phase1-colab.yml
Streamed output should appear below:
2025-12-08 21:52:29.791529: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-08 21:52:29.811344: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765230749.834139   27612 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765230749.840757   27612 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765230749.85

In [None]:
import os
import wandb

# Same defaults as the earlier cells, so this cell also works when the
# environment variables were never customised.
output_dir = os.environ.get("SIGILDERG_OUTPUT_DIR", "/content/sigilderg-out/phase1")
project = os.environ.get("WANDB_PROJECT", "rust-qlora-phase1-colab")

print("Preparing to upload checkpoints from:", output_dir)
if not os.path.isdir(output_dir):
    raise RuntimeError(f"Output directory not found: {output_dir}")

# The config cell creates output_dir up front, so its existence alone does not
# show that training produced anything. Refuse to publish an empty artifact.
has_files = any(filenames for _, _, filenames in os.walk(output_dir))
if not has_files:
    raise RuntimeError(f"Output directory contains no files to upload: {output_dir}")

artifact_name = os.environ.get(
    "SIGILDERG_WANDB_ARTIFACT_NAME",
    "llama-3.1-8B-rust-qlora-phase1-colab"
)

# A dedicated run is opened only for the upload.
run = wandb.init(project=project, job_type="checkpoint-upload")

artifact = wandb.Artifact(
    artifact_name,
    type="model",
    metadata={
        "source": "colab_phase1",
        # Records which checkpoint this run was resumed from ("" if unset).
        "load_from": os.environ.get("SIGILDERG_LOAD_FROM", ""),
    },
)

# Add the whole output directory to the artifact, log it, then close the run.
artifact.add_dir(output_dir)
run.log_artifact(artifact)
run.finish()

print("Uploaded checkpoints from", output_dir, "to W&B artifact:", artifact_name)


Preparing to upload checkpoints from: /content/sigilderg-out/phase1


[34m[1mwandb[0m: Adding directory to artifact (/content/sigilderg-out/phase1)... Done. 0.0s


Uploaded checkpoints from /content/sigilderg-out/phase1 to W&B artifact: llama-3.1-8B-rust-qlora-phase1-colab
