In [5]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.9.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.9.0 (from mlflow)
  Downloading mlflow_skinny-3.9.0-py3-none-any.whl.metadata (32 kB)
Collecting mlflow-tracing==3.9.0 (from mlflow)
  Downloading mlflow_tracing-3.9.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.18.3-py3-none-any.whl.metadata (7.2 kB)
Collecting cryptography<47,>=43.0.0 (from mlflow)
  Downloading cryptography-46.0.4-cp311-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.4 (from mlflow)


In [1]:
# Load model directly

import os, sys
# sys.path += ["/teamspace/studios/this_studio"]

import torch
import random
from torch import nn

from typing import Iterator, Tuple
import json
import numpy as np
import pandas as pd
from gpt2tiny.tokenizer import Tokenizer
import glob
from dataclasses import dataclass
import math
from pytorch_lightning.loggers import MLFlowLogger

from gpt2tiny.model import GPT2, GPTConfig
# from dataset import PreTokDataset
from gpt2tiny.dataset import SFTDataset, collator 
from gpt2tiny.trainer import TrainingConfig, SFTGPT2Module
import torch.distributed as dist
from typing import Iterator, Tuple
from pathlib import Path
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import mlflow

# Tracking server for the global `mlflow` client. The MLFLOW_TRACKING_URI
# environment variable takes precedence; the local server is the fallback.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

# Project root. Override with GPT2TINY_BASE_DIR when running outside the
# original studio. Kept as a plain str because later cells interpolate it
# into f-strings.
BASE_DIR = os.environ.get("GPT2TINY_BASE_DIR", "/teamspace/studios/this_studio/gpt2tiny")
# Directory holding the tokenizer model and the datasets.
DATA_CACHE_DIR = Path(BASE_DIR) / "data"

In [2]:
# File-based MLflow store under BASE_DIR/mlruns. The checkpoint callback later
# builds its dirpath from this same mlruns/<experiment_id>/<run_id> layout.
# NOTE(review): this is a different location from the HTTP URI passed to
# mlflow.set_tracking_uri in the setup cell — confirm which store is intended.
mlruns_uri = f"{BASE_DIR}/mlruns"
mlf_logger = MLFlowLogger(experiment_name="test", tracking_uri=mlruns_uri)

  return FileStore(store_uri, store_uri)


In [3]:
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [4]:
# Model hyper-parameters; fields not listed here keep their GPTConfig defaults.
# vocab_size presumably matches the tok8192 tokenizer loaded later — TODO confirm.
config = GPTConfig(
    vocab_size=8192,
    dropout=0.1,
    flash=True,
)

In [5]:
# Display the resolved model configuration, defaults included.
config

GPTConfig(block_size=512, vocab_size=8192, n_layer=8, n_head=8, n_embed=512, n_expert=2, k=1, dropout=0.1, bias=False, use_rotary=False, flash=True, noisy_gating=True, capacity_factor=10, load_loss_coef=0.01)

In [6]:
# Values overridden on top of the TrainingConfig defaults for this run.
# NOTE(review): the displayed config shows warmup_iters=1000 and
# lr_decay_iters=30000 alongside max_iters=1000. If the module's LR schedule
# reads those fields, the run ends just as warmup finishes — confirm.
trainer_overrides = dict(
    max_iters=1000,
    batch_size=32,
    gradient_accumulation_steps=8,
    num_workers=4,
    eval_interval=10,
    log_interval=1,
)
trainer_config = TrainingConfig(**trainer_overrides)

In [7]:
# Display the resolved training configuration, defaults included.
trainer_config

TrainingConfig(learning_rate=0.0006, max_iters=1000, weight_decay=0.1, beta1=0.9, beta2=0.95, grad_clip=1.0, decay_lr=True, warmup_iters=1000, lr_decay_iters=30000, min_lr=6e-05, eval_interval=10, log_interval=1, eval_iters=200, gradient_accumulation_steps=8, batch_size=32, num_workers=4, device='cuda', dtype='bfloat16', compile=True)

In [8]:
# The tokenizer model sits in the shared data directory, so reuse
# DATA_CACHE_DIR instead of rebuilding the path from BASE_DIR by hand.
# Passed as str, matching the original call.
tokenizer = Tokenizer(str(DATA_CACHE_DIR / "tok8192.model"))

In [9]:
def _make_sft_dataloader(split: str) -> DataLoader:
    """Build the MetaMathQA SFT DataLoader for one dataset split.

    Both loaders share every setting except the split name, so they are
    built here to keep the two from drifting apart.

    Args:
        split: Split name forwarded to SFTDataset ("train" or "validation").

    Returns:
        A DataLoader using the batch size and worker count from trainer_config.
    """
    dataset = SFTDataset(
        split=split,
        data_dir=[DATA_CACHE_DIR / "MetaMathQA"],
        weights="Balanced",
    )
    return DataLoader(
        dataset,
        # The 0 is collator's second positional argument — presumably the
        # padding token id; TODO confirm against gpt2tiny.dataset.collator.
        collate_fn=lambda batch: collator(batch, 0),
        batch_size=trainer_config.batch_size,
        num_workers=trainer_config.num_workers,
    )


train_dataloader = _make_sft_dataloader("train")
eval_dataloader = _make_sft_dataloader("validation")

In [10]:
# Checkpoint that initialises fine-tuning. The path is relative, so it
# resolves against the kernel's current working directory.
# NOTE(review): weights_only=False is presumably forwarded to torch.load,
# which then unpickles arbitrary objects — only load trusted checkpoints.
sft_init_ckpt = "scratch/best-step=9775-val_loss=0.4077.ckpt"
module = SFTGPT2Module.load_from_checkpoint(
    sft_init_ckpt,
    weights_only=False,
    config=config,
)

In [11]:
# Store checkpoints inside this run's folder of the file-based MLflow store,
# so they sit next to the logged metrics.
run_artifact_dir = f"{BASE_DIR}/mlruns/{mlf_logger.experiment_id}/{mlf_logger.run_id}/artifacts/"

# Keep only the single checkpoint with the lowest "val_loss". The monitored
# key must match the metric name the module logs.
checkpoint_cb = ModelCheckpoint(
    dirpath=run_artifact_dir,
    filename="best-{step}-{val_loss:.4f}",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

In [12]:
# Lightning trainer wired to the values in trainer_config, so the run length,
# validation cadence and gradient settings are defined in one place.
trainer = pl.Trainer(
    accelerator="auto",
    devices=1,  # single device
    # NOTE(review): trainer_config reports dtype='bfloat16' while this asks
    # for fp16 mixed precision — confirm which one is intended.
    precision="16-mixed",
    max_steps=trainer_config.max_iters,  # total optimizer steps for the run
    # Validate every `eval_interval` training batches.
    val_check_interval=trainer_config.eval_interval,
    # Cap each validation loop; taken from the config instead of a literal 200.
    limit_val_batches=trainer_config.eval_iters,
    logger=mlf_logger,
    callbacks=[checkpoint_cb],
    log_every_n_steps=trainer_config.log_interval,
    accumulate_grad_batches=trainer_config.gradient_accumulation_steps,
    gradient_clip_val=trainer_config.grad_clip,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Run supervised fine-tuning; validation uses the held-out split.
trainer.fit(
    module,
    train_dataloaders=train_dataloader,
    val_dataloaders=eval_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Output()

In [17]:
# Reload a checkpoint for qualitative inspection (generation and MoE routing).
# NOTE(review): GPT2Module is not imported anywhere in the visible import cell
# (only TrainingConfig and SFTGPT2Module come from gpt2tiny.trainer), so this
# cell raises NameError on a fresh kernel — add the missing import.
# NOTE(review): the path is relative ("./mlruns/...") while the logger writes
# under BASE_DIR/mlruns — it only resolves if the working directory is BASE_DIR.
model = GPT2Module.load_from_checkpoint(
    "./mlruns/833148605187015421/964cb2ae845841cf90237bc1826cd54e/artifacts/best-step=4550-val_loss=1.4789.ckpt",
    config=config,
    tokenizer=tokenizer,
    gen_every_n_epochs=500,
    prompts=["A dragon in a cave", "1+1 is", "The CMB is the"]
)

In [41]:
# Prompt to complete. Uncomment one of the alternatives to switch.
# prompt = "There once was a man"
# prompt = "A dragon in a cave"
# prompt = "I like video games"
# prompt = "2 + 2 is"
prompt = "$(x+1)^2="

# Underlying network wrapped by the loaded module.
raw_model = model.model
raw_model.eval()  # evaluation mode (e.g. dropout layers become inactive)

# Sampling needs no gradients, so skip building an autograd graph.
# The positional 55 is presumably the number of new tokens — TODO confirm.
with torch.no_grad():
    output = raw_model.generate(prompt, 55, top_k=10, top_p=None, tokenizer=tokenizer, temperature=1.5)
print(output)

$(x+1)^2= The area of one square is calculated by multiplying its length by itself to $8\times 10\times x$.
Since there are two legs, the perimeter of one leg must be $84/2=24$ cm.
Therefore


In [37]:
# Prompt whose routing is inspected below. Uncomment an alternative to switch.
# prompt = "There once was a girl named lily"
# prompt = "A dragon in a cave"
# prompt = "What is going on?"
# prompt = "2 + 2 is"
prompt = "$(x+1)^2="

# Put the token tensor on the same device as the model's parameters.
device = next(raw_model.parameters()).device

# Encode with BOS/EOS markers, then add a leading batch dimension so that
# `idx` has shape (1, T). Assumes encode returns a flat list of ints — TODO confirm.
token_ids = tokenizer.encode(prompt, bos=True, eos=True)
idx = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

In [38]:
# Manual embedding step: token embeddings plus learned position embeddings.
B, T = idx.shape

positions = torch.arange(T, device=idx.device, dtype=torch.long)
tok_emb = raw_model.transformer.wte(idx)
pos_emb = raw_model.transformer.wpe(positions)

# Input to the first transformer block.
x = tok_emb + pos_emb


In [39]:
# Walk the transformer blocks by hand and keep each block's MoE output so the
# routing statistics can be inspected afterwards.
moe_data = []

# Work on a separate name: `x` (the embeddings from the previous cell) is left
# untouched, so re-running this cell gives the same result instead of feeding
# on its own previous output. The final hidden state stays available as `hidden`.
hidden = x

# Inspection only — no gradients needed.
with torch.no_grad():
    for block in raw_model.transformer.h:
        # Residual connection around attention.
        hidden = hidden + block.attn(block.ln_1(hidden))

        # The MoE layer returns an object; `.y` is added back onto the
        # residual stream and the whole object is kept for later inspection.
        moe_out = block.moe(block.ln_2(hidden))
        hidden = hidden + moe_out.y
        moe_data.append(moe_out)

In [40]:
# `importance` recorded for each transformer block's MoE output, in layer order
# (presumably the router's per-expert share — TODO confirm).
[layer_out.importance for layer_out in moe_data]

[tensor([0.4918, 0.5082], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5232, 0.4768], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5178, 0.4822], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5096, 0.4904], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5084, 0.4916], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.6319, 0.3681], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5076, 0.4924], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5725, 0.4275], device='cuda:0', grad_fn=<DivBackward0>)]

In [27]:
# NOTE(review): same expression as the preceding cell. Its execution count (27)
# is lower than that cell's (40), so the saved output comes from an earlier
# kernel state — consider removing this duplicate.
[layer_out.importance for layer_out in moe_data]

[tensor([0.4910, 0.5090], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5436, 0.4564], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5078, 0.4922], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.4938, 0.5062], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5289, 0.4711], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.4715, 0.5285], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.4841, 0.5159], device='cuda:0', grad_fn=<DivBackward0>),
 tensor([0.5791, 0.4209], device='cuda:0', grad_fn=<DivBackward0>)]

In [47]:
# NOTE(review): duplicate of the importance display above. The saved output
# shows no device='cuda:0', so it was produced with the model on CPU in a
# different session — consider removing it or labelling which run it shows.
[layer_out.importance for layer_out in moe_data]

[tensor([0.3290, 0.6710], grad_fn=<DivBackward0>),
 tensor([0.3958, 0.6042], grad_fn=<DivBackward0>),
 tensor([0.4671, 0.5329], grad_fn=<DivBackward0>),
 tensor([0.3412, 0.6588], grad_fn=<DivBackward0>),
 tensor([0.4800, 0.5200], grad_fn=<DivBackward0>),
 tensor([0.5341, 0.4659], grad_fn=<DivBackward0>),
 tensor([0.7485, 0.2515], grad_fn=<DivBackward0>),
 tensor([0.2666, 0.7334], grad_fn=<DivBackward0>)]

In [43]:
# NOTE(review): another duplicate of the importance display. Its execution
# count (43) is out of order and the saved output is on CPU, so it cannot be
# reproduced by running the notebook top to bottom.
[layer_out.importance for layer_out in moe_data]

[tensor([0.3527, 0.6473], grad_fn=<DivBackward0>),
 tensor([0.4996, 0.5004], grad_fn=<DivBackward0>),
 tensor([0.5462, 0.4538], grad_fn=<DivBackward0>),
 tensor([0.3743, 0.6257], grad_fn=<DivBackward0>),
 tensor([0.5625, 0.4375], grad_fn=<DivBackward0>),
 tensor([0.5724, 0.4276], grad_fn=<DivBackward0>),
 tensor([0.4320, 0.5680], grad_fn=<DivBackward0>),
 tensor([0.3397, 0.6603], grad_fn=<DivBackward0>)]

In [18]:
# Display the module structure.
# NOTE(review): the saved output shows Embedding(4096, 512) and Dropout(p=0.2),
# which do not match the current `config` (vocab_size=8192, dropout=0.1). It
# appears to come from an earlier session — re-run before relying on it.
raw_model.transformer

GPT2(
  (transformer): ModuleDict(
    (wte): Embedding(4096, 512)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): RMSNorm((512,), eps=None, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=False)
          (c_proj): Linear(in_features=512, out_features=512, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): RMSNorm((512,), eps=None, elementwise_affine=True)
        (moe): MOE(
          (router): TopKRouter(
            (gate): Linear(in_features=512, out_features=2, bias=False)
            (noisy_gate): Linear(in_features=512, out_features=2, bias=False)
          )
          (experts): ModuleList(
            (0-1): 2 x ExpertNN(
              (fc1): Linear(in_features=512, out_features=1365, bias=True)
              (fc2): Linear(in_features=1365, ou