In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import random
import numpy as np


import math
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, PyTorch's
    CPU generator and all CUDA generators with the same ``seed``, then sets
    the cuDNN ``deterministic`` flag and clears the ``benchmark`` flag.

    Args:
        seed: integer seed handed unchanged to each library.
    """
    # Pure-Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator first, then every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: no autotuned kernel selection, deterministic algorithms only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [4]:
import argparse
import json
from pathlib import Path
import random
import os
import schedulefree

import numpy as np
import torch
import wandb

import config
from data.utils import DataReader, get_dataset
import distributed
from models.utils import get_model
from optim.base import train
from optim.utils import cos_inf_schedule, wsd_schedule, get_batch

import sys
# When the process was started by ipykernel, keep only argv[0] so that the
# kernel's own command-line flags are not seen by later sys.argv consumers
# (presumably the argparse parser built in get_args below).
if 'ipykernel_launcher' in sys.argv[0]:
   sys.argv = sys.argv[:1]

def get_args():
    """Parse the run configuration, pre-seeding a few fields for this notebook.

    A minimal parser first extracts ``--config_format``; the remaining
    command-line tokens are forwarded to ``config.parse_args_with_format``
    together with the partially filled namespace.

    Returns:
        Whatever ``config.parse_args_with_format`` returns for the selected
        format (presumably the fully populated argument namespace).
    """
    base_parser = argparse.ArgumentParser(allow_abbrev=False)
    base_parser.add_argument(
        "--config_format",
        default="base",
        choices=config.registered_formats(),
    )
    known, leftover = base_parser.parse_known_args()

    # Values placed on the namespace before the format-specific parser runs.
    # NOTE(review): whether these survive depends on how
    # config.parse_args_with_format treats attributes already present on
    # `namespace` -- confirm against the config module.
    preset = {
        "n_layer": 24,
        "n_head": 16,
        "n_embd": 1024,
        "datasets_dir": "/chenyupeng/data_files/llm_datasets",
    }
    for field, value in preset.items():
        setattr(known, field, value)

    return config.parse_args_with_format(
        format=known.config_format,
        base_parser=base_parser,
        args=leftover,
        namespace=known,
    )

In [5]:
# Resolve the run configuration once; later cells read it through `args`.
args = get_args()

import copy
def get_data_readers(args, verbose=True):
    """Create the training and validation ``DataReader`` objects for ``args``.

    Args:
        args: configuration object; this function reads ``batch_size``,
            ``sequence_length``, ``data_seed`` and ``data_in_ram`` from it and
            passes the whole object to ``get_dataset``.
        verbose: when true, print the ``num_tokens`` of each reader.

    Returns:
        A dict with keys ``"train"`` and ``"val"`` mapping to the two readers.
    """
    sources = get_dataset(args)

    # Keyword arguments that are identical for both splits.
    shared = dict(
        batch_size=args.batch_size,
        sequence_length=args.sequence_length,
        seed=args.data_seed,
        with_replacement=False,
        keep_in_ram=args.data_in_ram,
    )

    readers = {
        "train": DataReader(data_src=sources["train"], auto_shard=True, **shared),
        # Validation is built with auto_shard=False (original note: identical
        # per rank).
        "val": DataReader(data_src=sources["val"], auto_shard=False, **shared),
    }

    if verbose:
        print(f"Num training tokens: {readers['train'].num_tokens}")
        print(f"Num validation tokens: {readers['val'].num_tokens}")

    return readers
# Instantiate the train/val readers for the parsed configuration
# (prints the token counts because verbose defaults to True).
data = get_data_readers(args)


# Build the model from the same configuration via the project's get_model.
model = get_model(args)

/chenyupeng/data_files/llm_datasets/slimpajama6B/
Num training tokens: 5827933038
Num validation tokens: 9479563


In [9]:
# Number of validation batches drawn and used for evaluation.
NUM_EVAL_BATCHES = 10

# A fresh reader is built on every run of this cell, so the cell does not
# depend on how far a previously created reader has already been consumed.
val_batches = []
data_reader = get_data_readers(args)["val"]
while len(val_batches) < NUM_EVAL_BATCHES:
    x, y = get_batch(data_reader, device="cuda")
    val_batches.append((x, y))

# Evaluate on the first NUM_EVAL_BATCHES batches.
eval_batches = val_batches[:NUM_EVAL_BATCHES]


def compute_grad(model, eval_batches):
    """Accumulate the gradient of the batch-averaged loss into every ``p.grad``.

    All parameter gradients are reset first. Each batch's loss is then scaled
    by ``1 / len(eval_batches)`` and back-propagated on its own, so that after
    the loop ``p.grad`` holds the gradient of the mean loss over
    ``eval_batches``.

    Args:
        model: module called as ``model(x, targets=y, get_logits=True)`` whose
            result is indexable with ``"loss"``; it is switched to train mode.
        eval_batches: sized sequence of ``(x, y)`` pairs.

    Returns:
        The mean loss over ``eval_batches`` as a Python float (``0.0`` when
        ``eval_batches`` is empty). Existing callers may ignore it.
    """
    model.train()

    # Drop stale gradients so this call starts from a clean slate.
    for p in model.parameters():
        p.grad = None

    num_batches = len(eval_batches)
    mean_loss = 0.0

    # Gradient accumulation: backward() adds into p.grad on every call.
    for x, y in eval_batches:
        outputs = model(x, targets=y, get_logits=True)
        scaled_loss = outputs["loss"] / num_batches

        # Back-propagate this batch alone. Summing the losses into a running
        # tensor and calling backward() on the sum would walk through the
        # already-freed graphs of earlier batches again and keep every graph
        # alive for the whole loop.
        scaled_loss.backward()
        mean_loss += scaled_loss.item()

        # Release the outputs (including any returned logits) before the next
        # forward pass allocates its own.
        del outputs, scaled_loss

    return mean_loss

/chenyupeng/data_files/llm_datasets/slimpajama6B/
Num training tokens: 5827933038
Num validation tokens: 9479563


In [10]:
model

Llama(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x LlamaBlock(
        (ln_1): RMSNorm()
        (attn): LlamaAttention(
          (c_attn): Linear(in_features=1024, out_features=3072, bias=False)
          (c_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): LlamaMLP(
          (w1): Linear(in_features=1024, out_features=2816, bias=False)
          (w2): Linear(in_features=1024, out_features=2816, bias=False)
          (c_proj): Linear(in_features=2816, out_features=1024, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
  )
  (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
)

In [12]:
def get_hessian(model, eval_batches, a, r, n_iters=100, seed=42):
    """Iteratively refine a direction ``phi`` for ``model.lm_head.weight``.

    Starting from a Gaussian ``phi``, every iteration perturbs the weight by
    ``a * phi / ||phi||``, recomputes the gradient with ``compute_grad`` and
    updates::

        phi <- (1 - r) * phi + (r / a) * (grad_perturbed - grad_original)

    The difference quotient is a finite-difference estimate along ``phi``
    (presumably meant as a Hessian-vector product, given the function name).
    The weight is restored to its original value after each iteration, also
    when ``compute_grad`` raises.

    Args:
        model: model exposing ``lm_head.weight``; passed to ``compute_grad``.
        eval_batches: batches forwarded unchanged to ``compute_grad``.
        a: magnitude of the weight perturbation.
        r: mixing weight of the update, as in the formula above.
        n_iters: number of refinement iterations (default 100).
        seed: value passed to ``set_seed`` before ``phi`` is sampled
            (default 42).

    Returns:
        The final ``phi`` tensor, shaped like ``model.lm_head.weight``.
    """
    weight = model.lm_head.weight

    # Reference gradient at the unperturbed weights. The gradient lives on the
    # parameter (`weight.grad`), not on the Linear module itself.
    compute_grad(model, eval_batches)
    grad_original = weight.grad.detach().clone()
    original_weight = weight.detach().clone()

    set_seed(seed)
    # Sample the starting direction once, so each iteration refines the
    # previous estimate instead of discarding it for a fresh random draw.
    random_phi = torch.randn_like(weight)

    for i in range(n_iters):
        with torch.no_grad():
            weight.add_((random_phi / torch.norm(random_phi)) * a)
        try:
            compute_grad(model, eval_batches)
            grad_after_pertu = weight.grad.detach().clone()
        finally:
            # Always undo the perturbation, even if the gradient pass fails
            # (for example with a CUDA out-of-memory error).
            with torch.no_grad():
                weight.copy_(original_weight)

        random_phi = (1 - r) * random_phi + (r / a) * (grad_after_pertu - grad_original)
        weight_norm_of_random = random_phi.norm()
        print(f"{i}-th iteration, grad norm of phi: {weight_norm_of_random}")

    return random_phi

In [14]:
# Move the model to the GPU; the evaluation batches were created on "cuda".
model = model.cuda()

# `a`: magnitude of the lm_head weight perturbation inside get_hessian.
PERTURBATION_SCALE = 1e-2
# `r`: mixing weight of get_hessian's phi update.
MIXING_RATE = 1e-2

phi = get_hessian(model, eval_batches, PERTURBATION_SCALE, MIXING_RATE)

OutOfMemoryError: CUDA out of memory. Tried to allocate 276.00 MiB. GPU 0 has a total capacity of 10.57 GiB of which 222.00 MiB is free. Including non-PyTorch memory, this process has 10.35 GiB memory in use. Of the allocated memory 10.16 GiB is allocated by PyTorch, and 8.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)