In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import random
import numpy as np


import math
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [20]:
import argparse
import json
from pathlib import Path
import random
import os
import schedulefree

import numpy as np
import torch
import wandb

import config
from data.utils import DataReader, get_dataset
import distributed
from models.utils import get_model
from optim.base import train
from optim.utils import cos_inf_schedule, wsd_schedule, get_batch

import sys
if 'ipykernel_launcher' in sys.argv[0]:
   sys.argv = sys.argv[:1]

def get_args():
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--config_format", default="base", choices=config.registered_formats()
    )
    args, rem_args = parser.parse_known_args()
    args.n_layer=3
    args.n_head=6
    args.n_embd=60
    args.multiple_of=1
    args.batch_size=1
    args.dtype = "float32"
    args.datasets_dir = "/chenyupeng/data_files/llm_datasets"
    return config.parse_args_with_format(
        format=args.config_format, base_parser=parser, args=rem_args, namespace=args
    )

In [21]:
args = get_args()

import copy
def get_data_readers(args, verbose=True):
    data_srcs = get_dataset(args)
    train_reader = DataReader(
        data_src=data_srcs["train"],
        batch_size=args.batch_size,
        sequence_length=args.sequence_length,
        seed=args.data_seed,
        with_replacement=False,
        auto_shard=True,
        keep_in_ram=args.data_in_ram,
    )
    val_reader = DataReader(
        data_src=data_srcs["val"],
        batch_size=args.batch_size,
        sequence_length=args.sequence_length,
        seed=args.data_seed,
        with_replacement=False,
        auto_shard=False,  # NOTE Identical Per Rank
        keep_in_ram=args.data_in_ram,
    )

    if verbose:
        print(f"Num training tokens: {train_reader.num_tokens}")
        print(f"Num validation tokens: {val_reader.num_tokens}")

    return {
        "train": train_reader,
        "val": val_reader,
    }
data = get_data_readers(args)


model = get_model(args)

/chenyupeng/data_files/llm_datasets/slimpajama6B/
Num training tokens: 5827933038
Num validation tokens: 9479563


In [10]:
model

Llama(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 100)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-2): 3 x LlamaBlock(
        (ln_1): RMSNorm()
        (attn): LlamaAttention(
          (c_attn): Linear(in_features=100, out_features=300, bias=False)
          (c_proj): Linear(in_features=100, out_features=100, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): LlamaMLP(
          (w1): Linear(in_features=100, out_features=266, bias=False)
          (w2): Linear(in_features=100, out_features=266, bias=False)
          (c_proj): Linear(in_features=266, out_features=100, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
  )
  (lm_head): Linear(in_features=100, out_features=50304, bias=False)
)

In [18]:
val_batches = []
data_reader = get_data_readers(args)["val"]
for _ in range(10):
    z_x, z_y = get_batch(data_reader, device="cuda")
    val_batches.append((z_x, z_y))
eval_batches = val_batches[0]  # 使用前10个batch评估

#set_seed(100)
#for i in range(eval_batches[0].shape[1]):
#    eval_batches[0][0,i].data.copy_(random.randint(0, 19))
#    if i>=1:
#        eval_batches[1][0,i] = eval_batches[0][0,i-1]
#eval_batches[1][0,-1] = random.randint(0, 19)
def compute_grad(model,eval_batches):
    model.train()
    total_loss = 0
    n_batches = 0
    # 清空梯度
    for p in model.parameters():
        p.grad = None
    
    # 梯度累积
    #for x, y in eval_batches:
    x = eval_batches[0]
    y = eval_batches[1]
    outputs = model(x, targets=y, get_logits=True)
    batch_loss = outputs["loss"]*1e4
    
    # 通过缩放损失实现梯度累积，相当于平均梯度
    batch_loss.backward()  # 梯度会累积

/chenyupeng/data_files/llm_datasets/slimpajama6B/
Num training tokens: 5827933038
Num validation tokens: 9479563


In [15]:
import torch.nn.functional as F
def get_hessian(model,eval_batches,a,r):
    compute_grad(model,eval_batches)
    grad_original = model.transformer.h[-1].mlp.c_proj.weight.grad.detach().clone()
    original_weight = model.transformer.h[-1].mlp.c_proj.weight.data.detach().clone()
    set_seed(42)
    random_phi = torch.randn_like(model.transformer.h[-1].mlp.c_proj.weight)
    for i in range(2000):
        #random_phi = random_phi/torch.norm(random_phi)
        model.transformer.h[-1].mlp.c_proj.weight.data.add_((random_phi/torch.norm(random_phi))*a)
        compute_grad(model,eval_batches)
        grad_after_pertu = model.transformer.h[-1].mlp.c_proj.weight.grad.data.detach().clone()
        random_phi = (1-r)*random_phi + (r/a)*(grad_after_pertu-grad_original)
        model.transformer.h[-1].mlp.c_proj.weight.data.copy_(original_weight)
        weight_norm_of_random = random_phi.norm()
        simi = F.cosine_similarity(grad_original.reshape(-1), (random_phi/torch.norm(random_phi)).reshape(-1), dim=0)
        print(f"{i}-th iteration, grad norm of phi: {weight_norm_of_random}, simi : {simi}")

    random_phi = random_phi/random_phi.norm()
    #cosine_simi = F.cosine_similarity(grad_original.reshape(-1), random_phi.reshape(-1), dim=0)
    return random_phi,simi

In [16]:
model = model.cuda()
model.eval()
phi,simi = get_hessian(model,eval_batches,0.1,0.1)

0-th iteration, grad norm of phi: 175.6248016357422, simi : 0.003978067077696323
1-th iteration, grad norm of phi: 158.19244384765625, simi : 0.003338780254125595
2-th iteration, grad norm of phi: 142.643310546875, simi : 0.0030107153579592705
3-th iteration, grad norm of phi: 128.86228942871094, simi : 0.0028320476412773132
4-th iteration, grad norm of phi: 116.85551452636719, simi : 0.002635692246258259
5-th iteration, grad norm of phi: 106.9105453491211, simi : 0.0016653588972985744
6-th iteration, grad norm of phi: 100.17897033691406, simi : -0.0013499893248081207
7-th iteration, grad norm of phi: 100.11438751220703, simi : -0.00937785767018795
8-th iteration, grad norm of phi: 113.89456176757812, simi : -0.023671213537454605
9-th iteration, grad norm of phi: 146.63888549804688, simi : -0.039449892938137054
10-th iteration, grad norm of phi: 192.316650390625, simi : -0.05132779851555824
11-th iteration, grad norm of phi: 241.82017517089844, simi : -0.05948349088430405
12-th iterati


KeyboardInterrupt



In [11]:
phi.shape

torch.Size([20, 53])

In [60]:
model.transformer.h[-1].mlp.c_proj.weight.shape

torch.Size([20, 53])

In [31]:
model.transformer.h[-1].mlp.c_proj.weight

Parameter containing:
tensor([[ 0.0006, -0.0034,  0.0140,  ...,  0.0186, -0.0074,  0.0134],
        [-0.0225,  0.0089, -0.0028,  ..., -0.0131,  0.0037, -0.0078],
        [ 0.0032,  0.0076, -0.0307,  ...,  0.0057,  0.0003, -0.0009],
        ...,
        [ 0.0147, -0.0062, -0.0018,  ...,  0.0036,  0.0254, -0.0131],
        [-0.0121,  0.0002,  0.0063,  ...,  0.0144,  0.0037,  0.0080],
        [-0.0209,  0.0114,  0.0203,  ..., -0.0019,  0.0037, -0.0361]],
       device='cuda:0', requires_grad=True)

In [32]:
for p in model.parameters():
    p.grad = None

In [6]:
model = model.cuda()
model.zero_grad(set_to_none = True)
model.eval()
x = eval_batches[0]
y = eval_batches[1]
outputs = model(x, targets=y, get_logits=True)
batch_loss = outputs["loss"]*1e4

In [84]:
parameters = [p for n,p in model.named_parameters() if "mlp.c_proj" in n]

In [85]:
first_order_grads = torch.autograd.grad(batch_loss, parameters, create_graph=True,retain_graph=True)

In [62]:
hessian = torch.zeros((parameters[0].numel(), parameters[0].numel()), device=parameters[0].device)

In [63]:
#first_order_grads = first_order_grads[0].view(-1)
#hessian_vector = torch.autograd.grad(first_order_grads, parameters, retain_graph=True)


for i in range(parameters[0].numel()):
    grad2 = torch.autograd.grad(first_order_grads[0].flatten()[i].double(), parameters, retain_graph=True)[0]
            
    if grad2 is not None:
        hessian[i, :] = grad2.flatten()
    else:
        print("none")

In [22]:
model.zero_grad(set_to_none = True)
model = model.cuda()
x = eval_batches[0]
y = eval_batches[1]
model.eval()
import time
def hessian_calculation(g_tensor, params):
    g_tensor = g_tensor.cuda()
    total_params = g_tensor.size(0)
    hessian_list = []
    t_d = time.time()
    for d in range(total_params):
        unit_vector = torch.zeros_like(g_tensor)
        unit_vector[d] = 1
        l = torch.sum(g_tensor * unit_vector)
        grad_2 = torch.autograd.grad(l, params[0], create_graph=True)
        #l.backward(retain_graph= True)
        hessian_row = []
        #print('name',name, param.grad)
        hessian_row.append(grad_2[0].double().data.clone())
        
        model.zero_grad(set_to_none = True)
        hessian_row = [g.flatten() for g in hessian_row] 
        hessian_row = [g.cpu() for g in hessian_row]
        hessian_row = torch.cat(hessian_row)
        #print('hessian_row', hessian_row)   
        hessian_list.append(hessian_row)
        # if d % 1000 == 0:
        #     print(f'Computing hessian: current batch = {batch_idx}/{self.num_batches}, current row of a hessian: {d}/{total_params}, total time = {time.time()- t_d} ')
    hessian = torch.stack(hessian_list, dim = 1)
    #print('hessian', hessian)   
    return hessian
full_hessian = 0
outputs = model(x, targets=y, get_logits=True)
batch_loss = outputs["loss"]*1e4
#batch_loss.backward(create_graph= True)
#g_list = []
#count = 0
parameters = [p for n,p in model.named_parameters() if "mlp.c_proj" in n]
#if parameters[0].requires_grad:
#    count += parameters[0].numel()
#    #print('g shape', param.grad , param.grad.shape)
#    g_list.append(torch.flatten(parameters[0].grad.double()))
#    #print('name',name, g_list[-1].size())
#g_tensor = torch.cat(g_list, dim = 0)
grad_para = torch.autograd.grad(batch_loss, parameters, create_graph=True,retain_graph=True)
g_tensor = torch.flatten(grad_para[0].double())
#print('g_tensor',g_tensor)
model.zero_grad(set_to_none = True)
H = hessian_calculation(g_tensor,parameters)
full_hessian += H
full_hessian = torch.nan_to_num(full_hessian, nan = 0, posinf = 0, neginf = 0 )  # change nan, postive inf , negative inf, to 0
t_svd = time.time()
#print('doing EVD')
# _, eigenvalues, _ = torch.linalg.svd(full_hessian)  # ascending
#eigenvalues, _  = torch.eig(full_hessian)
full_hessian = full_hessian.numpy().astype(np.float64)
full_hessian = (full_hessian + full_hessian.T)/2 # make symetric, to 



#avoid numerical issue
#full_hessian = full_hessian.cuda()
#eigenvalues, _  = torch.linalg.eig(full_hessian)
# eigenvalues, _  = np.linalg.eigh(full_hessian)
# #_, eigenvalues, _ = np.linalg.svd(full_hessian) 
# eigenvalues = [eigen.item().real for eigen in eigenvalues]
# file_name = self.file_dir + 'eigenvalues.txt'
# with open(file_name, "w") as file:
#     for item in eigenvalues:
#         file.write(str(item)+"\n")

In [14]:
409600 / 32768

12.5

In [15]:
1*512*4*2

4096

In [23]:
full_hessian = torch.tensor(full_hessian).cuda()

In [25]:
full_hessian.shape

torch.Size([9600, 9600])

In [9]:
1e10/1e

torch.Size([1, 512])

In [10]:
y.shape

torch.Size([1, 512])

In [10]:
full_hessian.shape

(5120, 5120)

In [9]:
u,v,d = torch.linalg.svd(torch.tensor(full_hessian).cuda((.double())

In [10]:
v

tensor([1.1715e+01, 9.1922e+00, 8.8347e+00,  ..., 6.7539e-07, 5.1691e-07,
        9.5747e-08], dtype=torch.float64)

In [11]:
F.cosine_similarity(phi.reshape(-1), d[0,:],dim=0)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [23]:
u,v,d = torch.linalg.svd(hessian.double())

AttributeError: 'function' object has no attribute 'double'

In [65]:
v

tensor([8.4398e+00, 7.2601e+00, 6.8293e+00,  ..., 4.8793e-07, 2.4023e-07,
        2.1420e-07], device='cuda:0', dtype=torch.float64)

In [102]:
torch.matmul(hessian, phi.reshape(-1))

tensor([-2.8252e-07,  3.4268e-06,  1.2450e-04,  ...,  5.5907e-05,
        -1.9799e-05,  4.9336e-05], device='cuda:0')

In [54]:
(torch.matmul(hessian, phi.reshape(-1))).norm()

tensor(7.2583, device='cuda:0')

In [55]:
(phi.reshape(-1)*7.122495174407959 - torch.matmul(hessian, phi.reshape(-1))).norm()

tensor(0.2283, device='cuda:0')

In [56]:
F.cosine_similarity(phi.reshape(-1), d[0,:],dim=0)

tensor(0.0005, device='cuda:0', dtype=torch.float64)

In [26]:
torch.matmul(hessian, phi.reshape(-1))

tensor([-3.2368e-07,  5.7056e-06,  1.2471e-05,  ...,  3.9668e-07,
         2.2853e-07, -1.4662e-06], device='cuda:0')

In [28]:
phi.reshape(-1)*8e-4

tensor([-1.6113e-05,  3.2472e-06,  1.0434e-05,  ..., -5.9264e-07,
         4.9677e-07, -2.0703e-06], device='cuda:0')

In [122]:
phi[:2,:10]

tensor([[-0.0012,  0.0051,  0.1549,  0.0139, -0.0057,  0.0166, -0.0391, -0.0140,
          0.0126,  0.0058],
        [ 0.0037, -0.0004, -0.0201, -0.0034, -0.0018,  0.0006, -0.0068, -0.0008,
         -0.0059,  0.0016]], device='cuda:0')

In [126]:
v[:7]

tensor([0.0011, 0.0010, 0.0009, 0.0008, 0.0008, 0.0008, 0.0007],
       device='cuda:0', dtype=torch.float64)