In [31]:
import torch
a = torch.tensor([[1, 2, 3],[4,5,6],[7,8,9]])
a.stride(0)
a.stride(1)


1

In [32]:

def same_storage(a, b):
    return a.untyped_storage().data_ptr() == b.untyped_storage().data_ptr()
b = a[0]
print(b)
print(same_storage(a, b))
c = torch.tensor([1,2,3])
print(same_storage(a, c))

view_a = a.view(1,9)
print(same_storage(a, view_a))


tensor([1, 2, 3])
True
False
True


张量的转置会导致不连续，这个很好想，使用contiguous可以重新排序，但是copy了张量

In [33]:
x= torch.tensor([[1,2,3],[4,5,6]])
y = x.t().contiguous()
print(x.is_contiguous())
print(y.is_contiguous())
same_storage(x, y)

True
True


False

torch einops 的操作

In [34]:
from jaxtyping import Float
import torch
from einops import einsum
x: Float[torch.Tensor, "batch seq1 hidden"] = torch.randn(2, 3, 4)
y: Float[torch.Tensor, "batch seq2 hidden"] = torch.randn(2, 3, 4)


z = x @ y.transpose(-1, -2)  # (batch, seq1, seq2)
print(z.shape)  # (2, 3, 3)

z_einsum = einsum(x, y, "batch seq1 hideen, batch seq2 hidden -> batch seq1 seq2")



torch.Size([2, 3, 3])


这里学习的是einops里面的reduce操作

In [35]:


from einops import reduce
x :Float[torch.Tensor,"batch seq hidden"] = torch.ones(3,4,5)
print(x.shape)
mean_1 = x.sum(dim=-1)
mean_2 = reduce(x,"... hid -> ...", "sum")
print(mean_1)
print(mean_2)

torch.Size([3, 4, 5])
tensor([[5., 5., 5., 5.],
        [5., 5., 5., 5.],
        [5., 5., 5., 5.]])
tensor([[5., 5., 5., 5.],
        [5., 5., 5., 5.],
        [5., 5., 5., 5.]])


这里学习的是rearrange的操作

In [39]:
from einops import rearrange
w: Float[torch.Tensor,' batch seq hidden'] = torch.ones(3,4,768)
w = rearrange(w,'... (head head_dim) -> ... head head_dim', head=12)
print(w.shape)

torch.Size([3, 4, 12, 64])


下面来学习一下tensor的flops：

In [58]:

import torch
import timeit

_4060Ti_FP32_TFLOPS = 22.06e+12  # 22.06 TFLOPS
# 计算矩阵乘法的FLOPS
def matmul_flops(m, n, p):
    return 2 * m * n * p  # 2 * (m * n * p) 次浮点运算  

x : Float[torch.Tensor, "m n"] = torch.randn(10, 10, device='cuda')
y : Float[torch.Tensor, "n p"] = torch.randn(10, 10, device='cuda')

def run(a, b):
    a @ b

    if torch.cuda.is_available():
        torch.cuda.synchronize()

t = timeit.timeit(lambda: run(x, y), number=10) / 10  # 平均每次运行时间
print(f"Time taken for 10 runs: {t:.6f} seconds")

print(f"FLOPS: {matmul_flops(1024, 512, 2048) / t:.2f}  | {_4060Ti_FP32_TFLOPS:.2f} FLOPS | MFU : {matmul_flops(1024, 512, 2048) / t / _4060Ti_FP32_TFLOPS:.2f}")


Time taken for 10 runs: 0.000122 seconds
FLOPS: 17580521874086.14  | 22060000000000.00 FLOPS | MFU : 0.80


Xavier 初始化（Glorot Initialization 就是为了解决在前向传播过程中，方差爆炸的问题，这个问题本质是输入维度对于生成值产生了印象，所以处以维度进行归一化

In [60]:
import torch.nn as nn
import numpy as np
input_dim = 512
hidden_dim = 2048
# To be extra safe, we truncate the normal distribution to [-3, 3].
w = nn.Parameter(nn.init.trunc_normal_(torch.empty(input_dim, hidden_dim), std=1/np.sqrt(input_dim)))


In [63]:
import numpy as np


origin_data = np.arange(10,dtype=np.int32)
origin_data.tofile('data.npy')


data = np.memmap('data.npy', dtype=np.int32, mode='r', shape=(10,))
print(data)

[0 1 2 3 4 5 6 7 8 9]


In [80]:

import torch.nn as nn
import torch
import numpy as np
import random
from typing import Iterable
import torch.nn.functional as F
from copy import deepcopy

#  realize the model
def get_num_parameters(model:nn.Module):
   
   return sum(p.numel() for p in model.parameters())
    
class Linear(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.rand((input_dim,output_dim)) / np.sqrt(input_dim))
    
    def forward(self, x : torch.Tensor) -> torch.Tensor:
        out = x @ self.weight
        return out   
        
        
class Cruncher(nn.Module):
    def __init__(self, dim:int, num_layers:int =2):
        super().__init__()
        self.layers = nn.ModuleList([
            Linear(dim, dim)
            for i in range(num_layers)
        ])
        self.final = Linear(dim, 1)
        
        
    def forward(self, x: torch.Tensor)-> torch.Tensor:
        for layer in self.layers:
            x = layer(x)
        out = self.final(x)
        
        out = out.squeeze(-1)
        return out
        

# 定义adagrad优化器 
class AdaGrad(torch.optim.Optimizer):
    def __init__(self, params: Iterable[nn.Parameter], lr :float = 0.01):
        super(AdaGrad, self).__init__(params, dict(lr = lr))
        
    def step(self):
        for group in self.param_groups:
            lr = group["lr"]
            for p in group["params"]:
                
                # optimize state 
                state = self.state[p]
                grad = p.grad.data
                
                # # Get squared gradients g2 
                g2 = state.get('g2', torch.zeros_like(grad))
                
                # update optimizer state
                g2 += torch.square(grad)
                state['g2'] = g2
                # update parameters
                p.data -= lr * grad / torch.sqrt(g2 + 1e-5)



# 初始化模型
def custom_model(B,D):
    seed = 2025
    # three places
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    
    device = torch.device("cuda:0")
    
    
    num_layers = 2
    model = Cruncher(dim = D, num_layers=num_layers)
    
    param_sizes = [
        (name, param.numel())
        for name, param in model.state_dict().items()
    ]
    
    print(param_sizes)
    
    # generate some fake data
    model = model.to(device)
    x = torch.randn(B, D, device= device)
    y = model(x)
    assert y.size() == torch.Size([B])
    
    
    num_parameters = get_num_parameters(model)
    
    # assert num_parameters == num_layers * (D * D) + D


    return model
    
    
def optimizer():
    B = 2
    D = 64
    model = custom_model(B=B, D=D).to(torch.device("cuda:0"))
    optimizer = AdaGrad(model.parameters(), lr = 0.01)
    state = deepcopy(model.state_dict())

    
    optimizer.zero_grad()
    x = torch.randn(B, D, device=torch.device("cuda:0"))
    label = torch.tensor([3.2, 5], dtype=torch.float32, device = torch.device("cuda:0"))
    
    y = model(x)
    
    loss = F.mse_loss(input = y, target = label)
    print(f"loss   is : {loss.item()}")
    loss.backward()
    
    
    optimizer.step()
    optimizer.zero_grad()
    
    new_state = model.state_dict()
    assert state.keys() == new_state.keys()
    for k in state.keys():
        print(f"old state is {state[k]}")
        print(f"new state is {new_state[k]}")
        
        assert not torch.equal(state[k], new_state[k])

optimizer()
    

[('layers.0.weight', 4096), ('layers.1.weight', 4096), ('final.weight', 64)]
loss   is : 3.1030330657958984
old state is tensor([[0.0856, 0.1169, 0.0363,  ..., 0.1039, 0.0838, 0.0706],
        [0.0455, 0.1041, 0.0479,  ..., 0.0307, 0.0349, 0.0337],
        [0.0963, 0.0055, 0.1020,  ..., 0.0699, 0.1210, 0.0122],
        ...,
        [0.0649, 0.0155, 0.1231,  ..., 0.0144, 0.0966, 0.1246],
        [0.0433, 0.0471, 0.0882,  ..., 0.0896, 0.0504, 0.0154],
        [0.1087, 0.0594, 0.1148,  ..., 0.0792, 0.1116, 0.1016]],
       device='cuda:0')
new state is tensor([[ 0.0757,  0.1070,  0.0263,  ...,  0.0941,  0.0739,  0.0607],
        [ 0.0555,  0.1141,  0.0579,  ...,  0.0407,  0.0449,  0.0437],
        [ 0.0863, -0.0045,  0.0920,  ...,  0.0599,  0.1110,  0.0022],
        ...,
        [ 0.0549,  0.0055,  0.1131,  ...,  0.0044,  0.0866,  0.1146],
        [ 0.0533,  0.0571,  0.0982,  ...,  0.0996,  0.0604,  0.0254],
        [ 0.0988,  0.0494,  0.1048,  ...,  0.0692,  0.1016,  0.0916]],
       dev