In [None]:
from typing import Any, Optional, Tuple

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.optim import Optimizer

Задача 1. RMSNorm

In [None]:
class MyRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Computes ``gamma * x / sqrt(mean(x ** 2, dim=-1) + eps)``.

    ``eps`` is added to the mean square *under* the square root, which is the
    formulation documented for ``torch.nn.RMSNorm``. Adding it to the RMS
    itself (``x / (rms + eps)``) gives a different result whenever ``eps`` is
    not negligible compared to the RMS.

    Args:
        dim: Size of the last dimension of the input; also the length of the
            learnable scale vector ``gamma``.
        eps: Constant added to the mean square for numerical stability.
    """

    def __init__(self, dim: int, eps: float = 1e-8):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(dim))  # learnable per-feature scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` of shape ``(..., dim)`` and apply the learned scale.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Mean of squares along the feature axis; keepdim so it broadcasts back.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        # eps under the root keeps an all-zero row finite and matches nn.RMSNorm.
        x_norm = x * torch.rsqrt(mean_sq + self.eps)
        return self.gamma * x_norm

In [None]:
def test_rmsnorm():
    """Compare ``MyRMSNorm`` with ``torch.nn.RMSNorm`` on a random input.

    Prints the L2 norm of the difference between the two outputs and whether
    they agree within ``atol=1e-6``.
    """
    torch.manual_seed(42)
    x = torch.randn(2, 5, 10)  # batch=2, seq_len=5, dim=10

    my_norm = MyRMSNorm(dim=10)
    # nn.RMSNorm falls back to the dtype's machine epsilon when eps is not
    # given; pass ours explicitly so both modules use the same constant.
    torch_norm = nn.RMSNorm(normalized_shape=10, eps=my_norm.eps)

    # Copy the scale so both modules use identical weights.
    with torch.no_grad():
        torch_norm.weight.copy_(my_norm.gamma)

    out_my = my_norm(x)
    out_torch = torch_norm(x)

    print("Разница (L2 norm):", (out_my - out_torch).norm().item())
    print("Совпадают ли:", torch.allclose(out_my, out_torch, atol=1e-6))

In [None]:
# Run the RMSNorm comparison; it prints the L2 difference and the allclose verdict.
test_rmsnorm()

Разница (L2 norm): 1.1754623301385436e-06
Совпадают ли: True


Задача 2. AutoGrad

In [None]:
class ExpCosFunction(Function):
    """Custom autograd function computing ``f(x, y) = exp(x) + cos(y)`` element-wise."""

    @staticmethod
    def forward(ctx, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Return ``exp(x) + cos(y)`` and stash the inputs for ``backward``.

        Args:
            ctx: Autograd context used to pass tensors to ``backward``.
            x: Input to the exponential term.
            y: Input to the cosine term.
        """
        # Both partial derivatives are functions of the inputs, so save them.
        ctx.save_for_backward(x, y)
        return torch.exp(x) + torch.cos(y)

    @staticmethod
    def backward(
        ctx, grad_output: torch.Tensor
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Propagate ``grad_output`` to the inputs.

        Args:
            ctx: Context holding the tensors saved by ``forward``.
            grad_output: Gradient of the loss with respect to the output.

        Returns:
            ``(dx, dy)`` where ``dx = exp(x) * grad_output`` and
            ``dy = -sin(y) * grad_output``. An entry is ``None`` when the
            corresponding input does not require a gradient.
        """
        x, y = ctx.saved_tensors
        needs_dx, needs_dy = ctx.needs_input_grad

        # Skip the work for inputs autograd will not use a gradient for.
        dx = torch.exp(x) * grad_output if needs_dx else None       # df/dx = e^x
        dy = -torch.sin(y) * grad_output if needs_dy else None      # df/dy = -sin(y)

        return dx, dy

In [None]:
def test_exp_cos_autograd():
    """Check ``ExpCosFunction`` gradients against built-in autograd and print the comparison."""
    torch.manual_seed(42)

    # Leaf inputs; x is sampled before y.
    x = torch.randn(4, 5).requires_grad_()
    y = torch.randn(4, 5).requires_grad_()

    def input_grads(fn):
        """Backpropagate ``fn(x, y).sum()`` from cleared grads; return copies of (x.grad, y.grad)."""
        x.grad = None
        y.grad = None
        fn(x, y).sum().backward()
        return x.grad.clone(), y.grad.clone()

    # Reference: the same expression built from differentiable torch ops.
    grad_x_ref, grad_y_ref = input_grads(lambda a, b: torch.exp(a) + torch.cos(b))
    # Candidate: the hand-written Function.
    grad_x_custom, grad_y_custom = input_grads(ExpCosFunction.apply)

    # Report agreement first, then the raw L2 distances.
    dx_matches = torch.allclose(grad_x_ref, grad_x_custom, atol=1e-6)
    dy_matches = torch.allclose(grad_y_ref, grad_y_custom, atol=1e-6)
    dx_gap = (grad_x_ref - grad_x_custom).norm().item()
    dy_gap = (grad_y_ref - grad_y_custom).norm().item()

    print("dx совпадает:", dx_matches)
    print("dy совпадает:", dy_matches)
    print("Разница L2 (dx):", dx_gap)
    print("Разница L2 (dy):", dy_gap)

In [None]:
# Run the gradient check; it prints whether dx/dy match and their L2 differences.
test_exp_cos_autograd()

dx совпадает: True
dy совпадает: True
Разница L2 (dx): 0.0
Разница L2 (dy): 0.0


Задача 3. Lion

In [None]:
class Lion(Optimizer):
    """Lion optimizer: a sign-of-momentum update with weight decay on the parameters.

    For each parameter ``p`` with gradient ``g`` and momentum buffer ``m``::

        c = beta1 * m + (1 - beta1) * g
        p <- p - lr * (sign(c) + weight_decay * p)
        m <- beta2 * m + (1 - beta2) * g

    The weight-decay term is added after the sign is taken, so it acts on the
    parameters directly instead of being folded into the gradient.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Learning rate.
        betas: ``(beta1, beta2)``; ``beta1`` blends gradient and momentum for
            the update direction, ``beta2`` is the momentum decay.
        weight_decay: Weight-decay coefficient.

    Raises:
        ValueError: If ``lr`` or ``weight_decay`` is negative, or a beta is
            outside ``[0, 1)``.
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.99),
        weight_decay=0.0,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        beta1, beta2 = betas
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {beta2}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was given.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad; the closure needs autograd to call backward().
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            beta1, beta2 = group['betas']
            weight_decay = group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]

                # Lazily create the momentum buffer on first use.
                if 'exp_avg' not in state:
                    state['exp_avg'] = torch.zeros_like(p)

                exp_avg = state['exp_avg']

                # Update direction: sign of the beta1-interpolated gradient/momentum.
                update = (1 - beta1) * grad + beta1 * exp_avg
                update = torch.sign(update)

                if weight_decay != 0:
                    update = update + weight_decay * p

                p.add_(update, alpha=-lr)

                # Momentum is refreshed after the parameter update, with beta2.
                exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)

        return loss

In [None]:
def test_lion_optimizer():
    """Fit one linear layer to random data with ``Lion`` for 100 steps, printing the loss every 10th epoch."""
    torch.manual_seed(0)

    # Use the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = nn.Sequential(nn.Linear(10, 1)).to(device)

    # Inputs and targets are sampled first and moved to the model's device afterwards.
    x = torch.randn(100, 10).to(device)
    y = torch.randn(100, 1).to(device)

    optimizer = Lion(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(100):
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

        # Report only on epochs 0, 10, 20, ...
        if epoch % 10:
            continue
        print(f"Epoch {epoch} | Loss: {loss.item():.4f}")

In [None]:
# Run the Lion smoke test; it prints the training loss every 10 epochs.
test_lion_optimizer()

Epoch 0 | Loss: 1.0317
Epoch 10 | Loss: 1.0078
Epoch 20 | Loss: 0.9856
Epoch 30 | Loss: 0.9651
Epoch 40 | Loss: 0.9463
Epoch 50 | Loss: 0.9291
Epoch 60 | Loss: 0.9135
Epoch 70 | Loss: 0.8997
Epoch 80 | Loss: 0.8875
Epoch 90 | Loss: 0.8770
