In [3]:
import torch
from typing import Tuple, List, Callable
import torch
from torch import Tensor
import matplotlib.pyplot as plt


m=20
n=10

In [4]:
W=torch.rand((m,n))

In [5]:
relu=torch.nn.ReLU()

In [6]:
z=torch.rand((n))
b=torch.rand((n))
c=torch.rand((m))

In [7]:
W@relu(z+b)+c

tensor([4.7856, 5.1820, 3.8472, 6.0928, 7.6088, 5.9614, 7.5286, 6.7736, 5.4397,
        6.3900, 7.0223, 6.1708, 6.6048, 4.7607, 4.6629, 6.8944, 5.9208, 3.9657,
        7.8036, 7.6734])

In [8]:
import torch

x = torch.tensor([2.0], requires_grad=True)  # Pick x > 0 to avoid the ReLU non-smooth point
f = torch.relu(x)

# First derivative
df_dx = torch.autograd.grad(f, x, grad_outputs=torch.ones_like(f),create_graph=True)[0]

# Second derivative
d2f_dx2 = torch.autograd.grad(df_dx, x, grad_outputs=torch.ones_like(f),create_graph=True)[0]

# Third derivative
d3f_dx3 = torch.autograd.grad(d2f_dx2, x, grad_outputs=torch.ones_like(f),create_graph=True)[0]

print("f:", f.item())
print("First derivative:", df_dx.item())
print("Second derivative:", d2f_dx2.item())
print("Third derivative:", d3f_dx3.item())


f: 2.0
First derivative: 1.0
Second derivative: 0.0
Third derivative: 0.0


In [9]:
import torch
import torch.nn.functional as F


In [10]:
import torch
import torch.nn as nn

class LlamaMLP(nn.Module):
    def __init__(self, hidden_size=4096, intermediate_size=11008):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.activation = nn.SiLU()

    def forward(self, x):
        x_gate = self.activation(self.gate_proj(x))       # [B, I]
        x_up = self.up_proj(x)            # [B, I]
        x_inner_prod =  x_gate * x_up  # [B, I]
        output = self.down_proj(x_inner_prod)          # [B, H]
        return output

    def call(self,x):
        return self.forward(x)

In [11]:
import unittest
import torch

class TestMatrixDifference(unittest.TestCase):
    def test_matrix_difference(self):
        # Example matrices
        A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
        B = torch.tensor([[1.0, 2.0001], [3.0, 4.0]])

        # Using torch.testing.assert_close for detailed comparison
        torch.testing.assert_close(A, B, rtol=1e-3, atol=1e-4)

# Run the tests in the notebook
suite = unittest.TestLoader().loadTestsFromTestCase(TestMatrixDifference)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)


test_matrix_difference (__main__.TestMatrixDifference.test_matrix_difference) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.005s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [12]:
def taylor_series_components(
    W: Tensor,
    b: Tensor,
    c: Tensor,
    f: Callable[[Tensor], Tensor],
    iters: int = 10
) -> Tuple[List[Tensor], Tensor]:
    """
    Compute components for a Taylor series expansion with a neural activation.

    Args:
        W (Tensor): Weight matrix of shape (m, n).
        b (Tensor): Bias vector of shape (n,).
        c (Tensor): Coefficient vector of shape (m,).
        f (Callable[[Tensor], Tensor]): Activation function.
        iters (int, optional): Number of iterations. Defaults to 10.

    Returns:
        Tuple[List[Tensor], Tensor]: A tuple containing
            - a list of weight matrices modified by x_0 and c_i
            - the initial vector x_0
    """
    #this taylor series approximates W @ f(x+b)

    m, n = W.size()
    results: List[Tensor] = []

    # Placeholder logic for demonstration
    x0 = torch.rand((n), requires_grad=True)  # or some initialization, apparently b is supposed to be used here?

    tensors:torch.Tensor = []

    activated_gradient_iterative = f(x0)

    # calculate \theta_i_0 = W_i ⊙ f^0(x0) / 0! + c[i]/n
    matrix = []



    for i in range(m):
      matrix.append(( ((W[i] * activated_gradient_iterative) / 1)+c[i]/n).detach() )
    activated_gradient_iterative = torch.autograd.grad(activated_gradient_iterative, x0, grad_outputs=torch.ones_like(activated_gradient_iterative), create_graph=True)[0]

    tensors.append(torch.stack(matrix,dim=0))


    for k in range(1,iters):
        matrix = []


        for i in range(m):
            matrix.append( ((W[i] * activated_gradient_iterative) / torch.exp(torch.lgamma(torch.tensor(k+1)))).detach() )
        activated_gradient_iterative = torch.autograd.grad(activated_gradient_iterative, x0, grad_outputs=torch.ones_like(activated_gradient_iterative), create_graph=True)[0]

        tensors.append(torch.stack(matrix,dim=0))


    return torch.stack(tensors, dim=0), x0


In [13]:
m, n = (1,1)
W = torch.rand((m,n))
b = torch.rand((n))
c = torch.rand((m))
f = torch.nn.SiLU()


In [14]:
M, x0 = taylor_series_components(W,b,c,f,iters=10)

In [15]:
"""def taylor_series_estimate(x:torch.Tensor,M:torch.Tensor,x0:torch.Tensor):

  M (iters ,m ,n)
  x0 (,n)
  x (,n)



  sum(for n,W in enumerate(M)
    W @ (x-x0)**n)"""

def taylor_series_estimate0(x: torch.Tensor, M: torch.Tensor, x0: torch.Tensor) -> torch.Tensor:
  dx = x - x0                       # shape (n,)
  powers = torch.stack([dx**n for n in range(M.shape[0])])  # shape (iters, n)
  # batch matrix multiplication: (iters, m, n) @ (iters, n, 1) -> (iters, m, 1)
  terms = torch.bmm(M, powers.unsqueeze(-1))  # (iters, m, 1)
  return terms.sum(dim=0).squeeze(-1)         # (m,)

def taylor_series_estimate_batched_einsum(x: torch.Tensor, M: torch.Tensor, x0: torch.Tensor) -> torch.Tensor:
    """
    Taylor series estimation using einsum for cleaner batched operations.
    
    Args:
        x: Input tensor, shape (batch_size, n) or (n,)
        M: Coefficient matrices, shape (iters, m, n)
        x0: Expansion point, shape (n,) or broadcastable to x
    
    Returns:
        Estimated values, shape (batch_size, m) or (m,)
    """
    # Ensure x is at least 2D for consistent handling
    if x.dim() == 1:
        x = x.unsqueeze(0)  # (1, n)
        squeeze_output = True
    else:
        squeeze_output = False
    
    dx = x - x0  # shape (batch_size, n)
    
    # Compute powers: (batch_size, n) -> (iters, batch_size, n)
    powers = torch.stack([dx**k for k in range(M.shape[0])])
    
    # Einstein summation: sum over iterations and n dimension
    # 'imn,ibn->bm' means: (iters,m,n) * (iters,batch,n) -> (batch,m)
    result = torch.einsum('imn,ibn->bm', M, powers)
    
    if squeeze_output:
        result = result.squeeze(0)  # (m,)
    
    return result

import torch

def taylor_series_estimate2(x: torch.Tensor, M: torch.Tensor, x0: torch.Tensor) -> torch.Tensor:
    """
    Approximates a function using Taylor series expansion.

    Args:
        x:  Tensor of shape (..., n)
        M:  Tensor of shape (iters, m, n) where each M[k] is the coefficient matrix for order k
        x0: Tensor of shape (n,)

    Returns:
        Tensor of shape (..., m) representing the Taylor approximation.
    """
    dx = x - x0
    result = torch.zeros(M.shape[1], device=x.device, dtype=x.dtype)

    # Use enumerate over M for matrix multiplications
    for n, W in enumerate(M):
        result = result + W @ (dx ** n)
        #print(f"Calculating W_{n} @ dx ** {n} = {W.item():.2f}@{dx.item() ** n:.2f}")

    return result



In [16]:
x = x0
x=torch.rand((n))
a = taylor_series_estimate0(x, M[:7],x0)
b = taylor_series_estimate_batched_einsum(x, M[:1],x0)
print(a,b)

tensor([1.0086], grad_fn=<SqueezeBackward1>) tensor([1.2247], grad_fn=<SqueezeBackward1>)


In [17]:
(W @ f(x))+c


tensor([1.0086])

In [18]:
sample_count = 5
X = torch.rand((sample_count,n))
y = taylor_series_estimate_batched_einsum(X, M,x0)

print(y.shape)

torch.Size([5, 1])


In [19]:
X_with_bias = torch.cat([f(X), torch.ones(sample_count, 1)], dim=1)
solution = torch.linalg.pinv(X_with_bias) @ y
A = solution[:-1, :]  
b = solution[-1, :]   

print(f"A shape: {A.shape}")  # (n, m)
print(f"b shape: {b.shape}")  # (m,)

# Verify the fit
y_pred = f(X) @ A + b
mse = torch.mean((y - y_pred)**2)
print(f"Mean squared error: {mse.item():.6f}")

print(f"\nA =\n{A}")
print(f"\nb = {b}")


A shape: torch.Size([1, 1])
b shape: torch.Size([1])
Mean squared error: 0.000000

A =
tensor([[0.7732]], grad_fn=<SliceBackward0>)

b = tensor([0.8172], grad_fn=<SliceBackward0>)


In [20]:


# SVD
print(c)

U, S, V = torch.linalg.svd(c)

print(S.shape)

# Plot singular values
plt.figure()
plt.plot(S.numpy(), marker="o")
plt.xlabel("Index")
plt.ylabel("Singular Value")
plt.title("Singular Values of c")
plt.grid(True)
plt.yscale('log')
plt.show()

tensor([0.8172])


RuntimeError: linalg.svd: The input tensor A must have at least 2 dimensions.

In [None]:
d_in, d_out = 10,100
samples = 1000

x = torch.rand(d_out,d_in)#10,100
c_vec = torch.rand((d_out))

samples_list = []
for i in range(samples):
  samples_list.append(x@torch.rand((d_in)))

samples_list = torch.stack(samples_list)
c = torch.stack([c_vec]*samples)

W_plus_c = samples_list + c

# SVD
U, S, V = torch.linalg.svd(W_plus_c)

print(S.shape)

# Plot singular values
plt.figure()
plt.plot(S.numpy(), marker="o")
plt.xlabel("Index")
plt.ylabel("Singular Value")
plt.title("Singular Values of W_plus_c")
plt.grid(True)
plt.yscale('log')
plt.show()


In [None]:
W, c

In [None]:

# Solve least squares: (X^T X)^(-1) X^T y
# X_with_bias: (sample_count, n+1)
# y: (sample_count, m)
# Solution will be shape (n+1, m)

# Extract A and b



In [None]:
import torch


In [None]:
a = torch.rand((10,1))

In [None]:
b = a.repeat(2,1)
print(b)

True

In [63]:
b.view(2, -1, 5)

tensor([[[0.0679, 0.7327, 0.0394, 0.5948, 0.7531],
         [0.1648, 0.0825, 0.8914, 0.7887, 0.5388]],

        [[0.0679, 0.7327, 0.0394, 0.5948, 0.7531],
         [0.1648, 0.0825, 0.8914, 0.7887, 0.5388]]])

In [62]:
b.view(2, -1, 10)

tensor([[[0.0679, 0.7327, 0.0394, 0.5948, 0.7531, 0.1648, 0.0825, 0.8914,
          0.7887, 0.5388]],

        [[0.0679, 0.7327, 0.0394, 0.5948, 0.7531, 0.1648, 0.0825, 0.8914,
          0.7887, 0.5388]]])