Automatic Differentiation
=========================

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms

In [3]:
# Cargar el conjunto de datos MNIST
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:04<00:00, 2208406.31it/s]


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 4817294.75it/s]


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 2144345.42it/s]


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw






Easy example of derivative with dual numbers
==========================

![example](ejemplo.PNG)

In [6]:
# Definir números duales
class DualNumber:
    def __init__(self, real, dual):
        self.real = real
        self.dual = dual

    
def mul_dual(x,y):
    '''
    We need to define how we perform product in dual space
    '''
    return DualNumber(x.real@y.real,x.real@y.dual+x.dual@y.real)

def add_dual(x,y):
    return DualNumber(x.real+y.real,x.dual+y.dual)

# Función de activación sigmoide
def sigmoid(x):
    ''' 
    The real part will be the sigmoid and the dual part would be the analytic derivative of the sigmoid
    '''
    sig = 1.0 / (1.0 + torch.exp(-x.real))
    return DualNumber(sig, sig * (1.0 - sig) * x.dual)

# Entropía cruzada para múltiples clases
def cross_entropy_loss(predictions, targets):
    ''' 
    The real part will be the cross entropy and the dual part would be the analytic derivative of the ce
    '''
    #print('las predicciones son',predictions.real.shape)
    #print('las targets son',targets.shape)
    loss = -torch.sum(targets * torch.log(predictions.real + 1e-8))
    #print('la loss es',loss)
    loss_dual = -torch.sum(targets * (1.0 / (predictions.real + 1e-8)) * predictions.dual)
    #print('la loss dual es',loss_dual)
    return DualNumber(loss, loss_dual)

def softmax_with_dual(x_dual):
    ''' 
    The real part will be the softmax and the dual part would be the analytic derivative of the softmax
    '''
    exp_real = torch.exp(x_dual.real)
    sum_exp_real = torch.sum(exp_real, dim=1, keepdim=True)
    softmax_real = exp_real / sum_exp_real

    softmax_dual = (exp_real * (sum_exp_real - exp_real)) / (sum_exp_real * sum_exp_real) * x_dual.dual

    return DualNumber(softmax_real, softmax_dual)
    

# Definir una MLP simple con diferenciación automática
class SimpleMLP:
    def __init__(self, input_size, hidden_size, output_size):
        #the dual part of the weights is initialized as ones. 
        self.W1 = DualNumber(torch.randn(input_size, hidden_size), torch.ones(input_size, hidden_size))
        #self.b1 = DualNumber(torch.randn(1, hidden_size), torch.ones_like(1, hidden_size))
        self.W2 = DualNumber(torch.randn(hidden_size, output_size), torch.ones(hidden_size, output_size))
        #self.b2 = DualNumber(torch.randn(1, output_size), torch.ones_like(1, output_size))

    def forward(self, x):
        #the dual part of our samples are zeros because we are not interested in the derivatives with respect to x 
        x_dual = DualNumber(x, torch.zeros_like(x))  # Crear número dual para la entrada
        #it is like w@x
        z1 = mul_dual(x_dual,self.W1)
        #we activate it
        a1 = sigmoid(z1) #we utilize the sigmoid we defined, which gives us the dual part, which corresponds to the derivative

        #this part will te utilized to adjust the w1
        #here the dual part of the weights are set to zero because in the second part of the chain rule the variable 
        #see picture of example to see why
        z2_w1 = mul_dual(a1,DualNumber(self.W2.real,torch.zeros_like(self.W2.real)))
        #and this part to adjust the w2
        #here we need the a1 with the dual part as zeros because we are not interested in that derivative  to adjust w2
        z2_w2 = mul_dual(DualNumber(a1.real,torch.zeros_like(a1.real)),self.W2)
        a2_w1=softmax_with_dual(z2_w1)
        a2_w2=softmax_with_dual(z2_w2)      

        return a2_w1,a2_w2
    
    def zero_grad(self):
        self.W1.dual=torch.ones_like(self.W1.real)
        self.W2.dual=torch.ones_like(self.W2.real)
        return self.W1,self.W2

In [7]:
# Crear la MLP
input_size = 28 * 28
hidden_size = 128
output_size = 10
mlp = SimpleMLP(input_size, hidden_size, output_size)

# Entrenamiento simple usando diferenciación automática
learning_rate = 0.0000000000000000001
num_epochs = 7

for epoch in range(num_epochs):
    contador=0
    for images, labels in train_loader:
        images = images.view(-1, input_size)
        labels_onehot = torch.zeros(labels.size(0), output_size)
        labels_onehot.scatter_(1, labels.view(-1, 1), 1)

        #images_dual = DualNumber(images, torch.zeros_like(images))

        # Forward pass
        a2_w1,a2_w2 = mlp.forward(images)
        #print('salio de forward',contador, end='\r')
        #print('lo que salio de forward',a2_w1.dual, end='\r')
        contador+=1
        loss_w1 = cross_entropy_loss(a2_w1, labels_onehot)
        print('loss real',loss_w1.real)
        print('loss dual w1',loss_w1.dual)
        loss_w2 = cross_entropy_loss(a2_w2, labels_onehot)
        #print('loss real w2',loss_w2.real)
        #print('loss dual w2',loss_w2.dual)

        # Backpropagation
        #loss.real.backward()

        # Actualizar parámetros
        mlp.W1.real -= learning_rate * loss_w1.dual/64
        #mlp.W1.real -= 0 * loss_w1.dual

        #mlp.b1.real -= learning_rate * mlp.b1.real.grad
        mlp.W2.real -= 0 * loss_w2.dual/64
        #mlp.b2.real -= learning_rate * mlp.b2.real.grad

        # Limpiar duales de los pesos w
        mlp.zero_grad()
        #print('dual w1',mlp.W1.dual)
        

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss w1: {loss_w1.real.item()}, Loss w2: {loss_w2.real.item()}')

loss real tensor(894.8837)
loss dual w1 tensor(270.2663)
loss real tensor(919.9730)
loss dual w1 tensor(2602.7659)
loss real tensor(859.0394)
loss dual w1 tensor(282.1953)
loss real tensor(918.1414)
loss dual w1 tensor(-1447.6689)
loss real tensor(958.4288)
loss dual w1 tensor(1526.1520)
loss real tensor(946.1744)
loss dual w1 tensor(2157.1963)
loss real tensor(847.3157)
loss dual w1 tensor(343.2097)
loss real tensor(862.3750)
loss dual w1 tensor(-1753.5132)
loss real tensor(871.6534)
loss dual w1 tensor(-38.9536)
loss real tensor(812.1541)
loss dual w1 tensor(1697.7244)
loss real tensor(911.8663)
loss dual w1 tensor(-555.9580)
loss real tensor(875.0541)
loss dual w1 tensor(2155.1931)
loss real tensor(879.2654)
loss dual w1 tensor(2106.4673)
loss real tensor(779.1421)
loss dual w1 tensor(780.0670)
loss real tensor(873.3259)
loss dual w1 tensor(-1641.6405)
loss real tensor(883.1443)
loss dual w1 tensor(-248.7106)
loss real tensor(831.0110)
loss dual w1 tensor(3174.1633)
loss real tensor

KeyboardInterrupt: 