In [1]:
import torch
import os
from torch import linalg as LA
import numpy as np
print(torch.cuda.is_available())

True


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device='cuda' 

# Take ρ(x) = tanh(x). Consider the 1-hidden layer neural network
# ŷ i = w 2 T ρ(W 1 x i + b 1 ) + b 2
# W is 20 × 10 and w 2 is a vector of size 20. x is a vector of size 10. b 1 and b 2 are vectors of size 20 and 1 respectively.

# Define the parameters
param_dict = {
    'W1': torch.randn(20, 10, device=device, requires_grad=True),
    'b1': torch.randn(10, device=device, requires_grad=True),
    'W2': torch.randn(20, 20, device=device, requires_grad=True),
    'b2': torch.randn(20, device=device, requires_grad=True)
}   


# Define the network without using any PyTorch modules
def my_nn(input, param_dict):

    # Reshape the input image from HxW to a flat vector of size H*W
    x = input.view(-1, 20)
    # print(x.shape)
    # print(param_dict['W1'].shape)
    # print(param_dict['b1'].shape)
    tanh = torch.nn.Tanh()

    x = torch.matmul(x, param_dict['W1']) + param_dict['b1']

    x = tanh(x)
    x = x.view(-1, 20)
    # print(x.shape)
    # print(param_dict['W2'].shape)
    # print(param_dict['b2'].shape)
    
    x = torch.matmul(x, param_dict['W2']) + param_dict['b2']
    x = x.view(-1, 100)
    #print(x.shape)
    return x

# The
# absolute loss given
# l(ŷ, y) = |ŷ − y|
# Consider the cost function J = 1/N ∑ i=1 l(ŷ , y )
# Derive an expression for
# ∂J/∂W1,∂J/∂W2,∂J/∂b1,∂J/∂b2
# and implement it in PyTorch.


# Define the loss function
def my_loss(y_hat, y):
    return torch.mean(torch.abs(y_hat - y))

# Define the gradient computation by hand
def compute_gradients( param_dict, x, y):
    W1, b1, W2, b2 = param_dict.values()
    N = x.shape[0]
    x = input.view(-1, 20)
    z1 = torch.matmul(W1, x) + b1.unsqueeze(1)
    a1 = torch.tanh(z1)
    z2 = torch.matmul(W2, a1) + b2
    y_pred = z2.squeeze()

    sign = torch.sign(y_pred - y)
    dz2 = sign / N
    dW2 = torch.matmul(a1, dz2.T)
    db2 = dz2.sum(dim=1)
    da1 = torch.matmul(W2, dz2)
    dz1 = da1 * (1 - a1**2)
    dW1 = torch.matmul(dz1, x)
    db1 = dz1.sum(dim=1)

    return np.array([dW1, dW2, db1, db2])

# Define the training loop
def train_loop(param_dict, input, target, num_epochs=1000, learning_rate=1e-3):
    for epoch in range(num_epochs):
        # gradient calulated by hand
        # hand_grad = compute_gradients(param_dict, input, target)
        # print(LA.norm(hand_grad))
        # gradient calculated by pytorch
        y_hat = my_nn(input, param_dict)
        loss = my_loss(y_hat, target)
        loss.backward()
        
        #print(LA.norm(param_dict['W1'].grad))
        with torch.no_grad():
            for param in param_dict.values():
                param -= learning_rate * param.grad
                param.grad.zero_()
        if epoch % 100 == 0:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))
    return param_dict

# Generate some random data
input = torch.randn(100, 20, device=device)
target = torch.randn(100, device=device)

# Train the network
train_loop(param_dict, input, target)


Epoch 0, Loss 3.370969
Epoch 100, Loss 3.360848
Epoch 200, Loss 3.350780
Epoch 300, Loss 3.340856
Epoch 400, Loss 3.330891
Epoch 500, Loss 3.320843
Epoch 600, Loss 3.310786
Epoch 700, Loss 3.300823
Epoch 800, Loss 3.290792
Epoch 900, Loss 3.280943


{'W1': tensor([[ 0.3905, -1.7307, -1.5761, -0.9631, -0.8393,  0.0843,  0.2048,  0.1466,
          -0.4340,  0.4377],
         [ 0.9929,  0.6389, -1.7927, -1.3507, -0.9371, -0.2019, -0.3487, -0.0751,
          -2.4599, -0.8706],
         [-1.9312,  1.3581,  1.3343,  0.4263, -0.0698,  0.9423, -0.9671,  0.1911,
           2.4683,  1.1250],
         [-0.3267, -0.7562, -0.1369,  0.2259, -0.5556,  0.6053,  0.5326,  0.2126,
           0.2380, -0.2512],
         [-1.1418, -0.8099,  0.8815,  0.4031,  0.8145,  0.3241,  0.8481,  0.6752,
           0.5213,  0.2456],
         [ 2.1302, -0.7105,  0.5503,  0.3116, -1.5045,  1.4623,  0.5692,  0.0047,
          -0.9773, -0.2093],
         [-0.1730,  0.9484, -0.4274,  0.6228, -0.6712,  1.2872, -0.4323,  2.0429,
           0.3078,  1.3822],
         [-0.7166, -0.7394, -0.9673,  0.2533, -0.3730, -0.6409, -2.0040,  1.3424,
          -0.1576,  0.7179],
         [-0.4161,  1.8600,  1.7107,  0.2197, -2.4808, -0.1957,  0.5282,  0.1225,
          -1.2439, -1.70

In [19]:
# Train this model on the sklearn California Housing Prices datasets.
# • For this you may use the optimizer and learning rates of your choice and train
# for 20-50 epochs.
# • Take half the data for training and half for testing.
# • Create a validation set from the training set and use it to select a good learning
# rate.
# • You might want to use the convenient Xavier initialization.
# • You are free to use the torch.optim package for this part.
# • To speed up things, run the training loop by batches (e.g. 4, 8, 32, 64, etc.).
# PyTorch’s DataLoader would be a useful tool to easily fetch a predefined set
# of batches per training iteration.
# • Report the mean squared error on the train and test set after each epoch.
# • You will need to adjust the size of W 1 to fit the size of this data.

# Load the data
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np

data = fetch_california_housing()
X = data['data']
y = data['target']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Normalize the data
X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
X_test = (X_test - np.mean(X_test, axis=0)) / np.std(X_test, axis=0)

# Convert to torch tensors
X_train = torch.from_numpy(X_train).float().to(device)
X_test = torch.from_numpy(X_test).float().to(device)
y_train = torch.from_numpy(y_train).float().to(device)
y_test = torch.from_numpy(y_test).float().to(device)

# Define the network
class MyNet(torch.nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.linear1 = torch.nn.Linear(8, 20)
        self.linear2 = torch.nn.Linear(20, 1)
        self.tanh = torch.nn.Tanh()
    def forward(self, x):
        x = self.linear1(x)
        x = self.tanh(x)
        x = self.linear2(x)
        return x
    
# Define the loss function
def my_loss(y_hat, y):
    return torch.mean(torch.abs(y_hat - y))

# Define the training loop
def train_loop(model, input, target, num_epochs=1000, learning_rate=1e-3):
    for epoch in range(num_epochs):
        y_hat = model(input)
        loss = my_loss(y_hat, target)
        loss.backward()
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad
                param.grad.zero_()
        if epoch % 100 == 0:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))
    return model

# Train the network
model = MyNet().to(device)
model = train_loop(model, X_train, y_train)

# Evaluate the network
y_hat = model(X_test)
loss = my_loss(y_hat, y_test)
print('Test loss: %f' % loss)



Epoch 0, Loss 1.979535
Epoch 100, Loss 1.810317
Epoch 200, Loss 1.645368
Epoch 300, Loss 1.494032
Epoch 400, Loss 1.362126
Epoch 500, Loss 1.251653
Epoch 600, Loss 1.161922
Epoch 700, Loss 1.091001
Epoch 800, Loss 1.036773
Epoch 900, Loss 0.996090
Test loss: 0.967058
