In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.functional as F

# Q1

In [7]:
### Q1
# Seed PyTorch's global random number generator before the rest of this cell draws from it.
torch.manual_seed(42)
# Define the neural network architecture
class NeuralNetwork(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features in each input row.
        hidden_size: width of the single hidden layer.
        output_size: number of values produced for each input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output of the second linear layer for ``x`` (no softmax applied)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the hyperparameters
input_size = 10
hidden_size = 20
output_size = 5
learning_rate = 0.001
num_epochs = 1000

# Create the neural network object
model = NeuralNetwork(input_size, hidden_size, output_size)

# Define the loss function and optimizer.
# CrossEntropyLoss is fed the model's raw outputs together with integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Generate some dummy data for training: 100 random feature rows and
# 100 random integer labels drawn from [0, output_size).
train_data = torch.randn(100, input_size)
train_labels = torch.randint(output_size, (100,))

# Training loop: every epoch is one gradient step on the whole dummy set.
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(train_data)

    # Compute the loss
    loss = criterion(outputs, train_labels)

    # Backward and optimize (clear stale gradients first)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss every 100 epochs
    if (epoch+1) % 100 == 0:
        print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Test the trained model on 10 fresh random rows
test_data = torch.randn(10, input_size)
with torch.no_grad():
    test_outputs = model(test_data)

    # The predicted class is the index of the largest output in each row.
    # argmax replaces the legacy `torch.max(tensor.data, 1)` idiom: inside
    # no_grad the `.data` detour is unnecessary and the max values were unused.
    predicted = test_outputs.argmax(dim=1)
    print("Predictions:", predicted)

Epoch: 100/1000, Loss: 1.6116470098495483
Epoch: 200/1000, Loss: 1.6078097820281982
Epoch: 300/1000, Loss: 1.6041063070297241
Epoch: 400/1000, Loss: 1.6005334854125977
Epoch: 500/1000, Loss: 1.5970773696899414
Epoch: 600/1000, Loss: 1.5937395095825195
Epoch: 700/1000, Loss: 1.5904844999313354
Epoch: 800/1000, Loss: 1.5873106718063354
Epoch: 900/1000, Loss: 1.5842201709747314
Epoch: 1000/1000, Loss: 1.5812039375305176
Predictions: tensor([2, 4, 4, 2, 2, 2, 3, 2, 2, 3])


# Q2

In [31]:
### Q2

batch_size = 64

# Preprocessing pipeline: convert each image to a tensor, then apply
# Normalize with mean 0.5 and std 0.5 on the single channel.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.5,), (0.5,))
transform = transforms.Compose([to_tensor, normalize])

# MNIST splits stored under ./data; only the training-split constructor asks for a download.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Batch iterators: the training loader reshuffles, the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [32]:
from typing import Optional


class MLP(nn.Module):
    """Fully connected network with a configurable stack of hidden layers.

    Args:
        input_size: number of features per sample after flattening.
        output_size: number of outputs per sample.
        *hidden_size: width of each hidden layer, in order. When omitted, a
            single hidden layer of 100 units is used.
        num_layers: optional consistency check; when given it must equal the
            number of hidden widths.
        activation_function: callable applied after the input layer and after
            every hidden layer (not after the output layer).

    Raises:
        AssertionError: if ``num_layers`` is given and differs from
            ``len(hidden_size)``.
    """

    def __init__(self, input_size, output_size, *hidden_size, num_layers: Optional[int]=None, activation_function=F.relu):
        super().__init__()

        if not hidden_size:
            hidden_size = (100,)

        # Compare against None explicitly: a plain truthiness test would treat
        # num_layers=0 as "not given" and silently skip the consistency check.
        if num_layers is not None:
            assert len(hidden_size) == num_layers, "hidden size and num_layers should be equal!"

        self.flatten = nn.Flatten()
        ## input layer
        self.input_layer = nn.Linear(input_size, hidden_size[0])
        ## hidden layers: one Linear per consecutive pair of hidden widths
        self.hidden_layers = nn.ModuleList([
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(hidden_size, hidden_size[1:])
        ])
        ## output layer
        self.output_layer = nn.Linear(hidden_size[-1], output_size)
        self.activation_function = activation_function

    def forward(self, x):
        """Flatten ``x`` per sample and return the output layer's raw values."""
        x = self.flatten(x)
        x = self.activation_function(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_function(hidden_layer(x))
        return self.output_layer(x)


In [27]:
# Define Hyperparameters
input_size = 28 * 28  # one flattened 28x28 MNIST image
output_size = 10      # one output per digit class (0-9)
learning_rate = 0.01
num_epochs = 10

# Two hidden layers, 30 units then 10 units.
hidden_sizes = (30, 10)
model = MLP(input_size, output_size, *hidden_sizes)

# Plain SGD minimising the cross-entropy of the model's raw outputs.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


In [28]:
# Mini-batch training: one optimizer step per batch, progress printed every 100 steps.
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Drop the gradients of the previous step before computing new ones.
        optimizer.zero_grad()

        # Forward pass and loss
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        step = i + 1
        if step % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss: {loss.item():.4f}')

print('Training finished!')


Epoch [1/10], Step [100/938], Loss: 2.1683
Epoch [1/10], Step [200/938], Loss: 2.0536
Epoch [1/10], Step [300/938], Loss: 1.8033
Epoch [1/10], Step [400/938], Loss: 1.3741
Epoch [1/10], Step [500/938], Loss: 1.1266
Epoch [1/10], Step [600/938], Loss: 0.7078
Epoch [1/10], Step [700/938], Loss: 0.5134
Epoch [1/10], Step [800/938], Loss: 0.5841
Epoch [1/10], Step [900/938], Loss: 0.4681
Epoch [2/10], Step [100/938], Loss: 0.3833
Epoch [2/10], Step [200/938], Loss: 0.4371
Epoch [2/10], Step [300/938], Loss: 0.4573
Epoch [2/10], Step [400/938], Loss: 0.6265
Epoch [2/10], Step [500/938], Loss: 0.6180
Epoch [2/10], Step [600/938], Loss: 0.3737
Epoch [2/10], Step [700/938], Loss: 0.2556
Epoch [2/10], Step [800/938], Loss: 0.3701
Epoch [2/10], Step [900/938], Loss: 0.3056
Epoch [3/10], Step [100/938], Loss: 0.4161
Epoch [3/10], Step [200/938], Loss: 0.5002
Epoch [3/10], Step [300/938], Loss: 0.2186
Epoch [3/10], Step [400/938], Loss: 0.3469
Epoch [3/10], Step [500/938], Loss: 0.4358
Epoch [3/10

In [29]:
# Evaluation: percentage of test images whose highest-scoring class equals the label.
# Switch to inference mode so any train-only layer behaviour is disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Index of the largest output per row; under no_grad there is no need
        # for the legacy `.data` attribute.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 93.27%


# Q3

In [None]:
import torch.nn.functional as F
from torchvision.datasets import CIFAR10
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import numpy as np

# Use the GPU when CUDA reports itself available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class Swish(nn.Module):
    """Swish activation: multiplies the input element-wise by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate


# Define the deep neural network architecture
class DeepNeuralNet(nn.Module):
    """Deep fully connected network with a Swish activation after every non-output layer.

    Args:
        input_size: number of features per input row.
        hidden_size: width shared by the input layer's output and every hidden layer.
        num_layers: number of hidden ``hidden_size -> hidden_size`` layers placed
            between the input layer and the output layer.
        output_size: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.input_layer = nn.Linear(input_size, hidden_size)

        stack = []
        for _ in range(num_layers):
            stack.append(nn.Linear(hidden_size, hidden_size))
        self.hidden_layers = nn.ModuleList(stack)

        self.output_layer = nn.Linear(hidden_size, output_size)
        self.swish = Swish()

    def forward(self, x):
        """Return the output layer's raw values for ``x``."""
        # The input layer and every hidden layer are each followed by Swish.
        for layer in (self.input_layer, *self.hidden_layers):
            x = self.swish(layer(x))
        return self.output_layer(x)


In [None]:
def run(model, input_size, normal_init=True, learning_rate=1e-3, batch_size=64,
        num_epochs=100, patience=5, checkpoint_path="best_model.pth"):
  """Train ``model`` on CIFAR-10 with NAdam, per-epoch evaluation and early stopping.

  Args:
    model: ``nn.Module`` to train; it is moved to the global ``device`` in place.
    input_size: number of features per image after flattening; every batch is
      reshaped to ``(-1, input_size)`` before it is fed to the model.
    normal_init: when True, every ``nn.Linear`` in ``model`` is re-initialised
      with Kaiming-normal weights and zero biases before training.
    learning_rate: step size handed to ``optim.NAdam``.
    batch_size: batch size of both data loaders.
    num_epochs: maximum number of passes over the training set.
    patience: training stops once MORE than this many consecutive epochs fail
      to improve on the best test accuracy seen so far.
    checkpoint_path: file that receives ``model.state_dict()`` each time the
      test accuracy improves.

  Returns:
    None. Progress is printed once per epoch.

  Side effects:
    Seeds torch's global RNG with 42, downloads CIFAR-10 into ``./data`` when
    needed and overwrites ``checkpoint_path``.
  """
  # Import the progress bar locally so the function does not rely on a `tqdm`
  # name leaking in from some other cell; degrade to a plain pass-through
  # iterator when the package is not installed.
  try:
    from tqdm.auto import tqdm
  except ImportError:
    def tqdm(iterable, *args, **kwargs):
      return iterable

  torch.manual_seed(42)
  model.to(device=device)

  def init_weight(m):
    # Only Linear layers are touched; other module types keep their own init.
    # NOTE(review): kaiming_normal_ runs with its default gain settings -
    # confirm that is what is wanted for the Swish / SELU networks passed in.
    if isinstance(m, nn.Linear):
      nn.init.kaiming_normal_(m.weight)
      nn.init.zeros_(m.bias)

  if normal_init:
    model.apply(init_weight)

  transform = transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
  ])

  # Load dataset
  train_dataset = CIFAR10(root='./data', train=True, download=True,
                          transform=transform)
  test_dataset = CIFAR10(root='./data', train=False, download=True,
                         transform=transform)

  # Create data loaders (only the training loader shuffles)
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size)

  # Define the loss and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.NAdam(model.parameters(), lr=learning_rate)  # NAdam = Adam + Nesterov momentum

  # Early-stopping state
  # NOTE(review): the test split doubles as the model-selection signal here,
  # so the reported best accuracy is not an unbiased estimate - consider a
  # separate validation split.
  best_accuracy = 0.0
  epoch_num_improve = 0  # consecutive epochs without a new best accuracy

  for epoch in range(num_epochs):

    # ---- Train for one epoch ----
    model.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in tqdm(train_loader):
      # Flatten each image into a 1D feature vector
      images = images.view(-1, input_size)

      images = images.to(device=device)
      labels = labels.to(device=device)

      # Forward pass
      output = model(images)
      loss = criterion(output, labels)

      # Backward and optimize
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      predicted = output.argmax(dim=1)
      train_correct += (predicted == labels).sum().item()
    train_accuracy = train_correct / len(train_dataset)
    train_loss /= len(train_loader)  # mean of the per-batch losses

    # ---- Evaluate on the test split ----
    model.eval()
    test_correct = 0

    with torch.no_grad():
      for images, labels in test_loader:
        # Flatten each image into a 1D feature vector
        images = images.view(-1, input_size)

        images = images.to(device=device)
        labels = labels.to(device=device)

        output = model(images)

        predicted = output.argmax(dim=1)
        test_correct += (predicted == labels).sum().item()
    test_accuracy = test_correct / len(test_dataset)

    print(f"Epoch: {epoch + 1}/{num_epochs} | Train loss: {train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f}")

    # Early stopping + checkpoint of the best weights so far
    if test_accuracy > best_accuracy:
      best_accuracy = test_accuracy
      epoch_num_improve = 0
      torch.save(model.state_dict(), checkpoint_path)
    else:
      epoch_num_improve += 1
      if epoch_num_improve > patience:
        print(f"Early stopping, best accuracy {best_accuracy}")
        break

In [None]:
# Experiment 1: the plain Swish network.
# A CIFAR-10 image has 3 colour channels of 32x32 pixels, flattened into one vector.
input_size = 3 * 32 * 32
hidden_size, num_layers, output_size = 100, 20, 10

model = DeepNeuralNet(input_size, hidden_size, num_layers, output_size)
run(model, input_size)

In [None]:
class DeepNeuralBN(nn.Module):
  """Deep fully connected network whose hidden layers are each followed by BatchNorm1d and Swish.

  Args:
    input_size: number of features per input row.
    hidden_size: width of every hidden layer (and of each BatchNorm1d).
    num_layers: number of hidden Linear/BatchNorm1d pairs.
    out_size: number of values produced per input row.
  """

  def __init__(self, input_size, hidden_size, num_layers, out_size):
    super().__init__()
    self.input_layer = nn.Linear(input_size, hidden_size)

    # Build the linear stack first, then the matching normalisation stack.
    linear_stack = [nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)]
    norm_stack = [nn.BatchNorm1d(hidden_size) for _ in range(num_layers)]
    self.hidden_layers = nn.ModuleList(linear_stack)
    self.bn_layers = nn.ModuleList(norm_stack)

    self.output_layer = nn.Linear(hidden_size, out_size)
    self.swish = Swish()

  def forward(self, x):
    """Return the output layer's raw values for ``x``."""
    # The input layer is followed by Swish only (no batch norm).
    x = self.swish(self.input_layer(x))
    # Hidden block i: Linear -> BatchNorm1d -> Swish.
    for index, layer in enumerate(self.hidden_layers):
      normalised = self.bn_layers[index](layer(x))
      x = self.swish(normalised)
    return self.output_layer(x)


In [None]:
# Experiment 2: the batch-normalised network, same sizes as before.
input_size = 3 * 32 * 32  # flattened 32x32 RGB CIFAR-10 image
hidden_size, num_layers, output_size = 100, 20, 10

model = DeepNeuralBN(input_size, hidden_size, num_layers, output_size)
run(model, input_size)

In [None]:
class DeepNeuralNetSELU(nn.Module):
  """Deep fully connected network with a SELU activation after every non-output layer.

  Args:
    input_size: number of features per input row.
    hidden_size: width of every hidden layer.
    num_layers: number of hidden ``hidden_size -> hidden_size`` layers.
    out_size: number of values produced per input row.
  """

  def __init__(self, input_size, hidden_size, num_layers, out_size):
    super().__init__()
    self.input_layer = nn.Linear(input_size, hidden_size)

    stack = []
    for _ in range(num_layers):
      stack.append(nn.Linear(hidden_size, hidden_size))
    self.hidden_layers = nn.ModuleList(stack)

    self.output_layer = nn.Linear(hidden_size, out_size)

  def forward(self, x):
    """Return the output layer's raw values for ``x``."""
    # The input layer and every hidden layer are each followed by SELU.
    for layer in (self.input_layer, *self.hidden_layers):
      x = F.selu(layer(x))
    return self.output_layer(x)

In [None]:
# Experiment 3: the SELU network, same sizes as before.
input_size = 3 * 32 * 32  # flattened 32x32 RGB CIFAR-10 image
hidden_size, num_layers, output_size = 100, 20, 10

model = DeepNeuralNetSELU(input_size, hidden_size, num_layers, output_size)
run(model, input_size)