• Implement the code for the 2-layer neural networks in CS231n 
2021 version with PyTorch (or TensorFlow). 

• Once you have the code (regardless of which framework you 
choose above), you will apply your own data.  The training and test 
dataset is 80%:20%.

• You need to run the code with the following hyperparameter 
settings:

✓ Activation function: tanh, ReLU;

✓ Data preprocessing;

✓ Initial weights: small random number, Xavier or Kaiming/MSRA 
Initialization

✓ Loss function: without or with the regularization term 
(L2), λ = 0.001 or 0.0001
$$ E(w) = \frac{1}{N}\sum^{N}_{c=1}[𝑓(X^c, w) −y^c]^2 
 + \lambda[\sum^{p}_{i=0}(w^{o}_{i})^2
 + \sum_{i=1}^{p}\sum_{j=0}^{m}(w_{ij}^H)^2]
$$
✓ Optimizer: gradient descent, Momentum, Adam;

✓ Learning epochs: 100, 200, 300;

✓ Amount of hidden nodes: 5, 8, 11;

✓ Learning rate decay schedule: none and cosine

✓ Ensembles: top 3 models

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split


In [17]:
class TwoLayerNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TwoLayerNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()  # activation function
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out


In [18]:
def preprocess_data(X, y):
    X = (X - np.mean(X)) / np.std(X)  # standardize X
    y = y.reshape(-1, 1)  # reshape y to 2D array
    return X, y


In [19]:
def init_weights(model, method='small_random'):
    if method == 'small_random':
        for param in model.parameters():
            nn.init.normal_(param, mean=0, std=0.01)
    elif method == 'xavier':
        for param in model.parameters():
            if len(param.shape) > 1:
                nn.init.xavier_uniform_(param)
    elif method == 'kaiming':
        for param in model.parameters():
            if len(param.shape) > 1:
                nn.init.kaiming_uniform_(param, nonlinearity='relu')


In [20]:
def train_model(X_train, y_train, X_val, y_val, input_size, hidden_size, num_classes, activation_fn, init_method, reg_lambda, optimizer, learning_rate, num_epochs, hidden_nodes, lr_decay_schedule):
    model = TwoLayerNet(input_size, hidden_size, num_classes)
    init_weights(model, init_method)

    criterion = nn.CrossEntropyLoss()  # use cross-entropy loss for classification
    if reg_lambda > 0:
        l2_reg = 0.0
        for param in model.parameters():
            l2_reg += torch.norm(param)
        criterion = nn.CrossEntropyLoss() + reg_lambda * l2_reg

    if optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer == 'momentum':
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate, momentum=0.9)
    elif optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    best_model = None
    for epoch in range(num_epochs):
        # learning rate decay schedule
        if lr_decay_schedule == 'cosine':
            lr = learning_rate * 0.5 * (1 + np.cos(epoch / num_epochs * np.pi))
        else:
            lr = learning_rate

        # train
        model.train()
        optimizer.zero_grad()
        inputs = torch.from_numpy(X_train).float()


In [21]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Define data transforms to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Download the training and test sets
trainset = datasets.FashionMNIST(root='../data', train=True, download=False, transform=transform)
testset = datasets.FashionMNIST(root='../data', train=False, download=False, transform=transform)

# Split the training set into training and validation sets
trainset, valset = train_test_split(trainset, test_size=0.2, random_state=42)

# Create data loaders to load the data in batches
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)


In [30]:
# Set device to use (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
input_size = 784 # 28x28
hidden_size = 8
output_size = 10
learning_rate = 0.1
num_epochs = 10

# Create model, loss function, and optimizer
model = TwoLayerNet(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# Train the model
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    model.train()
    for images, labels in trainloader:
        images = images.view(-1, input_size).to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        print(labels.size())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
    train_loss /= len(trainloader.dataset)
    train_accuracy = 100. * train_correct / len(trainloader.dataset)

    # Validate the model
    val_loss = 0.0
    val_correct = 0
    model.eval()
    with torch.no_grad():
        for images, labels in valloader:
            images = images.view(-1, input_size).to(device)
            labels = labels.to(device)
            outputs = model(images)
            
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
        val_loss /= len(valloader.dataset)
        val_accuracy = 100. * val_correct / len(valloader.dataset)

    # Print epoch statistics
    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Accuracy: {:.2f}%, Val Loss: {:.4f}, Val Accuracy: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss, train_accuracy, val_loss, val_accuracy))
    break


torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32

In [31]:
for x,y in trainloader:
    print(x.size(), y.size(), y)
    break

torch.Size([32, 1, 28, 28]) torch.Size([32]) tensor([5, 1, 2, 6, 3, 0, 7, 3, 2, 9, 7, 9, 0, 3, 3, 2, 9, 6, 1, 5, 5, 0, 9, 4,
        3, 6, 5, 5, 7, 9, 7, 6])
