## HW Requirement

• Implement the 2-layer neural network from the 2021 version of CS231n 
with PyTorch (or TensorFlow). 

• Once you have the code (regardless of which framework you 
chose above), apply it to your own data. Split the data into training 
and test sets at a ratio of 80%:20%.

• You need to run the code with the following hyperparameter 
settings:

✓ Activation function: tanh, ReLU

✓ Data preprocessing

✓ Initial weights: small random number, Xavier or Kaiming/MSRA 
Initialization

✓ Loss function: without or with the L2 regularization term, 
λ = 0.001 or 0.0001

$$
E(w) = \frac{1}{N}\sum_{c=1}^{N}\left[f(X^c, w) - y^c\right]^2
 + \lambda\left[\sum_{i=0}^{p}\left(w_{i}^{o}\right)^2
 + \sum_{i=1}^{p}\sum_{j=0}^{m}\left(w_{ij}^{H}\right)^2\right]
$$

✓ Optimizer: gradient descent, Momentum, Adam

✓ Learning epochs: 100, 200, 300

✓ Amount of hidden nodes: 5, 8, 11

✓ Learning rate decay schedule: none and cosine

✓ Ensembles: top 3 models

## Model

In [37]:
import torch
from torch import nn, optim, Generator
from torch.utils.data import DataLoader, Dataset, sampler, random_split


In [27]:
# Lookup tables mapping the hyperparameter names used in the experiments to
# the classes / callables that implement them.


def _init_small_random(x):
    """Fill ``x`` in place with N(0, 0.01**2) noise and return it."""
    return nn.init.normal_(tensor=x, mean=0, std=0.01)


def _init_xavier(x):
    """Xavier-uniform init for tensors with more than one dimension.

    Tensors with a single dimension are left untouched and ``None`` is
    returned for them.
    """
    if len(x.shape) > 1:
        return nn.init.xavier_uniform_(tensor=x)
    return None


def _init_kaiming(x):
    """Kaiming-uniform (ReLU gain) init for tensors with more than one dimension.

    Tensors with a single dimension are left untouched and ``None`` is
    returned for them.
    """
    if len(x.shape) > 1:
        return nn.init.kaiming_uniform_(tensor=x, nonlinearity='relu')
    return None


def _make_momentum_sgd(param, lr):
    """Build an SGD optimizer with a fixed momentum of 0.9."""
    return optim.SGD(params=param, lr=lr, momentum=0.9)


def _make_cosine_schedule(opt):
    """Wrap ``opt`` in a cosine-annealing schedule with ``T_max=200``."""
    return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, T_max=200)


# Activation name -> module class (instantiated by the model).
actives = dict(relu=nn.ReLU, tanh=nn.Tanh)

# Initialisation name -> callable applied to one parameter tensor.
inits = dict(
    small_random=_init_small_random,
    xavier=_init_xavier,
    kaiming=_init_kaiming,
)

# Optimizer name -> factory called as ``factory(params, lr=...)``.
optims = dict(
    sgd=optim.SGD,
    momentum=_make_momentum_sgd,
    adam=optim.Adam,
)

# Schedule name -> factory called with the optimizer instance.
schedulers = dict(cos=_make_cosine_schedule)

In [28]:
from collections.abc import Callable
class TwoLayerNetwork(nn.Module):
    """Fully connected network: Linear -> activation -> Linear.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of outputs of the second linear layer.
        init_method: Callable invoked once with every parameter tensor of
            the network (weights and biases of both linear layers, plus any
            parameters the activation module may own). It is expected to
            modify the tensor in place; its return value is ignored.
        active_func: Zero-argument factory (for example the ``nn.ReLU``
            class itself) that returns the activation module.
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int,
                 init_method: Callable, active_func: Callable) -> None:
        super().__init__()
        # Kept as attributes because callers reshape batches with
        # ``model.input_size``.
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.active_func = active_func()
        self.fc2 = nn.Linear(hidden_size, num_classes)

        # Run the initialiser only after *both* layers exist. Doing it
        # before ``fc2`` is created would leave the output layer on
        # PyTorch's default initialisation regardless of ``init_method``.
        for param in self.parameters():
            init_method(param)

    def forward(self, x):
        """Return the second layer's raw outputs for the batch ``x``."""
        hidden = self.active_func(self.fc1(x))
        return self.fc2(hidden)


In [40]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Make one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given, every batch is followed by a backward pass
    and an optimizer step; otherwise the pass is evaluation-only. The caller
    is responsible for ``model.train()`` / ``model.eval()`` and for wrapping
    evaluation in ``torch.no_grad()``.
    """
    loss_sum = 0.0
    correct = 0
    for X, y in loader:
        # Flatten each sample to the width the first layer expects.
        X = X.view(-1, model.input_size).to(device)
        y = y.to(device)
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        # Weight by the batch size so a smaller final batch is not
        # over-represented in the per-sample mean computed below.
        loss_sum += loss.item() * X.size(0)
        correct += (outputs.argmax(dim=1) == y).sum().item()
    sample_count = len(loader.dataset)
    return loss_sum / sample_count, 100. * correct / sample_count


def train(model: nn.Module, opt: Callable, device: str, epochs: int,
          learning_rate: float, trainloader: DataLoader, valloader: DataLoader,
          criterion: nn.modules.loss._Loss, sched: "Callable | None" = None):
    """Train ``model`` and report per-epoch training/validation statistics.

    Args:
        model: Network to train; it must expose an ``input_size`` attribute,
            which is used to flatten each batch.
        opt: Optimizer factory, called as ``opt(model.parameters(), lr=learning_rate)``.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``trainloader``; truncated with ``int()``.
        learning_rate: Learning rate handed to ``opt``.
        trainloader: Batches of ``(X, y)`` used for optimisation.
        valloader: Batches of ``(X, y)`` evaluated after every epoch.
        criterion: Loss called as ``criterion(outputs, y)``.
        sched: Optional scheduler factory, called as ``sched(optimizer)``;
            the scheduler is stepped once per epoch. ``None`` keeps the
            learning rate constant.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    # Reject bad input before touching the model or building an optimizer.
    if epochs < 1:
        raise ValueError("Invalid epoch!!")
    epochs = int(epochs)

    model.to(device)
    optimizer = opt(model.parameters(), lr=learning_rate)
    scheduler = sched(optimizer) if sched is not None else None

    history = []
    for epoch in range(epochs):
        # Train the model
        model.train()
        train_loss, train_accuracy = _run_epoch(
            model, trainloader, criterion, device, optimizer)

        # Validate the model
        model.eval()
        with torch.no_grad():
            val_loss, val_accuracy = _run_epoch(
                model, valloader, criterion, device)

        if scheduler is not None:
            scheduler.step()

        # Print epoch statistics
        print('Epoch [{}/{}], Train Loss: {:.4f}, Train Accuracy: {:.2f}%, Val Loss: {:.4f}, Val Accuracy: {:.2f}%'
              .format(epoch+1, epochs, train_loss, train_accuracy, val_loss, val_accuracy))
        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })
    return history


In [None]:
def test(model:nn.Module, device:str, testloader:DataLoader):
    """Print and return the classification accuracy of ``model`` on ``testloader``.

    Args:
        model: Network to evaluate; it must expose an ``input_size``
            attribute, which is used to flatten each batch.
        device: Device the model and every batch are moved to.
        testloader: Batches of ``(X, y)`` to score.

    Returns:
        Accuracy in percent over ``len(testloader.dataset)`` samples.
    """
    # Mirror ``train``: make sure model and batches live on the same device.
    model.to(device)
    model.eval()
    correct = 0
    with torch.no_grad():
        for X, y in testloader:
            X = X.view(-1, model.input_size).to(device)
            y = y.to(device)
            predicted = model(X).argmax(dim=1)
            correct += (predicted == y).sum().item()
    accuracy = 100. * correct / len(testloader.dataset)
    print(accuracy)
    return accuracy


# The bare name ``test`` matches pytest's default discovery prefix; opt out so
# this helper is never collected as a test case if the code is imported from
# a test module.
test.__test__ = False

66.5380374862183


## Dataset

In [20]:
import pandas as pd
import numpy as np
class HotelReservationDataset(Dataset):
    """Tabular reservation data loaded from a CSV file as (features, label) pairs.

    Every text column is label-encoded by descending frequency (the most
    frequent value becomes 0). The first column is dropped, the last column
    is used as the label, and everything in between becomes the float32
    feature matrix.

    Attributes set by ``__init__``:
        labels_of_columns: maps each encoded column name to the list of its
            original values, ordered so that ``values[code]`` is the value
            encoded as ``code``.
        root_dir: stored as given; not used by this class.
        features: float32 tensor with one row per CSV row.
        labels: 1-D tensor with one entry per CSV row.
    """

    def __init__(self, csv_path, root_dir):
        """
        Args:
            csv_path (string): Path to the csv file to load.
            root_dir (string): Kept on the instance as ``root_dir``.
        """
        reservations = pd.read_csv(csv_path)
        self.labels_of_columns = dict()
        for col, dtype in reservations.dtypes.items():
            # Detect text columns whether pandas stores them as ``object``
            # or as a dedicated string dtype.
            is_text = (pd.api.types.is_object_dtype(dtype)
                       or pd.api.types.is_string_dtype(dtype))
            if not is_text:
                continue
            categories = reservations[col].value_counts().index
            code_of = {category: code for code, category in enumerate(categories)}
            # A list (not a dict view) so codes can be mapped back by index.
            self.labels_of_columns[col] = list(code_of)
            # ``value_counts`` skips missing values, so a missing entry in a
            # text column raises KeyError here instead of being encoded.
            reservations[col] = reservations[col].apply(code_of.__getitem__)

        self.root_dir = root_dir
        self.features = torch.from_numpy(
            reservations.iloc[:, 1:-1].to_numpy(dtype=np.float32))
        self.labels = torch.tensor(reservations.iloc[:, -1].to_numpy())
        print(self.labels.size())

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.features[idx], self.labels[idx]


In [21]:
# Read the reservations CSV once and keep the encoded tensors in `dataset`.
RESERVATIONS_CSV = r"D:\dataset\archive\Hotel Reservations.csv"
dataset = HotelReservationDataset(root_dir="./data", csv_path=RESERVATIONS_CSV)


torch.Size([36275])


In [38]:
# Split the dataset into train / validation / test subsets and wrap each in a
# DataLoader. No feature scaling or other preprocessing happens in this cell.
# NOTE(review): the assignment text asks for an 80%:20% train/test split; this
# cell uses 70% train, 20% validation and the remainder as test - confirm
# that this is intended.
train_count = int(0.7 * len(dataset))
valid_count = int(0.2 * len(dataset))
# The test subset takes whatever is left so the three sizes always add up.
test_count = len(dataset) - train_count - valid_count
print(train_count, valid_count, test_count)
# A fixed seed keeps the membership of each subset reproducible.
trainset, valset, testset = random_split(
    dataset, (train_count, valid_count, test_count),
    generator=Generator().manual_seed(42))
# Only the training batches need reshuffling every epoch; the evaluation
# loaders are read sequentially.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
valloader = DataLoader(valset, batch_size=32, shuffle=False)
testloader = DataLoader(testset, batch_size=32, shuffle=False)


## Training

In [7]:
def training_schedule(*, dry_run: bool = True):
    """Walk the full hyperparameter grid, optionally training each configuration.

    The grid is the product of hidden sizes (5, 8, 11), epoch counts
    (100, 200, 300), every entry of ``inits``, ``actives`` and ``optims``,
    and the learning-rate schedules ("none" plus every entry of
    ``schedulers``). A "start" and an "end" line are printed for each
    configuration.

    Args:
        dry_run: When True (the default) the configurations are only listed
            and nothing is trained. When False a fresh ``TwoLayerNetwork``
            is built and passed to ``train`` for every configuration.

    Returns:
        A list with one dict per configuration holding the keys
        ``hidden_size``, ``epochs``, ``init``, ``activation``, ``optimizer``,
        ``schedule``, ``model`` and ``history``. ``model`` is None in a dry
        run; ``history`` is whatever ``train`` returned (None in a dry run).
    """
    from itertools import product

    # NOTE(review): the L2 regularisation settings (lambda = 0.001 / 0.0001)
    # from the assignment are not part of this sweep.

    # ✓ Learning rate decay schedule: none and cosine
    # ``None`` makes ``train`` keep the learning rate constant.
    schedule_options = {"none": None, **schedulers}

    if not dry_run:
        # processor
        device = "cuda" if torch.cuda.is_available(
        ) else "mps" if torch.backends.mps.is_available() else "cpu"
        # hyper parameters shared by every run
        input_size = len(dataset[0][0])
        output_size = 2
        learning_rate = 0.1
        criterion = nn.CrossEntropyLoss()

    grid = product(
        (5, 8, 11),                # ✓ Amount of hidden nodes
        (100, 200, 300),           # ✓ Learning epochs
        inits.items(),             # ✓ Initial weights
        actives.items(),           # ✓ Activation function
        optims.items(),            # ✓ Optimizer
        schedule_options.items(),  # ✓ Learning rate decay schedule
    )

    results = []
    for hidden_size, epochs, (init, method), (active, func), (optimi, zer), (schedul, er) in grid:
        print(hidden_size, epochs, init,
              active, optimi, schedul, "start")
        model = None
        history = None
        if not dry_run:
            # Build a fresh model for every configuration so that no run
            # continues from weights trained by a previous one.
            model = TwoLayerNetwork(input_size, hidden_size, output_size,
                                    init_method=method, active_func=func).to(device)
            history = train(model=model, opt=zer, device=device, epochs=epochs,
                            learning_rate=learning_rate, trainloader=trainloader,
                            valloader=valloader, criterion=criterion, sched=er)
        print(hidden_size, epochs, init,
              active, optimi, schedul, "end")
        results.append({
            "hidden_size": hidden_size,
            "epochs": epochs,
            "init": init,
            "activation": active,
            "optimizer": optimi,
            "schedule": schedul,
            "model": model,
            "history": history,
        })
    return results


5 100 small_random relu sgd cos
5 100 small_random relu momentum cos
5 100 small_random relu adam cos
5 100 small_random tanh sgd cos
5 100 small_random tanh momentum cos
5 100 small_random tanh adam cos
5 100 xavier relu sgd cos
5 100 xavier relu momentum cos
5 100 xavier relu adam cos
5 100 xavier tanh sgd cos
5 100 xavier tanh momentum cos
5 100 xavier tanh adam cos
5 100 kaiming relu sgd cos
5 100 kaiming relu momentum cos
5 100 kaiming relu adam cos
5 100 kaiming tanh sgd cos
5 100 kaiming tanh momentum cos
5 100 kaiming tanh adam cos
5 200 small_random relu sgd cos
5 200 small_random relu momentum cos
5 200 small_random relu adam cos
5 200 small_random tanh sgd cos
5 200 small_random tanh momentum cos
5 200 small_random tanh adam cos
5 200 xavier relu sgd cos
5 200 xavier relu momentum cos
5 200 xavier relu adam cos
5 200 xavier tanh sgd cos
5 200 xavier tanh momentum cos
5 200 xavier tanh adam cos
5 200 kaiming relu sgd cos
5 200 kaiming relu momentum cos
5 200 kaiming relu adam

In [45]:

# One hand-picked configuration: 11 hidden units, Kaiming init, ReLU,
# Adam with a cosine learning-rate schedule, trained for 10 epochs.

# Hyperparameters
num_epochs = 10
learning_rate = 0.1
hidden_size = 11
output_size = 2
input_size = len(trainset[0][0])  # feature count taken from the first training sample

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Model and loss function
criterion = nn.CrossEntropyLoss()
model = TwoLayerNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=output_size,
    init_method=inits["kaiming"],
    active_func=actives["relu"],
).to(device)

# Run training; per-epoch statistics are printed by `train`.
train(
    model=model,
    opt=optims["adam"],
    device=device,
    epochs=num_epochs,
    learning_rate=learning_rate,
    trainloader=trainloader,
    valloader=valloader,
    criterion=criterion,
    sched=schedulers["cos"],
)

Epoch [1/10], Train Loss: 1.7292, Train Accuracy: 67.26%, Val Loss: 0.6387, Val Accuracy: 66.56%
Epoch [2/10], Train Loss: 0.6338, Train Accuracy: 67.53%, Val Loss: 0.6373, Val Accuracy: 66.56%
Epoch [3/10], Train Loss: 0.6333, Train Accuracy: 67.53%, Val Loss: 0.6378, Val Accuracy: 66.56%
Epoch [4/10], Train Loss: 0.6335, Train Accuracy: 67.53%, Val Loss: 0.6433, Val Accuracy: 66.56%
Epoch [5/10], Train Loss: 0.6341, Train Accuracy: 67.53%, Val Loss: 0.6405, Val Accuracy: 66.56%
Epoch [6/10], Train Loss: 0.6339, Train Accuracy: 67.53%, Val Loss: 0.6383, Val Accuracy: 66.56%
Epoch [7/10], Train Loss: 0.6344, Train Accuracy: 67.53%, Val Loss: 0.6385, Val Accuracy: 66.56%
Epoch [8/10], Train Loss: 0.6355, Train Accuracy: 67.53%, Val Loss: 0.6475, Val Accuracy: 66.56%
Epoch [9/10], Train Loss: 0.6335, Train Accuracy: 67.53%, Val Loss: 0.6379, Val Accuracy: 66.56%
Epoch [10/10], Train Loss: 0.6338, Train Accuracy: 67.53%, Val Loss: 0.6375, Val Accuracy: 66.56%
