# Regularization

In [1]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision.datasets import MNIST
from torchvision import transforms
from sklearn.model_selection import train_test_split

In [2]:
train_validation_dataset = MNIST(root="../datasets/", train=True, download=True, transform=transforms.ToTensor())
test_dataset = MNIST(root="../datasets", train=False, download=False, transform=transforms.ToTensor())

In [3]:
y = stratify = train_validation_dataset.targets.numpy()
train_idxs, val_idxs = train_test_split(
                                range(len(train_validation_dataset)),
                                stratify=stratify,
                                test_size=0.1)

In [4]:
train_dataset = Subset(train_validation_dataset, train_idxs)
val_dataset = Subset(train_validation_dataset, val_idxs)

In [5]:
# parameters
DEVICE = ("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_EPOCHS=10
BATCH_SIZE=32

NUM_LABELS = 10
NUM_FEATURES = 28*28
HIDDEN_SIZE_1 = 100
HIDDEN_SIZE_2 = 50
ALPHA = 0.1

In [6]:
train_dataloader = DataLoader(dataset=train_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              drop_last=True,
                              num_workers=4)

val_dataloader = DataLoader(dataset=val_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=False,
                              drop_last=False,
                              num_workers=4)

test_dataloader = DataLoader(dataset=test_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=False,
                              drop_last=False,
                              num_workers=4)

In [7]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
                nn.Linear(NUM_FEATURES, HIDDEN_SIZE_1),
                nn.Sigmoid(),
                nn.Linear(HIDDEN_SIZE_1, HIDDEN_SIZE_2),
                nn.Sigmoid(),
                nn.Linear(HIDDEN_SIZE_2, NUM_LABELS),
                nn.LogSoftmax(dim=1)
            )
    
    def forward(self, X):
        return self.layers(X)

In [8]:
def validate_epoch(model, dataloader):
    model.eval()
    with torch.inference_mode():
        num_samples = 0
        num_correct = 0
        loss_sum = 0
        batch_nums = 0

        for batch_idx, (features, labels) in enumerate(dataloader):
            features = features.view(-1, NUM_FEATURES).to(DEVICE)
            labels = labels.to(DEVICE) 
            # ------ FORWARD PASS --------
            # first linear transformation
            probs = model(features)
            loss = criterion(probs, labels)
            batch_nums+=1
            loss_sum+=loss.item()

            predictions = probs.argmax(dim=1)
            num_samples+=len(features)
            num_correct+=(labels == predictions).sum().detach().cpu().item()
            
        accuracy = num_correct / num_samples
        avg_loss = loss_sum / batch_nums
        return avg_loss, accuracy

In [9]:
def train_epoch(model, dataloader):
    model.train()
    loss_sum = 0
    batch_nums = 0
    for batch_idx, (features, labels) in enumerate(dataloader):

        # reshape features and move to gpu
        features = features.view(-1, NUM_FEATURES).to(DEVICE)
        # move label to GPU
        labels = labels.to(DEVICE)

        # ------ FORWARD PASS --------
        # first linear transformation
        probs = model(features)

        # ------CALCULATE LOSS --------
        #cross-entropy loss
        loss = criterion(probs, labels)

        # ------BACKPROPAGATION --------
        loss.backward()

        # ------GRADIENT DESCENT --------
        optimizer.step()

        # ------CLEAR GRADIENTS --------
        optimizer.zero_grad()

        # ------TRACK LOSS --------
        loss_sum += loss.detach().cpu().item()
        batch_nums += 1
    return loss_sum / batch_nums


In [10]:
model = Model().to(DEVICE)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=ALPHA)
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_dataloader)
    val_loss, val_accuracy = validate_epoch(model, val_dataloader)
    print(f'Epoch: {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss} | Val Loss: {val_loss} | Val Acc: {val_accuracy}')

Epoch: 1/10 | Train Loss: 1.5840126217718864 | Val Loss: 0.6833014472367915 | Val Acc: 0.8026666666666666
Epoch: 2/10 | Train Loss: 0.49275893600526194 | Val Loss: 0.4023380399622182 | Val Acc: 0.8871666666666667
Epoch: 3/10 | Train Loss: 0.34781832572803667 | Val Loss: 0.32630129273426023 | Val Acc: 0.9093333333333333
Epoch: 4/10 | Train Loss: 0.2878535793521168 | Val Loss: 0.2761212297457647 | Val Acc: 0.9205
Epoch: 5/10 | Train Loss: 0.2442894723955332 | Val Loss: 0.24669151771021017 | Val Acc: 0.9283333333333333
Epoch: 6/10 | Train Loss: 0.211113075137792 | Val Loss: 0.21764235855377417 | Val Acc: 0.9381666666666667
Epoch: 7/10 | Train Loss: 0.1856451758956973 | Val Loss: 0.19449451223927292 | Val Acc: 0.9443333333333334
Epoch: 8/10 | Train Loss: 0.16617056687771567 | Val Loss: 0.18181321852186577 | Val Acc: 0.9478333333333333
Epoch: 9/10 | Train Loss: 0.14985739022093672 | Val Loss: 0.16728815835643004 | Val Acc: 0.951
Epoch: 10/10 | Train Loss: 0.13664886989936048 | Val Loss: 0.1

In [11]:
test_loss, test_accuracy = validate_epoch(model, test_dataloader)
print(f'Test Loss: {test_loss} Test Acc: {test_accuracy}')

Test Loss: 0.14062306454208806 Test Acc: 0.9582


In [12]:
# using weight decay -> L2 loss
model = Model().to(DEVICE)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=ALPHA, weight_decay=0.001)
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_dataloader)
    val_loss, val_accuracy = validate_epoch(model, val_dataloader)
    print(f'Epoch: {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss} | Val Loss: {val_loss} | Val Acc: {val_accuracy}')

Epoch: 1/10 | Train Loss: 1.6074911988763747 | Val Loss: 0.7364835358680563 | Val Acc: 0.8001666666666667
Epoch: 2/10 | Train Loss: 0.5345603316209694 | Val Loss: 0.43251650598137936 | Val Acc: 0.8858333333333334
Epoch: 3/10 | Train Loss: 0.3913343894451627 | Val Loss: 0.37032234383390306 | Val Acc: 0.9
Epoch: 4/10 | Train Loss: 0.3441987585168162 | Val Loss: 0.33873798226580976 | Val Acc: 0.9088333333333334
Epoch: 5/10 | Train Loss: 0.3147267649070568 | Val Loss: 0.3153703131495004 | Val Acc: 0.9138333333333334
Epoch: 6/10 | Train Loss: 0.2928180080691102 | Val Loss: 0.30447521132040534 | Val Acc: 0.9176666666666666
Epoch: 7/10 | Train Loss: 0.2751405779680841 | Val Loss: 0.2792716518520041 | Val Acc: 0.9268333333333333
Epoch: 8/10 | Train Loss: 0.2601328149225914 | Val Loss: 0.26731096897670564 | Val Acc: 0.9296666666666666
Epoch: 9/10 | Train Loss: 0.2485600982654109 | Val Loss: 0.26043352203324754 | Val Acc: 0.9306666666666666
Epoch: 10/10 | Train Loss: 0.23963006790318853 | Val Lo

In [14]:
# add dropout to model
class DropoutModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
                nn.Linear(NUM_FEATURES, HIDDEN_SIZE_1),
                nn.Dropout(p=0.5, inplace=True),
                nn.Sigmoid(),
                nn.Linear(HIDDEN_SIZE_1, HIDDEN_SIZE_2),
                nn.Dropout(p=0.5, inplace=True),
                nn.Sigmoid(),
                nn.Linear(HIDDEN_SIZE_2, NUM_LABELS),
                nn.LogSoftmax(dim=1)
            )
    
    def forward(self, X):
        return self.layers(X)

In [15]:
# using weight decay -> L2 loss
model = DropoutModel().to(DEVICE)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=ALPHA)
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_dataloader)
    val_loss, val_accuracy = validate_epoch(model, val_dataloader)
    print(f'Epoch: {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss} | Val Loss: {val_loss} | Val Acc: {val_accuracy}')

Epoch: 1/10 | Train Loss: 1.753309700922918 | Val Loss: 0.8019436944989448 | Val Acc: 0.7166666666666667
Epoch: 2/10 | Train Loss: 0.7418865424221482 | Val Loss: 0.47760056315901434 | Val Acc: 0.86
Epoch: 3/10 | Train Loss: 0.5100500681252612 | Val Loss: 0.40739288022543524 | Val Acc: 0.89
Epoch: 4/10 | Train Loss: 0.4333171890722836 | Val Loss: 0.3901899809889654 | Val Acc: 0.8966666666666666
Epoch: 5/10 | Train Loss: 0.3958948800460644 | Val Loss: 0.37063586936292653 | Val Acc: 0.9081666666666667
Epoch: 6/10 | Train Loss: 0.36788900220842846 | Val Loss: 0.3519753341047846 | Val Acc: 0.914
Epoch: 7/10 | Train Loss: 0.3485153307492343 | Val Loss: 0.3573096214615284 | Val Acc: 0.9148333333333334
Epoch: 8/10 | Train Loss: 0.329779184120307 | Val Loss: 0.33445950937358304 | Val Acc: 0.9218333333333333
Epoch: 9/10 | Train Loss: 0.31913269574631936 | Val Loss: 0.3317088654829546 | Val Acc: 0.9211666666666667
Epoch: 10/10 | Train Loss: 0.3100332437854903 | Val Loss: 0.3316231705050202 | Val 