In [1]:
# Upload kaggle.json (Kaggle API credentials) through the Colab file picker.
from google.colab import files
files.upload()
# Move the uploaded credentials into ~/.kaggle and restrict them to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the Fashion-MNIST archive with the Kaggle CLI.
!kaggle datasets download -d zalando-research/fashionmnist

# Extract the archive into the ./fashionmnist directory.
!unzip fashionmnist.zip -d fashionmnist

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/zalando-research/fashionmnist
License(s): other
Downloading fashionmnist.zip to /content
 99% 68.0M/68.8M [00:00<00:00, 208MB/s]
100% 68.8M/68.8M [00:00<00:00, 174MB/s]
Archive:  fashionmnist.zip
  inflating: fashionmnist/fashion-mnist_test.csv  
  inflating: fashionmnist/fashion-mnist_train.csv  
  inflating: fashionmnist/t10k-images-idx3-ubyte  
  inflating: fashionmnist/t10k-labels-idx1-ubyte  
  inflating: fashionmnist/train-images-idx3-ubyte  
  inflating: fashionmnist/train-labels-idx1-ubyte  


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable from run to run.
torch.manual_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load the training CSV extracted in the previous cell and display it.
df = pd.read_csv("/content/fashionmnist/fashion-mnist_train.csv")
df

Using device: cuda


Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,1,0,0,0,0,0,0,0,0,0,...,73,0,0,0,0,0,0,0,0,0
59997,8,0,0,0,0,0,0,0,0,0,...,160,162,163,135,94,0,0,0,0,0
59998,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Column 0 is the class label; every remaining column is one pixel of the image.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Divide by 255.0 so pixel intensities land in [0, 1]
# (assumes 8-bit pixel values -- TODO confirm against the dataset description).
X_train, X_test = X_train / 255.0, X_test / 255.0

class CustomDataset(Dataset):
  """In-memory dataset that pairs feature rows with integer class labels.

  Args:
    features: Array-like of numeric rows; converted to a float32 tensor.
    labels: Array-like of integer class ids; converted to a long (int64) tensor.

  Raises:
    ValueError: If `features` and `labels` hold a different number of samples.
  """

  def __init__(self, features, labels):
    self.features = torch.tensor(features, dtype=torch.float32)
    self.labels = torch.tensor(labels, dtype=torch.long)

    # Fail fast: __len__ is derived from the features only, so a shorter label
    # tensor would otherwise surface as an index error in the middle of an epoch.
    if len(self.features) != len(self.labels):
      raise ValueError(
          f"features and labels must have the same length, "
          f"got {len(self.features)} and {len(self.labels)}"
      )

  def __len__(self):
    """Return the number of samples."""
    return len(self.features)

  def __getitem__(self, idx):
    """Return the `(features, label)` pair stored at position `idx`."""
    return self.features[idx], self.labels[idx]

train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Pinned (page-locked) host memory only speeds up host-to-GPU copies. On the CPU
# fallback it is useless and makes DataLoader emit a warning, so enable it only
# when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Shuffle only the training batches; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=use_pinned_memory)

### Our Model is Overfitting so we need to optimize our model to reduce overfitting

How to reduce Overfitting

1. Adding more data
2. Reducing the complexity of NN Architecture
3. Regularization (We add a penalty in loss function so while training we will also reduce that penalty term along with the loss)
4. Dropouts
5. Data Augmentation
6. Batch Normalization
7. Early Stopping

### Dropout

1. Applied to the hidden layers
2. Applied after the ReLU activation function
3. Randomly turns off each neuron in the hidden layer with probability p (e.g. p=0.3 drops about 30% of them) during each training forward pass
4. This has a regularization effect
5. During evaluation dropout is not used

### Batch Normalization

1. Applied to Hidden Layers
  - Typically to the hidden layers of the NN, but not to the output layer

2. Applied after Linear Layers and Before Activation Functions
  - Normalizes the output of the preceding layer (e.g. after nn.Linear) and is usually followed by an activation function (e.g. ReLU)

3. Normalizes Activation
  - Computes the mean and variance of the activations within a mini-batch and uses these statistics to normalize the activations

4. Includes Learnable Parameters
  - Introduces two learnable parameters, gamma(scaling) and beta(shifting), which allow the network to adjust the normalized outputs

5. Improves Training Stability
  - Reduces internal covariate shift, stabilizing the training process and allowing the use of higher learning rates

6. Regularization Effect
  - Introduces some regularization because the statistics are computed over a mini-batch, adding noise to the training process

7. Consistent During Evaluation
  - During evaluation, BatchNorm uses the running mean and variance accumulated during training, rather than recomputing them from the mini-batch

### L2 Regularization

1. Applied to Model Weights
  - Regularization is applied to the weights of the model to penalize large values and encourage smaller, more generalizable weights

2. Introduced via Loss Function or Optimizer
  - Adds a penalty term lambda * sum(w_i^2) (the sum of the squared weights, scaled by lambda) to the loss function in L2 regularization
    ```
    Loss(reg) = Loss(original) + lambda * sum(w_i^2)
    ```
  - In weight decay, directly modifies the gradient update rule to include lambda * w_i, effectively shrinking weights during training
    ```
    w <- w - eta * (dLoss/dw + lambda * w)
    ```
3. Penalizes Large Weights
  - Encourages the network to distribute learning across multiple parameters, avoiding reliance on a few large weights

4. Reduces Overfitting
  - Helps the model generalize better to unseen data by discouraging overly complex representations

5. Controlled by a Hyperparameter
  - A regularization coefficient (lambda, often set via `weight_decay` in optimizers) controls the strength of the penalty; larger values lead to stronger regularization

6. No Effect on Bias Terms
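  - Regularization is typically applied only to weights, not biases, as biases don't directly affect model complexity (note: passing `model.parameters()` to a PyTorch optimizer with `weight_decay` decays the biases too, unless separate parameter groups are used)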

7. Active during training
  - Regularization affects weight updates only during training. It does not explicitly influence the model during inference.

In [7]:
class MyNN(nn.Module):
  """Fully connected classifier: num_features -> 128 -> 64 -> 10 class scores.

  Both hidden layers are Linear -> BatchNorm1d -> ReLU; only the first hidden
  layer is followed by Dropout (p=0.3). The final Linear layer returns raw,
  un-normalised scores (no softmax is applied here).

  Args:
    num_features: Number of input features per sample.
  """

  def __init__(self, num_features):
    super().__init__()

    # Layers are created in network order so that seeded weight initialisation
    # and the Sequential indices stay the same.
    first_hidden = [
        nn.Linear(num_features, 128),
        nn.BatchNorm1d(128),
        nn.ReLU(),
        nn.Dropout(p=0.3),
    ]
    second_hidden = [
        nn.Linear(128, 64),
        nn.BatchNorm1d(64),
        nn.ReLU(),
    ]
    output_head = [nn.Linear(64, 10)]

    self.model = nn.Sequential(*first_hidden, *second_hidden, *output_head)

  def forward(self, x):
    """Return the 10 raw class scores for each row of `x`."""
    logits = self.model(x)
    return logits

In [8]:
# Training hyperparameters, read by the optimizer setup and the training loop.
learning_rate = 0.1  # step size passed to the optimizer
epochs = 50          # number of full passes over the training loader

In [9]:
# Size the input layer from the training matrix and place the network on the
# selected device in one step.
model = MyNN(X_train.shape[1]).to(device)

# Cross-entropy loss between the model's class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

# SGD with a small weight-decay term applied to all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)

In [10]:
for epoch in range(epochs):
  # Explicitly (re-)enable training mode every epoch. Dropout and BatchNorm
  # behave differently in eval mode, and a later cell calls model.eval(), so
  # re-running this cell afterwards would otherwise train with dropout disabled
  # and BatchNorm using its running statistics.
  model.train()

  total_epoch_loss = 0
  for batch_features, batch_labels in train_loader:
    # Move the batch to the same device as the model
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)

    # Forward pass
    outputs = model(batch_features)
    loss = criterion(outputs, batch_labels)

    # Backward pass: clear stale gradients, backpropagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_epoch_loss += loss.item()

  # Mean of the per-batch losses; kept under its own name so it does not
  # overwrite the batch-level `loss` tensor.
  avg_epoch_loss = total_epoch_loss / len(train_loader)
  print(f'Epoch {epoch+1}, Loss: {avg_epoch_loss}')

Epoch 1, Loss: 0.5674902456899484
Epoch 2, Loss: 0.4456941350499789
Epoch 3, Loss: 0.41252840200066565
Epoch 4, Loss: 0.3889859627187252
Epoch 5, Loss: 0.37099999423821767
Epoch 6, Loss: 0.36042106902599336
Epoch 7, Loss: 0.3498532522395253
Epoch 8, Loss: 0.34292030820747216
Epoch 9, Loss: 0.3286772373964389
Epoch 10, Loss: 0.32509373744080466
Epoch 11, Loss: 0.3171027289032936
Epoch 12, Loss: 0.31070820153007905
Epoch 13, Loss: 0.3069813267638286
Epoch 14, Loss: 0.2991847218821446
Epoch 15, Loss: 0.29712308797985315
Epoch 16, Loss: 0.2917733385488391
Epoch 17, Loss: 0.28906752438594896
Epoch 18, Loss: 0.28667059605568646
Epoch 19, Loss: 0.28388199580212437
Epoch 20, Loss: 0.2816689369305968
Epoch 21, Loss: 0.2764309098422527
Epoch 22, Loss: 0.2739927493830522
Epoch 23, Loss: 0.2716795019408067
Epoch 24, Loss: 0.27032283843308685
Epoch 25, Loss: 0.266225481753548
Epoch 26, Loss: 0.2660126108850042
Epoch 27, Loss: 0.26294780943542717
Epoch 28, Loss: 0.26437255627661943
Epoch 29, Loss: 0

In [11]:
# Switch the model to evaluation mode (dropout off, BatchNorm uses its running
# statistics). As the last expression of the cell, the returned module is displayed.
model.eval()

MyNN(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [12]:
# Evaluation code on Test data
# Set eval mode here as well, so this cell gives the right answer even when it
# is re-run after training without executing the separate `model.eval()` cell
# (otherwise dropout would stay active and BatchNorm would use batch statistics).
model.eval()

total = 0
correct = 0
# Gradients are not needed for scoring
with torch.no_grad():
  for batch_features, batch_labels in test_loader:
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)

    outputs = model(batch_features)
    # Predicted class = index of the largest score in each row
    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]
    correct = correct + (predicted == batch_labels).sum().item()

# Fraction of held-out samples classified correctly
print(correct/total)

0.8873333333333333


In [13]:
# Evaluation code on Training Data
# Set eval mode explicitly, so this cell does not depend on an earlier cell
# having called `model.eval()` (otherwise dropout would stay active and
# BatchNorm would use batch statistics while scoring).
model.eval()

total = 0
correct = 0
# Gradients are not needed for scoring
with torch.no_grad():
  for batch_features, batch_labels in train_loader:
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)

    outputs = model(batch_features)
    # Predicted class = index of the largest score in each row
    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]
    correct = correct + (predicted == batch_labels).sum().item()

# Fraction of training samples classified correctly
print(correct/total)

0.9360833333333334


### If the gap between training accuracy and test accuracy is large, the model is likely overfitting