## PyTorch based Two Layered Neural Network

In [55]:
# step 1 Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [56]:
# step 2 Prepare XOR data as tensors => Pytorch only works with tensors
X = torch.tensor([[0,0],
                 [0,1],
                 [1,0],
                 [1,1]],dtype = torch.float32) # shape [4,2]
y = torch.tensor([[0],
                  [1],
                  [1],
                  [0]],dtype = torch.float32) # shape [4,1]

In [57]:
print(X.shape)
print(y.shape)

torch.Size([4, 2])
torch.Size([4, 1])


In [58]:
epochs = 100

# step 3 Define a 2 layer nn with nn.module
class TwoLayerNN(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

  def forward(self,x):
    # relu to the output of first linear layer
    h_relu = F.relu(self.linear1(x))

    # final output
    y_pred = self.linear2(h_relu)

    return y_pred


In [67]:
def train_model(X,y,input_dim,hidden_dim,output_dim,lr,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.SGD(model.parameters(),lr = lr) # SGD optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [75]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0273
Training Finished


## Initialization

### Defining Xavier and He initialization in the previous same code

In [76]:
class TwoLayerNN(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

    # Hidden layer uses ReLU, so we use HE init (kaiming)
    # "a=0" refers to the negative slope of the rectifier (0 for standard ReLU)   
    nn.init.kaiming_uniform_(self.linear1.weight, a = 0, nonlinearity = 'relu')

    # Output Layer (logits) is linear/sigmoid-like, so we use XAVIER INIT
    nn.init.xavier_uniform_(self.linear2.weight)
    # Note: PyTorch initializes biases to 0 or small uniform numbers by default, 
    # which is usually fine, so we focus on .weight here.

  def forward(self,x):
    # relu to the output of first linear layer
    h_relu = F.relu(self.linear1(x))

    # final output
    y_pred = self.linear2(h_relu)

    return y_pred

In [79]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0339
Training Finished


## Regularization

### **Dropout**

In [80]:
class TwoLayerNet(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # Define Droput layer
    # here p = 0.5 means 50% of neurons will be zeroed out during training
    self.dropout = nn.Dropout(p = 0.5)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

    # Hidden layer uses ReLU, so we use HE init (kaiming)
    # "a=0" refers to the negative slope of the rectifier (0 for standard ReLU)   
    nn.init.kaiming_uniform_(self.linear1.weight, a = 0, nonlinearity = 'relu')

    # Output Layer (logits) is linear/sigmoid-like, so we use XAVIER INIT
    nn.init.xavier_uniform_(self.linear2.weight)
    # Note: PyTorch initializes biases to 0 or small uniform numbers by default, 
    # which is usually fine, so we focus on .weight here.

  def forward(self,x):
    # relu to the output of first linear layer
    x = F.relu(self.linear1(x))

    # apply dropout after activation and before the next layer ! important
    x = self.dropout(x)

    # final output
    y_pred = self.linear2(x)

    return y_pred

In [83]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0294
Training Finished


### **Weight Decay**

In [84]:
# 1. Update the Network Structure (Adding Dropout)
class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H)
        
        # --- C2.3: Define Dropout Layer ---
        # p=0.5 means 50% of neurons will be zeroed out during training
        self.dropout = nn.Dropout(p=0.5) 
        
        self.linear2 = nn.Linear(H, D_out)
        
        # (Re-applying Initialization from C2.1/C2.2)
        nn.init.kaiming_uniform_(self.linear1.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.linear2.weight)

    def forward(self, x):
        # Apply Linear 1 -> ReLU
        x = F.relu(self.linear1(x))
        
        # --- C2.3: Apply Dropout ---
        # Crucial: Dropout is applied after activation, before the next layer
        x = self.dropout(x)
        
        # Apply Linear 2
        y_pred = self.linear2(x)
        return y_pred

# Define Data & Hyperparameters
D_in, H, D_out = 2, 10, 1
model = TwoLayerNet(D_in, H, D_out)
criterion = nn.BCEWithLogitsLoss()

# --- C2.4: Add L2 Weight Decay to Optimizer ---
# 'weight_decay=1e-4' adds the L2 penalty to the loss automatically
optimizer = optim.SGD(model.parameters(), lr=1, weight_decay=1e-4)
print('Training Started....')
# training loop 
for epoch in range(1000):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

print('Training Finished')

# --- Training Loop Note ---
# When using Dropout, you MUST toggle model modes!
# model.train() -> Turns Dropout ON
# model.eval()  -> Turns Dropout OFF (uses all neurons for prediction)

# Example Loop Snippet:
# model.train() # Set to training mode
# ... training loop code (forward, backward, step) ...

# model.eval() # Set to evaluation mode
# ... testing/validation code ...

Training Started....
Epoch 100/100, Loss: 0.1399
Epoch 200/100, Loss: 0.6762
Epoch 300/100, Loss: 0.1703
Epoch 400/100, Loss: 0.2794
Epoch 500/100, Loss: 0.0976
Epoch 600/100, Loss: 0.1531
Epoch 700/100, Loss: 0.5138
Epoch 800/100, Loss: 0.0009
Epoch 900/100, Loss: 0.0674
Epoch 1000/100, Loss: 0.1313
Training Finished


import torch.optim as optim

### OPTION A: SGD with Momentum
### momentum=0.9 is a standard value. 
### It helps push through the small fluctuations you saw earlier.
`optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)`
# --- OR ---

### OPTION B: Adam (Recommended)
### Notice we usually use a lower Learning Rate (0.001 vs 0.01) for Adam
### because it is very efficient.
`optimizer = optim.Adam(model.parameters(), lr=0.001)`

In [86]:
def train_model(X,y,input_dim,hidden_dim,output_dim,lr,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.SGD(model.parameters(),lr = lr ,momentum =0.9) # SGD with momentum optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [96]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0001
Training Finished


In [None]:
def train_model(X,y,input_dim,hidden_dim,output_dim,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.Adam(model.parameters(),lr = 0.1) # Adam optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [106]:
train_model(X,y,2,10,1,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0001
Training Finished


In [107]:
from tensorflow.keras.datasets import mnist
(x_train,y_train) , (x_test,y_test) = mnist.load_data()

In [109]:
X = torch.tensor(x_train,dtype = torch.float32)

In [114]:
import numpy as np
def to_one_hot(y, num_classes=2):
    m = y.shape[0]
    oh = np.zeros((m, num_classes))
    oh[np.arange(m), y] = 1
    return oh

In [118]:
y = to_one_hot(y_train,num_classes = 10)
y = torch.tensor(y,dtype = torch.float32)

In [123]:
X = X.reshape(X.shape[0],-1)
X.shape

torch.Size([60000, 784])

In [125]:
train_model(X,y,784,20,1,epochs = 100)

Training Started....


ValueError: Target size (torch.Size([60000, 10])) must be the same as input size (torch.Size([60000, 1]))