## PyTorch based Two Layered Neural Network

In [1]:
# step 1 Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [24]:
# step 2 Prepare XOR data as tensors => Pytorch only works with tensors
X = torch.tensor([[0,0],
                 [0,1],
                 [1,0],
                 [1,1]],dtype = torch.float32) # shape [4,2]
y = torch.tensor([[0],
                  [1],
                  [1],
                  [0]],dtype = torch.float32) # shape [4,1]

In [3]:
print(X.shape)
print(y.shape)

torch.Size([4, 2])
torch.Size([4, 1])


In [73]:
epochs = 100

# step 3 Define a 2 layer nn with nn.module
class TwoLayerNN(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

  def forward(self,x):
    # relu to the output of first linear layer
    h_relu = F.relu(self.linear1(x))

    # final output
    y_pred = self.linear2(h_relu)

    return y_pred


In [74]:
def train_model(X,y,input_dim,hidden_dim,output_dim,lr,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.SGD(model.parameters(),lr = lr) # SGD optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [6]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0528
Training Finished


## Initialization

### Defining Xavier and He initialization in the previous same code

In [7]:
class TwoLayerNN(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

    # Hidden layer uses ReLU, so we use HE init (kaiming)
    # "a=0" refers to the negative slope of the rectifier (0 for standard ReLU)   
    nn.init.kaiming_uniform_(self.linear1.weight, a = 0, nonlinearity = 'relu')

    # Output Layer (logits) is linear/sigmoid-like, so we use XAVIER INIT
    nn.init.xavier_uniform_(self.linear2.weight)
    # Note: PyTorch initializes biases to 0 or small uniform numbers by default, 
    # which is usually fine, so we focus on .weight here.

  def forward(self,x):
    # relu to the output of first linear layer
    h_relu = F.relu(self.linear1(x))

    # final output
    y_pred = self.linear2(h_relu)

    return y_pred

In [8]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0281
Training Finished


## Regularization

### **Dropout**

In [9]:
class TwoLayerNet(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, use_dropout =False, p_dropout = 0.5):
    super().__init__()
    # 1. Linear Layer (Input to Hidden)
    self.linear1 = nn.Linear(input_dim,hidden_dim)

    # Define Droput layer
    # here p = 0.5 means 50% of neurons will be zeroed out during training
    self.dropout = nn.Dropout(p = 0.5)

    # 2. Linear Layer (Hidden to Output)
    self.linear2 = nn.Linear(hidden_dim,output_dim)

    # Hidden layer uses ReLU, so we use HE init (kaiming)
    # "a=0" refers to the negative slope of the rectifier (0 for standard ReLU)   
    nn.init.kaiming_uniform_(self.linear1.weight, a = 0, nonlinearity = 'relu')

    # Output Layer (logits) is linear/sigmoid-like, so we use XAVIER INIT
    nn.init.xavier_uniform_(self.linear2.weight)
    # Note: PyTorch initializes biases to 0 or small uniform numbers by default, 
    # which is usually fine, so we focus on .weight here.

  def forward(self,x):
    # relu to the output of first linear layer
    x = F.relu(self.linear1(x))

    # apply dropout after activation and before the next layer ! important
    x = self.dropout(x)

    # final output
    y_pred = self.linear2(x)

    return y_pred

In [10]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0309
Training Finished


### **Weight Decay**

In [11]:
# 1. Update the Network Structure (Adding Dropout)
class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H)
        
        # --- C2.3: Define Dropout Layer ---
        # p=0.5 means 50% of neurons will be zeroed out during training
        self.dropout = nn.Dropout(p=0.5) 
        
        self.linear2 = nn.Linear(H, D_out)
        
        # (Re-applying Initialization from C2.1/C2.2)
        nn.init.kaiming_uniform_(self.linear1.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.linear2.weight)

    def forward(self, x):
        # Apply Linear 1 -> ReLU
        x = F.relu(self.linear1(x))
        
        # --- C2.3: Apply Dropout ---
        # Crucial: Dropout is applied after activation, before the next layer
        x = self.dropout(x)
        
        # Apply Linear 2
        y_pred = self.linear2(x)
        return y_pred

# Define Data & Hyperparameters
D_in, H, D_out = 2, 10, 1
model = TwoLayerNet(D_in, H, D_out)
criterion = nn.BCEWithLogitsLoss()

# --- C2.4: Add L2 Weight Decay to Optimizer ---
# 'weight_decay=1e-4' adds the L2 penalty to the loss automatically
optimizer = optim.SGD(model.parameters(), lr=1, weight_decay=1e-4)
print('Training Started....')
# training loop 
for epoch in range(1000):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

print('Training Finished')

# --- Training Loop Note ---
# When using Dropout, you MUST toggle model modes!
# model.train() -> Turns Dropout ON
# model.eval()  -> Turns Dropout OFF (uses all neurons for prediction)

# Example Loop Snippet:
# model.train() # Set to training mode
# ... training loop code (forward, backward, step) ...

# model.eval() # Set to evaluation mode
# ... testing/validation code ...

Training Started....
Epoch 100/100, Loss: 0.6654
Epoch 200/100, Loss: 0.1660
Epoch 300/100, Loss: 0.5067
Epoch 400/100, Loss: 0.5181
Epoch 500/100, Loss: 0.3484
Epoch 600/100, Loss: 0.3562
Epoch 700/100, Loss: 0.5339
Epoch 800/100, Loss: 0.1774
Epoch 900/100, Loss: 0.5606
Epoch 1000/100, Loss: 0.5758
Training Finished


import torch.optim as optim

### OPTION A: SGD with Momentum
### momentum=0.9 is a standard value. 
### It helps push through the small fluctuations you saw earlier.
`optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)`
# --- OR ---

### OPTION B: Adam (Recommended)
### Notice we usually use a lower Learning Rate (0.001 vs 0.01) for Adam
### because it is very efficient.
`optimizer = optim.Adam(model.parameters(), lr=0.001)`

In [12]:
def train_model(X,y,input_dim,hidden_dim,output_dim,lr,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.SGD(model.parameters(),lr = lr ,momentum =0.9) # SGD with momentum optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [13]:
train_model(X,y,2,10,1,lr = 0.5,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0001
Training Finished


In [81]:
def train_model_adam(X,y,input_dim,hidden_dim,output_dim,lr  = 0.1,epochs = 100):
  # initialization
  model = TwoLayerNN(input_dim,hidden_dim,output_dim ) # for a XOR dataset
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.Adam(model.parameters(),lr = lr) # Adam optimizer

  print('Training Started....')
  # training loop 
  for epoch in range(epochs):
    optimizer.zero_grad() # clear the gradients from previous iterations

    outputs = model(X) # pass the X data to model and calculate the output

    loss = criterion(outputs,y) # compute the loss value

    loss.backward() # compute the gradients for all parameters (Autograd)

    optimizer.step() # update the weights based on the computed gradients

    if (epoch + 1) % 100 == 0:
      print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

  print('Training Finished')

In [25]:
train_model_adam(X,y,2,10,1,epochs = 100)

Training Started....
Epoch 100/100, Loss: 0.0008
Training Finished


In [50]:
from tensorflow.keras.datasets import mnist
(x_train,y_train) , (x_test,y_test) = mnist.load_data()

In [85]:
x_test  = x_test.reshape(x_test.shape[0],-1)
x_test.shape

(10000, 784)

In [86]:
X_mnist = torch.tensor(x_test,dtype = torch.float32)
X_mnist.shape

torch.Size([10000, 784])

In [87]:
import numpy as np
def to_one_hot(y, num_classes=2):
    m = y.shape[0]
    oh = np.zeros((m, num_classes))
    oh[np.arange(m), y] = 1
    return oh

In [88]:
y_mnist = to_one_hot(y_test,num_classes = 10)
y_mnist = torch.tensor(y_mnist,dtype = torch.float32)
y_mnist.shape

torch.Size([10000, 10])

In [91]:
train_model_adam(X_mnist,y_mnist,784,20,10,lr = 0.5,epochs = 10000)

Training Started....
Epoch 100/10000, Loss: 0.3249
Epoch 200/10000, Loss: 0.3249
Epoch 300/10000, Loss: 0.3249
Epoch 400/10000, Loss: 0.3249
Epoch 500/10000, Loss: 0.3249
Epoch 600/10000, Loss: 0.3249
Epoch 700/10000, Loss: 0.3249
Epoch 800/10000, Loss: 0.3249
Epoch 900/10000, Loss: 0.3249
Epoch 1000/10000, Loss: 0.3249
Epoch 1100/10000, Loss: 0.3249
Epoch 1200/10000, Loss: 0.3249
Epoch 1300/10000, Loss: 0.3249
Epoch 1400/10000, Loss: 0.3249
Epoch 1500/10000, Loss: 0.3249
Epoch 1600/10000, Loss: 0.3249
Epoch 1700/10000, Loss: 0.3249
Epoch 1800/10000, Loss: 0.3249
Epoch 1900/10000, Loss: 0.3249
Epoch 2000/10000, Loss: 0.3249
Epoch 2100/10000, Loss: 0.3249
Epoch 2200/10000, Loss: 0.3249
Epoch 2300/10000, Loss: 0.3249
Epoch 2400/10000, Loss: 0.3249
Epoch 2500/10000, Loss: 0.3249
Epoch 2600/10000, Loss: 0.3249
Epoch 2700/10000, Loss: 0.3249
Epoch 2800/10000, Loss: 0.3249
Epoch 2900/10000, Loss: 0.3249
Epoch 3000/10000, Loss: 0.3249
Epoch 3100/10000, Loss: 0.3249
Epoch 3200/10000, Loss: 0.3

KeyboardInterrupt: 

In [114]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [115]:
# --- 1. SETUP DATA ---

# Define the transformation: Convert to Tensor and Normalize
# FashionMNIST mean=0.5, std=0.5 is a safe standard approximation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Download and Load Training Data
train_data = datasets.FashionMNIST(
    root='./data', 
    train=True, 
    download=True, 
    transform=transform
)

# Download and Load Test Data (to check accuracy later)
test_data = datasets.FashionMNIST(
    root='./data', 
    train=False, 
    download=True, 
    transform=transform
)

# Create DataLoaders (The "Slicer")
# batch_size=64: Feed 64 images at a time
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# --- 2. DEFINE THE NETWORK ---

class TwoLayerNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Input: 28x28 = 784 pixels
        # Hidden: 256 neurons (Standard size for simple image tasks)
        # Output: 10 classes (T-shirt, Trouser, Pullover, etc.)
        self.linear1 = nn.Linear(784, 256)
        self.linear2 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(p=0.2) # Simple regularization

    def forward(self, x):
        # Flatten the image: (Batch_Size, 1, 28, 28) -> (Batch_Size, 784)
        x = x.view(-1, 784)
        
        # Layer 1 -> ReLU -> Dropout
        x = torch.relu(self.linear1(x))
        x = self.dropout(x)
        
        # Layer 2 (Output Logits)
        x = self.linear2(x)
        return x

# Initialize Model, Loss, and Optimizer
model = TwoLayerNet()
criterion = nn.CrossEntropyLoss() # Handles Softmax internally
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- 3. TRAINING LOOP ---

epochs = 5
print(f"Starting training on {len(train_data)} images...")

for epoch in range(epochs):
    model.train() # Set to training mode (Enable Dropout)
    running_loss = 0.0
    
    # Inner Loop: Iterate through batches
    for batch_idx, (images, labels) in enumerate(train_loader):
        
        # 1. Zero Gradients
        optimizer.zero_grad()
        
        # 2. Forward Pass
        outputs = model(images)
        
        # 3. Calculate Loss
        loss = criterion(outputs, labels)
        
        # 4. Backward Pass (Autograd)
        loss.backward()
        
        # 5. Optimizer Step (Update Weights)
        optimizer.step()
        
        running_loss += loss.item()
        
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# --- 4. EVALUATION (TESTING) ---

print("\nEvaluating on Test Data...")
model.eval() # Set to evaluation mode (Disable Dropout)
correct = 0
total = 0

# No gradients needed for evaluation (saves memory)
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        
        # Get the predicted class (index of the max value)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy on 10,000 test images: {100 * correct / total:.2f}%")

Starting training on 60000 images...
Epoch 1/5, Loss: 0.5100
Epoch 2/5, Loss: 0.3981
Epoch 3/5, Loss: 0.3636
Epoch 4/5, Loss: 0.3438
Epoch 5/5, Loss: 0.3271

Evaluating on Test Data...
Accuracy on 10,000 test images: 87.11%
