<a href="https://colab.research.google.com/github/ParvanehFaraji/Neural_Network/blob/main/NN_Project_Parvaneh_Faraji.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Run on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define the MLP model
class MLP(nn.Module):
    """Fully connected feed-forward network with ReLU hidden activations.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
        output_size: Number of output units (the last layer is a plain
            ``nn.Linear`` with no activation after it).
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer.
        widths = [input_size, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Output projection from the last hidden width (or the input itself).
        stack.append(nn.Linear(widths[-1], output_size))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.model(x)

# Model and optimisation settings.
# input_size matches a flattened 28 x 28 image, which is how the loops below feed the model.
input_size, output_size = 28 * 28, 10
hidden_sizes = [128, 64]
learning_rate, batch_size, epochs = 0.001, 64, 10

# Preprocessing: convert to a tensor, then Normalize with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Fashion-MNIST train and test splits (train first, then test), stored under ./data.
train_dataset, test_dataset = (
    datasets.FashionMNIST(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Build the network on the selected device, together with its loss and optimiser.
model = MLP(input_size, hidden_sizes, output_size)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Optimise for `epochs` full passes over the training loader.
print("Training the model...")
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        # Collapse each sample to a single row before it enters the first Linear layer.
        images = images.view(images.shape[0], -1).to(device)
        labels = labels.to(device)

        # Clear old gradients, then forward -> loss -> backward -> parameter update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Epoch summary: accumulated batch losses divided by the number of batches.
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss:.4f}")

# Evaluate on the test loader with gradient tracking switched off.
model.eval()
correct = total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(images.shape[0], -1).to(device)
        labels = labels.to(device)

        outputs = model(images)
        # The column index of the largest output in each row is the predicted class.
        predicted = outputs.max(dim=1).indices
        total += labels.shape[0]
        correct += int((predicted == labels).sum())

# Percentage of test samples whose predicted class equals the label.
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


Training the model...
Epoch [1/10], Loss: 0.5144
Epoch [2/10], Loss: 0.3814
Epoch [3/10], Loss: 0.3383
Epoch [4/10], Loss: 0.3173
Epoch [5/10], Loss: 0.2999
Epoch [6/10], Loss: 0.2825
Epoch [7/10], Loss: 0.2705
Epoch [8/10], Loss: 0.2570
Epoch [9/10], Loss: 0.2486
Epoch [10/10], Loss: 0.2386
Test Accuracy: 87.12%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

# Select the compute device: CUDA if it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define the MLP model
class MLP(nn.Module):
    """Fully connected feed-forward network with a configurable activation.

    Every hidden ``nn.Linear`` is followed by its own deep copy of
    ``activation_fn``. The copy matters for activations that own parameters
    or buffers (e.g. ``nn.PReLU``): placing the *same* instance after every
    layer, or reusing it across several models, would silently tie that state
    together. Stateless activations such as ``nn.ReLU`` behave as before.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
        output_size: Number of output units (the last layer is a plain
            ``nn.Linear`` with no activation after it).
        activation_fn: ``nn.Module`` instance used as the template for the
            activation inserted after every hidden layer. It is copied, never
            registered directly.
    """

    def __init__(self, input_size, hidden_sizes, output_size, activation_fn):
        # Imported locally so the class works wherever the cell is pasted.
        import copy

        super().__init__()
        layers = []
        in_features = input_size

        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            # Independent copy per layer: no state shared between layers/models.
            layers.append(copy.deepcopy(activation_fn))
            in_features = hidden_size

        # Output projection; no activation is applied after it.
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.model(x)

# Model and optimisation settings.
input_size = 32 * 32 * 3  # CIFAR-10 images are 32x32x3
output_size = 10
hidden_sizes = [256, 128]
learning_rate, batch_size, epochs = 0.001, 64, 5

# Preprocessing: convert to a tensor, then Normalize each of the 3 channels with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 train and test splits (train first, then test), stored under ./data.
train_dataset, test_dataset = (
    datasets.CIFAR10(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Train and evaluate one model per activation function.
activation_functions = {
    "ReLU": nn.ReLU(),
    "Tanh": nn.Tanh(),
    "Sigmoid": nn.Sigmoid()
}

# Maps activation name -> {"accuracy", "loss", "training_time"} for that run.
results = {}

for name, activation_fn in activation_functions.items():
    print(f"\nTraining with {name} activation function")

    # Re-seed before building each model so the runs are reproducible and
    # every activation starts from the same RNG state; without this,
    # run-to-run noise is mixed into the comparison.
    torch.manual_seed(0)

    # Initialize model, loss, and optimizer
    model = MLP(input_size, hidden_sizes, output_size, activation_fn).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed as a whole; evaluation below is not included)
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        seen = 0  # number of training samples processed this epoch
        for images, labels in train_loader:
            images = images.view(images.size(0), -1).to(device)  # Flatten images
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch *mean*; weight it by the batch size
            # so a short final batch does not count as much as a full one.
            running_loss += loss.item() * labels.size(0)
            seen += labels.size(0)

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / seen:.4f}")

    training_time = time.time() - start_time

    # Testing loop
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.view(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            # Accumulate the summed (not averaged) loss of this batch.
            test_loss += loss.item() * labels.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    # Per-sample average over the whole test set.
    test_loss /= total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {test_loss:.4f}, Training Time: {training_time:.2f} seconds")

    # Store results
    results[name] = {
        "accuracy": accuracy,
        "loss": test_loss,
        "training_time": training_time
    }

# Print one summary line per activation function, in the order they were run.
print("\nComparison of Activation Functions:")
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics['accuracy']:.2f}%, Loss = {metrics['loss']:.4f}, Training Time = {metrics['training_time']:.2f} seconds")

# Fixed commentary; it is not derived from the measured results above.
print("\nExplanation:")
print("ReLU performs better for deep networks because it largely avoids the vanishing gradient problem, allowing gradients to flow effectively. Tanh and Sigmoid can struggle due to saturating outputs, which diminish gradient updates in deeper layers. However, they may still perform well in shallower networks or specific tasks.")


Files already downloaded and verified
Files already downloaded and verified

Training with ReLU activation function
Epoch [1/5], Loss: 1.6358
Epoch [2/5], Loss: 1.4281
Epoch [3/5], Loss: 1.3250
Epoch [4/5], Loss: 1.2450
Epoch [5/5], Loss: 1.1734
Test Accuracy: 51.93%, Test Loss: 1.3834, Training Time: 101.82 seconds

Training with Tanh activation function
Epoch [1/5], Loss: 1.7558
Epoch [2/5], Loss: 1.6209
Epoch [3/5], Loss: 1.5509
Epoch [4/5], Loss: 1.5000
Epoch [5/5], Loss: 1.4535
Test Accuracy: 46.40%, Test Loss: 1.5276, Training Time: 100.80 seconds

Training with Sigmoid activation function
Epoch [1/5], Loss: 1.7876
Epoch [2/5], Loss: 1.5937
Epoch [3/5], Loss: 1.5006
Epoch [4/5], Loss: 1.4272
Epoch [5/5], Loss: 1.3691
Test Accuracy: 48.33%, Test Loss: 1.4609, Training Time: 100.36 seconds

Comparison of Activation Functions:
ReLU: Accuracy = 51.93%, Loss = 1.3834, Training Time = 101.82 seconds
Tanh: Accuracy = 46.40%, Loss = 1.5276, Training Time = 100.80 seconds
Sigmoid: Accurac

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time
from itertools import product

# Run on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define the MLP model
class MLP(nn.Module):
    """Multi-layer perceptron whose hidden activation is supplied by the caller.

    ``activation_fn`` is treated as a template: each hidden layer receives a
    separate deep copy. Callers in this file pass one activation instance to
    many models; copying keeps any parameters or buffers the activation may
    own (e.g. ``nn.PReLU``) from being shared between layers or between
    models. For stateless activations such as ``nn.ReLU`` nothing changes.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
        output_size: Number of output units (the last layer is a plain
            ``nn.Linear`` with no activation after it).
        activation_fn: ``nn.Module`` instance to copy after each hidden layer.
    """

    def __init__(self, input_size, hidden_sizes, output_size, activation_fn):
        # Imported locally so the class works wherever the cell is pasted.
        import copy

        super().__init__()
        layers = []
        in_features = input_size

        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            # Fresh copy per layer so no activation state is shared.
            layers.append(copy.deepcopy(activation_fn))
            in_features = hidden_size

        # Output projection; no activation is applied after it.
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.model(x)

# CIFAR-10 preprocessing: convert to a tensor, then Normalize each of the
# 3 channels with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Train and test splits (train first, then test), stored under ./data.
train_dataset, test_dataset = (
    datasets.CIFAR10(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Hyperparameter tuning setup: the full grid of these three lists is evaluated.
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
hidden_layer_configs = [[256, 128], [512, 256, 128]]
epochs = 5
activation_fn = nn.ReLU()

# Maps (lr, batch_size, tuple(hidden_layers)) -> metrics dict for that run.
results = {}

# NOTE(review): configurations are ranked on the test split below; consider
# carving a validation split out of the training data for model selection.
for lr, batch_size, hidden_layers in product(learning_rates, batch_sizes, hidden_layer_configs):
    print(f"\nTraining with learning_rate={lr}, batch_size={batch_size}, hidden_layers={hidden_layers}")

    # Re-seed per configuration so each run is reproducible and starts from
    # the same RNG state instead of wherever the previous run left it.
    torch.manual_seed(0)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, loss, and optimizer
    model = MLP(32 * 32 * 3, hidden_layers, 10, activation_fn).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop (timed as a whole; evaluation below is not included)
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        seen = 0  # number of training samples processed this epoch
        for images, labels in train_loader:
            images = images.view(images.size(0), -1).to(device)  # Flatten images
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch *mean*; weight it by the batch size
            # so the short final batch is not over-counted. This also keeps
            # the figure comparable across the different batch sizes tried.
            running_loss += loss.item() * labels.size(0)
            seen += labels.size(0)

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / seen:.4f}")

    training_time = time.time() - start_time

    # Testing loop
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.view(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            # Accumulate the summed (not averaged) loss of this batch.
            test_loss += loss.item() * labels.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    # Per-sample average over the whole test set.
    test_loss /= total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {test_loss:.4f}, Training Time: {training_time:.2f} seconds")

    # Store results (lists are unhashable, so the layer config is keyed as a tuple)
    results[(lr, batch_size, tuple(hidden_layers))] = {
        "accuracy": accuracy,
        "loss": test_loss,
        "training_time": training_time
    }

# Print one summary line per evaluated configuration, in insertion order.
print("\nHyperparameter Tuning Results:")
for config, metrics in results.items():
    lr, batch_size, hidden_layers = config
    summary = (
        f"LR={lr}, Batch Size={batch_size}, Hidden Layers={hidden_layers}: "
        f"Accuracy = {metrics['accuracy']:.2f}%, "
        f"Loss = {metrics['loss']:.4f}, "
        f"Training Time = {metrics['training_time']:.2f} seconds"
    )
    print(summary)

# Fixed commentary on the three tuned hyperparameters, printed line by line.
discussion_lines = (
    "\nDiscussion:",
    "1. Learning Rate:",
    "   - A lower learning rate (e.g., 0.001) leads to slower but more stable convergence, while a higher learning rate (e.g., 0.01) may speed up training but risks overshooting the optimal weights.",
    "2. Batch Size:",
    "   - Smaller batch sizes (e.g., 32) may lead to noisier gradient updates but can generalize better. Larger batch sizes (e.g., 64) stabilize training but may require more memory.",
    "3. Hidden Layers:",
    "   - Deeper networks (e.g., [512, 256, 128]) have more capacity to learn complex patterns but are prone to overfitting and require more training time. Simpler configurations (e.g., [256, 128]) may train faster but might underfit.",
)
for line in discussion_lines:
    print(line)


Files already downloaded and verified
Files already downloaded and verified

Training with learning_rate=0.001, batch_size=32, hidden_layers=[256, 128]
Epoch [1/5], Loss: 1.6416
Epoch [2/5], Loss: 1.4461
Epoch [3/5], Loss: 1.3472
Epoch [4/5], Loss: 1.2666
Epoch [5/5], Loss: 1.1965
Test Accuracy: 50.44%, Test Loss: 1.4295, Training Time: 124.60 seconds

Training with learning_rate=0.001, batch_size=32, hidden_layers=[512, 256, 128]
Epoch [1/5], Loss: 1.6662
Epoch [2/5], Loss: 1.4597
Epoch [3/5], Loss: 1.3475
Epoch [4/5], Loss: 1.2607
Epoch [5/5], Loss: 1.1794
Test Accuracy: 51.58%, Test Loss: 1.3844, Training Time: 190.77 seconds

Training with learning_rate=0.001, batch_size=64, hidden_layers=[256, 128]
Epoch [1/5], Loss: 1.6399
Epoch [2/5], Loss: 1.4314
Epoch [3/5], Loss: 1.3238
Epoch [4/5], Loss: 1.2451
Epoch [5/5], Loss: 1.1716
Test Accuracy: 52.47%, Test Loss: 1.3881, Training Time: 99.93 seconds

Training with learning_rate=0.001, batch_size=64, hidden_layers=[512, 256, 128]
Epoch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time
from itertools import product

# Select the compute device: CUDA if it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define the MLP model
class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers with a caller-chosen hidden activation.

    A separate deep copy of ``activation_fn`` follows each hidden layer, so
    the instance passed in is never registered directly. This prevents an
    activation that owns parameters or buffers (e.g. ``nn.PReLU``) from being
    shared between layers, or between the several models this file builds
    from one activation object. Stateless activations are unaffected.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
        output_size: Number of output units (the last layer is a plain
            ``nn.Linear`` with no activation after it).
        activation_fn: ``nn.Module`` instance to copy after each hidden layer.
    """

    def __init__(self, input_size, hidden_sizes, output_size, activation_fn):
        # Imported locally so the class works wherever the cell is pasted.
        import copy

        super().__init__()
        layers = []
        in_features = input_size

        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            # Fresh copy per layer so no activation state is shared.
            layers.append(copy.deepcopy(activation_fn))
            in_features = hidden_size

        # Output projection; no activation is applied after it.
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.model(x)

# CIFAR-100 preprocessing: convert to a tensor, then Normalize each of the
# 3 channels with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Train and test splits (train first, then test), stored under ./data.
train_dataset, test_dataset = (
    datasets.CIFAR100(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Hyperparameter tuning setup: the full grid of these three lists is evaluated.
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
hidden_layer_configs = [[256, 128], [512, 256, 128]]
epochs = 10
activation_fn = nn.ReLU()

# Maps (lr, batch_size, tuple(hidden_layers)) -> metrics dict for that run.
results = {}


# NOTE(review): configurations are ranked on the test split below; consider
# carving a validation split out of the training data for model selection.
for lr, batch_size, hidden_layers in product(learning_rates, batch_sizes, hidden_layer_configs):
    print(f"\nTraining with learning_rate={lr}, batch_size={batch_size}, hidden_layers={hidden_layers}")

    # Re-seed per configuration so each run is reproducible and starts from
    # the same RNG state instead of wherever the previous run left it.
    torch.manual_seed(0)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model (100 output units for this dataset), loss, and optimizer
    model = MLP(32 * 32 * 3, hidden_layers, 100, activation_fn).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop (timed as a whole; evaluation below is not included)
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        seen = 0  # number of training samples processed this epoch
        for images, labels in train_loader:
            images = images.view(images.size(0), -1).to(device)  # Flatten images
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch *mean*; weight it by the batch size
            # so the short final batch is not over-counted. This also keeps
            # the figure comparable across the different batch sizes tried.
            running_loss += loss.item() * labels.size(0)
            seen += labels.size(0)

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / seen:.4f}")

    training_time = time.time() - start_time

    # Testing loop
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.view(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            # Accumulate the summed (not averaged) loss of this batch.
            test_loss += loss.item() * labels.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    # Per-sample average over the whole test set.
    test_loss /= total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {test_loss:.4f}, Training Time: {training_time:.2f} seconds")

    # Store results (lists are unhashable, so the layer config is keyed as a tuple)
    results[(lr, batch_size, tuple(hidden_layers))] = {
        "accuracy": accuracy,
        "loss": test_loss,
        "training_time": training_time
    }

# Print one summary line per evaluated configuration, in insertion order.
print("\nHyperparameter Tuning Results:")
for config, metrics in results.items():
    lr, batch_size, hidden_layers = config
    print(
        f"LR={lr}, Batch Size={batch_size}, Hidden Layers={hidden_layers}: "
        f"Accuracy = {metrics['accuracy']:.2f}%, "
        f"Loss = {metrics['loss']:.4f}, "
        f"Training Time = {metrics['training_time']:.2f} seconds"
    )

# Fixed commentary on MLPs and CIFAR-100, emitted one entry per line.
print(
    "\nDiscussion:",
    "MLPs struggle with complex image datasets like CIFAR-100 for several reasons:",
    "1. Lack of Spatial Hierarchy: Unlike CNNs, MLPs do not exploit the spatial structure of images, leading to inefficient learning of features.",
    "2. High Dimensionality: Flattening images results in a loss of spatial relationships between pixels.",
    "3. Overfitting: MLPs require more parameters to handle high-dimensional data, increasing the risk of overfitting.",
    "4. Computational Inefficiency: MLPs with many parameters are computationally expensive and require significant resources for training.",
    sep="\n",
)


Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:11<00:00, 14.8MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified

Training with learning_rate=0.001, batch_size=32, hidden_layers=[256, 128]
Epoch [1/10], Loss: 3.7803
Epoch [2/10], Loss: 3.4147
Epoch [3/10], Loss: 3.2486
Epoch [4/10], Loss: 3.1222
Epoch [5/10], Loss: 3.0228
Epoch [6/10], Loss: 2.9316
Epoch [7/10], Loss: 2.8463
Epoch [8/10], Loss: 2.7683
Epoch [9/10], Loss: 2.6940
Epoch [10/10], Loss: 2.6203
Test Accuracy: 22.58%, Test Loss: 3.5116, Training Time: 257.83 seconds

Training with learning_rate=0.001, batch_size=32, hidden_layers=[512, 256, 128]
Epoch [1/10], Loss: 3.8164
Epoch [2/10], Loss: 3.4576
Epoch [3/10], Loss: 3.2854
Epoch [4/10], Loss: 3.1602
Epoch [5/10], Loss: 3.0527
Epoch [6/10], Loss: 2.9650
Epoch [7/10], Loss: 2.8706
Epoch [8/10], Loss: 2.7917
Epoch [9/10], Loss: 2.7076
Epoch [10/10], Loss: 2.6269
Test Accuracy: 21.96%, Test Loss: 3.5076, Training Time: 386.55 seconds

Training with learning_rate=0.001, batch_size=64, hidden_layers=[2