- This example will walk through the entire process: loading data, defining the network, setting up the loss and optimizer, training the network, and evaluating its performance.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Device Configuration ---
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# --- 2. Hyperparameters ---
# Architecture
input_size = 28 * 28   # One input per pixel of a flattened 28x28 MNIST image (784)
hidden_size = 500      # Width of the single hidden layer
num_classes = 10       # One output per digit, 0 through 9
# Optimisation
learning_rate = 1e-3   # Step size handed to the optimizer
batch_size = 100       # Samples per mini-batch
num_epochs = 5         # Full passes over the training set

# --- 3. Load and Prepare MNIST Dataset ---
print("Loading MNIST dataset...")
# Preprocessing applied to every image, in order:
#   1. ToTensor()  - PIL image / ndarray (H x W x C, values 0..255) becomes a
#                    FloatTensor (C x H x W, values 0.0..1.0).
#   2. Normalize() - standardises with the MNIST mean (0.1307) and std (0.3081).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Training split; fetched into ./data if it is not already there.
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, transform=transform, download=True
)

# Test split, read from the same root directory.
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, transform=transform
)

# Wrap both splits in DataLoaders, which yield mini-batches of `batch_size`.
# Training batches are reshuffled on every pass; test order is left untouched.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Dataset loaded. Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
print(f"Number of batches in train_loader: {len(train_loader)}")

# --- 4. Define the Neural Network Model (ANN/MLP) ---
# Fully connected neural network with one hidden layer
class SimpleANN(nn.Module):
    """Fully connected classifier with a single ReLU hidden layer.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Submodules are registered in the order data flows through them.
        self.fc1 = nn.Linear(input_size, hidden_size)    # input -> hidden
        self.relu = nn.ReLU()                            # hidden non-linearity
        self.fc2 = nn.Linear(hidden_size, num_classes)   # hidden -> class scores

    def forward(self, x):
        """Return raw class scores (logits) for the batch ``x``.

        No softmax is applied here: the logits are meant to be fed to a loss
        such as ``nn.CrossEntropyLoss``, which applies LogSoftmax itself.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# --- 5. Instantiate the Model, Loss, and Optimizer ---
# Build the network, then place its parameters on the selected device.
model = SimpleANN(input_size, hidden_size, num_classes)
model = model.to(device)
print("\nModel Architecture:")
print(model)

# Loss: cross-entropy on the raw logits, the usual choice for multi-class
# classification (nn.LogSoftmax and nn.NLLLoss fused into a single module).
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over every parameter of the model.
trainable_params = model.parameters()
optimizer = optim.Adam(trainable_params, lr=learning_rate)

# --- 6. Training Loop ---
print("\nStarting Training...")
n_total_steps = len(train_loader)
train_losses = []      # Mean per-sample loss of each epoch
train_accuracies = []  # Training accuracy (%) of each epoch

for epoch in range(num_epochs):
    n_correct = 0
    n_samples = 0
    running_loss = 0.0  # Sum of per-sample losses seen so far in this epoch
    model.train()  # Set model to training mode

    for i, (images, labels) in enumerate(train_loader):
        # Batches arrive as [batch, 1, 28, 28]; keep the batch axis and flatten
        # the rest so each row is one image vector for the first linear layer.
        images = images.reshape(images.size(0), -1).to(device)
        labels = labels.to(device)  # Move labels to the configured device

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()   # Clear gradients from previous step
        loss.backward()         # Compute gradients based on the loss
        optimizer.step()        # Update model parameters (weights and biases)

        # `criterion` returns the batch mean, so scale it back to a batch sum.
        # Dividing by the sample count at the end of the epoch then gives the
        # true per-sample mean even when the last batch is smaller.
        batch_len = labels.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_len

        # Predicted class = index of the largest logit. detach() keeps this
        # bookkeeping out of the autograd graph (replaces the legacy `.data`).
        predicted = outputs.detach().argmax(dim=1)
        n_samples += batch_len
        n_correct += (predicted == labels).sum().item()

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {batch_loss:.4f}')

    # Record loss and accuracy for the epoch
    epoch_loss = running_loss / n_samples
    epoch_acc = 100.0 * n_correct / n_samples
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)
    print(f'--- Epoch {epoch+1} Summary --- Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}% ---')


print("Finished Training.")

# --- 7. Evaluation Loop (Testing) ---
print("\nStarting Evaluation on Test Set...")
# Evaluation mode: layers such as dropout or batch norm switch to their
# inference behaviour.
model.eval()
n_correct_test = 0
n_samples_test = 0
# Gradients are not needed for scoring, so skip tracking them to save memory.
with torch.no_grad():
    for images, labels in test_loader:
        # Flatten each image to a 784-vector and move the batch to the device.
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        outputs = model(images)

        # torch.max over dim 1 yields (values, indices); the index of the
        # largest logit in each row is the predicted class.
        predicted = torch.max(outputs, dim=1).indices
        n_samples_test += labels.size(0)
        n_correct_test += (predicted == labels).sum().item()

accuracy_test = 100.0 * n_correct_test / n_samples_test
print(f'Accuracy of the network on the {len(test_dataset)} test images: {accuracy_test:.2f} %')

# --- 8. Plot Training Loss and Accuracy ---
# One x position per epoch that was actually recorded, so the plot still works
# if the history is shorter than num_epochs (e.g. training was interrupted).
epochs_axis = range(1, len(train_losses) + 1)
fig, ax1 = plt.subplots(figsize=(10, 5))

# Left y-axis: training loss.
color = 'tab:red'
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Training Loss', color=color)
loss_line, = ax1.plot(epochs_axis, train_losses, color=color, marker='o', label='Training Loss')
ax1.tick_params(axis='y', labelcolor=color)
ax1.grid(True, axis='y', linestyle='--', alpha=0.7)

# Right y-axis: training accuracy, drawn on a twin axes sharing the x-axis.
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Training Accuracy (%)', color=color)
acc_line, = ax2.plot(epochs_axis, train_accuracies, color=color, marker='x', linestyle='--', label='Training Accuracy')
ax2.tick_params(axis='y', labelcolor=color)

ax1.set_title("Training Loss and Accuracy per Epoch")
# The two curves live on different axes, so build one combined legend from
# both line handles. It is attached to ax2, which is drawn on top of ax1.
ax2.legend(handles=[loss_line, acc_line], loc='center right')

# Run tight_layout last so it accounts for the title and both y-labels;
# calling it before the title is added can leave the title clipped.
fig.tight_layout()
plt.show()



# PyTorch MNIST ANN Classifier: Code Explanation

This document breaks down a Python script designed to train a simple Artificial Neural Network (ANN) for classifying handwritten digits from the MNIST dataset using PyTorch.

## 1. Device Setup
- **Purpose**: Optimizes computation by utilizing available hardware.
- **Action**: Detects if a CUDA-enabled GPU is present.
  - If yes, sets `device` to `'cuda'`.
  - If no, sets `device` to `'cpu'`.
- **Impact**: The model and every batch of tensors are explicitly moved to this `device` with `.to(device)`, enabling GPU acceleration if available.

## 2. Hyperparameters
- **Purpose**: Configures the model architecture and training process.
- **Key Definitions**:
    - `input_size = 784`: MNIST images (28x28 pixels) are flattened into a 784-element vector.
    - `hidden_size = 500`: Specifies the number of neurons in the network's hidden layer.
    - `num_classes = 10`: Represents the ten possible output digits (0-9).
    - `num_epochs = 5`: The number of full passes through the entire training dataset.
    - `batch_size = 100`: The number of samples processed in one iteration before updating model parameters.
    - `learning_rate = 0.001`: Determines the step size for parameter updates during optimization.

## 3. MNIST Dataset Handling
- **Purpose**: Loads and preprocesses the MNIST digit dataset.
- **Actions**:
    - **Loading**: `torchvision.datasets.MNIST` is used to download (if needed) and load the training and test sets.
    - **Transformations (`transforms.Compose`)**:
        - `transforms.ToTensor()`: Converts PIL images (pixel range [0, 255]) to PyTorch FloatTensors (shape: C x H x W, pixel range [0.0, 1.0]).
        - `transforms.Normalize((0.1307,), (0.3081,))`: Standardizes pixel values using the MNIST dataset's global mean and standard deviation, aiding in model convergence.
    - **`DataLoader`**:
        - Wraps the training and test `Dataset` objects.
        - Provides an iterable for accessing data in mini-batches.
        - `shuffle=True` for `train_loader`: Randomizes the order of training samples each epoch to improve generalization.
        - `shuffle=False` for `test_loader`: Order is irrelevant for evaluation.

## 4. Model Definition (`SimpleANN` Class)
- **Purpose**: Defines the architecture of the neural network.
- **Structure**: Inherits from `torch.nn.Module`.
    - **`__init__(self, input_size, hidden_size, num_classes)` (Constructor)**:
        - Initializes `super(SimpleANN, self).__init__()`.
        - `self.fc1 = nn.Linear(input_size, hidden_size)`: First fully connected layer (input to hidden).
        - `self.relu = nn.ReLU()`: ReLU activation function for non-linearity.
        - `self.fc2 = nn.Linear(hidden_size, num_classes)`: Second fully connected layer (hidden to output).
    - **`forward(self, x)` (Forward Pass)**:
        - Defines the data flow: Input `x` -> `fc1` -> `relu` -> `fc2` -> Output (Logits).
        - The output consists of raw scores (logits) for each class; `nn.CrossEntropyLoss` applies LogSoftmax to them internally, so the model itself adds no final Softmax layer.

## 5. Instantiation of Model, Loss, and Optimizer
- **Model**: `model = SimpleANN(input_size, hidden_size, num_classes).to(device)` creates an instance of the network and moves it to the configured `device`.
- **Loss Function**: `criterion = nn.CrossEntropyLoss()` is selected, ideal for multi-class classification tasks.
- **Optimizer**: `optimizer = optim.Adam(model.parameters(), lr=learning_rate)` is chosen to update model parameters based on computed gradients.

## 6. Training Loop
- **Purpose**: Iteratively trains the model using the training dataset.
- **Process per Epoch**:
    - `model.train()`: Sets the model to training mode (important for layers like Dropout or BatchNorm, though not heavily used in this simple model).
    - **Batch Iteration (`for i, (images, labels) in enumerate(train_loader)`)**:
        - **Data Preparation**:
            - `images = images.reshape(-1, 28*28).to(device)`: Flattens images and moves them to the `device`.
            - `labels = labels.to(device)`: Moves labels to the `device`.
        - **Forward Pass**: `outputs = model(images)` generates predictions.
        - **Loss Calculation**: `loss = criterion(outputs, labels)` computes the error.
        - **Backward Pass & Optimization**:
            - `optimizer.zero_grad()`: Clears accumulated gradients from previous steps.
            - `loss.backward()`: Computes gradients of the loss with respect to model parameters.
            - `optimizer.step()`: Updates model parameters using the Adam optimization algorithm.
        - **Metrics**: Prints the batch loss every 100 steps, and records and prints the average loss and accuracy at the end of each epoch.

## 7. Evaluation Loop (Testing)
- **Purpose**: Assesses the trained model's performance on unseen data.
- **Process**:
    - `model.eval()`: Sets the model to evaluation mode.
    - `with torch.no_grad():`: Disables gradient computation, saving memory and speeding up inference.
    - **Batch Iteration (`for images, labels in test_loader`)**:
        - Data is prepared and moved to the `device`.
        - Model predictions are generated.
        - Accuracy is calculated by comparing predictions to true labels.
    - The overall test accuracy is printed.

## 8. Plotting Results
- **Purpose**: Visualizes the training progress.
- **Action**: Uses `matplotlib.pyplot` to plot the training loss and accuracy values recorded at the end of each epoch. This helps in understanding the learning dynamics and diagnosing potential issues.

## Summary
This script provides a comprehensive, yet fundamental, example of a PyTorch workflow for a standard image classification task. It showcases data loading, model definition, training with backpropagation, and performance evaluation. By running this code, one can expect to see the model's loss decrease and its accuracy on the MNIST dataset improve over the specified number of epochs, typically achieving high classification accuracy.

---
- Let's build a simple Artificial Neural Network (ANN) for a regression task using PyTorch. We'll use the California Housing dataset for this example, which involves predicting median house values based on several features.

- This example will cover loading and preparing the data, defining a suitable ANN architecture for regression, training the model, and evaluating its performance.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- 1. Device Configuration ---
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# --- 2. Hyperparameters ---
# input_size is not set here; it is read from the dataset once it is loaded.
# Architecture
hidden_size1 = 128    # Width of the first hidden layer
hidden_size2 = 64     # Width of the second hidden layer
output_size = 1       # A single continuous prediction (median house value)
# Optimisation
learning_rate = 1e-3  # Step size handed to the optimizer
batch_size = 64       # Samples per mini-batch
num_epochs = 50       # Full passes over the training set

# --- 3. Load and Prepare California Housing Dataset ---
print("Loading California Housing dataset...")
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Optional pandas views of the same arrays, handy for inspection.
X_df = pd.DataFrame(X, columns=housing.feature_names)
y_series = pd.Series(y, name='MedHouseVal')

input_size = X.shape[1]  # One network input per feature column
print(f"Dataset loaded. Number of features: {input_size}")
print(f"Features shape: {X.shape}, Target shape: {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The statistics are fitted on the training rows
# only and then reused, unchanged, for the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Convert to float32 tensors. Targets get a trailing axis so their shape is
# [n_samples, 1] instead of [n_samples].
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_raw, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test_raw, dtype=torch.float32).reshape(-1, 1)

# Pair features with targets, then batch them. Only the training data is
# reshuffled on each pass.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Data prepared. Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

# --- 4. Define the Neural Network Model for Regression ---
class SimpleANNRegression(nn.Module):
    """Fully connected regressor with two ReLU hidden layers.

    Args:
        input_size: Number of features in each input sample.
        hidden_size1: Number of units in the first hidden layer.
        hidden_size2: Number of units in the second hidden layer.
        output_size: Number of values predicted per sample.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # Submodules are registered in the order data flows through them.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        # The output layer is left linear (no activation) so the network can
        # emit unbounded continuous values.
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return the raw linear output of the last layer for the batch ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

# --- 5. Instantiate the Model, Loss, and Optimizer ---
# Build the network, then place its parameters on the selected device.
model = SimpleANNRegression(input_size, hidden_size1, hidden_size2, output_size)
model = model.to(device)
print("\nModel Architecture:")
print(model)

# Loss: mean squared error, the standard objective for regression.
criterion = nn.MSELoss()

# Optimizer: Adam over every parameter of the model.
trainable_params = model.parameters()
optimizer = optim.Adam(trainable_params, lr=learning_rate)

# --- 6. Training Loop ---
print("\nStarting Training...")
n_total_steps = len(train_loader)
train_losses = []  # Mean per-sample MSE of each epoch

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0  # Sum of per-sample squared errors seen so far this epoch
    n_samples = 0
    for i, (features, targets) in enumerate(train_loader):
        features = features.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `criterion` returns the batch mean, so scale it back to a batch sum.
        # Dividing by the sample count at the end of the epoch then gives the
        # true per-sample mean even when the last batch is smaller.
        batch_len = targets.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_len
        n_samples += batch_len

        if (i+1) % 200 == 0:  # Log the current batch loss every 200 steps
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {batch_loss:.4f}')

    epoch_loss = running_loss / n_samples
    train_losses.append(epoch_loss)
    print(f'--- Epoch {epoch+1} Summary --- Training Loss (MSE): {epoch_loss:.4f} ---')

print("Finished Training.")

# --- 7. Evaluation Loop (Testing) ---
print("\nStarting Evaluation on Test Set...")
model.eval()  # Set model to evaluation mode
all_predictions = []  # Per-batch prediction arrays, concatenated below
all_targets = []      # Per-batch target arrays, concatenated below
test_loss = 0.0       # Sum of per-sample squared errors over the test set
n_test_samples = 0
# Gradients are not needed for scoring, so skip tracking them.
with torch.no_grad():
    for features, targets in test_loader:
        features = features.to(device)
        targets = targets.to(device)
        outputs = model(features)
        loss = criterion(outputs, targets)

        # `criterion` returns the batch mean. Weight it by the batch size so a
        # smaller final batch is not over-represented in the average; a plain
        # mean of batch means would not match the MSE computed below.
        batch_len = targets.size(0)
        test_loss += loss.item() * batch_len
        n_test_samples += batch_len

        all_predictions.append(outputs.cpu().numpy())  # Move to CPU before converting to NumPy
        all_targets.append(targets.cpu().numpy())

avg_test_loss = test_loss / n_test_samples

# Concatenate all predictions and targets
all_predictions = np.concatenate(all_predictions, axis=0)
all_targets = np.concatenate(all_targets, axis=0)

mse_test = mean_squared_error(all_targets, all_predictions)  # Matches avg_test_loss up to float rounding
r2_test = r2_score(all_targets, all_predictions)

print(f'Average Test Loss (MSE): {avg_test_loss:.4f}')
print(f'Calculated Test MSE: {mse_test:.4f}')
print(f'Test R-squared (R2): {r2_test:.4f}')


# --- 8. Plot Training Loss ---
# Loss history, one marker per epoch.
epoch_ticks = range(1, num_epochs + 1)
fig_loss, ax_loss = plt.subplots(figsize=(10, 5))
ax_loss.plot(epoch_ticks, train_losses, marker='o', label='Training Loss (MSE)')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Mean Squared Error Loss')
ax_loss.set_title('Training Loss per Epoch')
ax_loss.legend()
ax_loss.grid(True)
plt.show()

# --- 9. Plot Actual vs. Predicted Values (Test Set) ---
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
ax_fit.scatter(all_targets, all_predictions, alpha=0.3, edgecolors='k', label='Predictions')
# Reference diagonal (y = x) spanning the full range of both arrays; points on
# this line are predictions that equal the actual value.
min_val = min(all_targets.min(), all_predictions.min())
max_val = max(all_targets.max(), all_predictions.max())
diagonal = [min_val, max_val]
ax_fit.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')
ax_fit.set_xlabel("Actual Median House Value")
ax_fit.set_ylabel("Predicted Median House Value")
ax_fit.set_title("Actual vs. Predicted Values (Test Set)")
ax_fit.legend()
ax_fit.grid(True)
plt.show()


# PyTorch Regression ANN: Key Changes and Explanations

This document outlines the modifications made to a simple Artificial Neural Network (ANN) structure to adapt it for a regression task, specifically using the California Housing dataset. The core PyTorch training workflow remains similar to classification, but key differences arise in data preparation, model output, loss function, and evaluation metrics.

## 1. Dataset
- **Source**: `fetch_california_housing` from `sklearn.datasets`.
- **Nature**: A regression dataset where the goal is to predict housing prices based on various features.

## 2. Feature Scaling
- **Method**: `StandardScaler` from `sklearn.preprocessing`.
- **Action**: Applied to the input features (`X`).
- **Importance**: Crucial for neural networks, especially those using gradient-based optimization. Scaling features to have zero mean and unit variance helps the network train more effectively and converge faster by preventing features with larger magnitudes from dominating the learning process.

## 3. Target Reshaping
- **Action**: The target variable `y` (housing prices) is reshaped using `.unsqueeze(1)`.
- **Reason**: This converts the 1D target tensor (shape `[n_samples]`) into a 2D tensor of shape `[n_samples, 1]`.
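- **Compatibility**: PyTorch's `nn.MSELoss` (and many other loss functions) compares the model's output and the target element-wise, so the two tensors should have the same shape. For a single regression output per sample, both should be `[batch_size, 1]`. If the target were left as `[batch_size]`, it would broadcast against the `[batch_size, 1]` output into a `[batch_size, batch_size]` difference matrix, and the loss would be silently wrong (PyTorch only emits a warning).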

## 4. Model Architecture (`SimpleANNRegression`)
- **Output Layer (`self.fc3`)**:
    - `output_size = 1`: The final linear layer is configured to have a single output neuron.
    - **Reason**: In regression, we are predicting a single continuous value (e.g., price) for each input sample.
- **Output Activation**:
    - **None**: No activation function (like ReLU, Sigmoid, or Softmax) is applied to the output of `self.fc3`.
    - **Reason**: For regression tasks, the model should be able to output any real number. Applying an activation like ReLU would restrict outputs to be non-negative, and Sigmoid/Tanh would restrict them to a specific range, which is usually not desired for general regression problems. The raw linear output of the last layer is used directly as the prediction.

## 5. Loss Function
- **Type**: `nn.MSELoss()` (Mean Squared Error).
- **Formula**: $L = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2$, where $y_i$ is the true value and $\hat{y}_i$ is the predicted value.
- **Suitability**: Standard and widely used loss function for regression tasks as it penalizes larger errors more heavily.

## 6. Training Loop
- **Core Logic**: Remains fundamentally similar to the classification training loop (forward pass, loss calculation, backward pass, optimizer step).
- **Loss Calculation**: The `nn.MSELoss` is directly applied between the model's continuous outputs and the continuous target values.
- **Metrics**:
    - **Accuracy is not used**: Accuracy is a classification metric and is not meaningful for regression.
    - **Training Loss (MSE)**: The primary metric tracked during training is the Mean Squared Error. The goal is to minimize this value.

## 7. Evaluation Loop
- **Primary Metric**:
    - **Test MSE**: The average Mean Squared Error is calculated on the test set to assess performance on unseen data.
- **Additional Metric**:
    - **R-squared ($R^2$) Score**:
        - Calculated using `sklearn.metrics.r2_score`.
        - **Purpose**: Measures the proportion of the variance in the dependent variable that is predictable from the independent variables. It indicates how well the regression model fits the observed data.
        - **Interpretation**:
            - $R^2 = 1$: Perfect fit (model explains all the variability).
            - $R^2 = 0$: Model performs no better than predicting the mean of the target variable.
            - $R^2 < 0$: Model performs worse than predicting the mean (indicates a very poor fit).

## 8. Plotting
- **Training Progress**: A plot of the training loss (MSE) over epochs is generated to visualize the learning process and convergence.
- **Prediction Quality**: A scatter plot of "Actual vs. Predicted" values on the test set is created.
    - **Ideal Scenario**: For a well-performing regression model, the points on this scatter plot should cluster tightly around the diagonal line (where actual value equals predicted value). This visually indicates the model's predictive accuracy.

## Summary
This example demonstrates the necessary adaptations to transform a basic ANN structure from a classification setup to a regression setup within the PyTorch framework. The main changes revolve around the model's output layer configuration (single neuron, no activation), the choice of loss function (`MSELoss`), and the relevant evaluation metrics (MSE, R-squared instead of accuracy). The fundamental training loop structure, however, remains consistent, highlighting the flexibility of PyTorch for various machine learning tasks.