#### Core Optimizers

In [None]:
import torch
import torch.optim as optim

################################################
##################### SGD ######################
################################################

# Toy model reused by every optimizer example in this cell: 10 inputs -> 1 output.
model = torch.nn.Linear(10, 1)

# Stochastic gradient descent with momentum.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,            # step size applied on every update
    momentum=0.9,       # fraction of the previous update carried into the next one
    weight_decay=0.01,  # L2 penalty on the weights, to limit overfitting
)

################################################
##################### Adam #####################
################################################

# Adam: per-parameter adaptive steps built from running gradient statistics.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,            # step size
    betas=(0.9, 0.999),  # decay rates of the first- and second-moment running averages
    eps=1e-8,            # small constant added to the denominator to avoid division by zero
    weight_decay=0.01,   # L2 penalty on the weights, to limit overfitting
)

################################################
################### RMSprop ####################
################################################

# RMSprop: divides each step by a moving average of squared gradients.
optimizer = optim.RMSprop(
    model.parameters(),
    lr=0.01,            # step size
    alpha=0.99,         # smoothing constant of the squared-gradient moving average
    eps=1e-8,           # guards against division by zero
    weight_decay=0.01,  # L2 penalty on the weights
)

################################################
################### Adagrad ####################
################################################

# Adagrad: accumulates squared gradients over the whole run.
optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.01,                        # step size
    weight_decay=0.01,              # L2 penalty on the weights
    initial_accumulator_value=0.1,  # starting value of the squared-gradient accumulator
)

################################################
#################### AdamW #####################
################################################

# AdamW: Adam with the weight decay decoupled from the gradient update.
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,            # step size
    betas=(0.9, 0.999),  # decay rates of the first- and second-moment running averages
    eps=1e-8,            # guards against division by zero
    weight_decay=0.01,   # decay applied in a decoupled manner
)

################################################
################### Adadelta ###################
################################################

# Adadelta: note that each assignment above rebinds `optimizer`, so this is
# the instance that remains bound once the cell has finished running.
optimizer = optim.Adadelta(
    model.parameters(),
    lr=1.0,             # step size
    rho=0.95,           # smoothing constant of the squared-gradient running average
    eps=1e-6,           # guards against division by zero
    weight_decay=0.01,  # L2 penalty on the weights
)


#### Learning Rate Scheduling

How Learning Rate Scheduling Works
Learning rate scheduling in deep learning is a technique used to adjust the learning rate during training. The idea is to gradually change the learning rate to help the model converge more efficiently and avoid overfitting or underfitting.

During training, the learning rate controls how much to adjust the model's parameters (weights) after each update step. A well-chosen learning rate can make training faster and more stable. However, using a constant learning rate throughout training is often suboptimal. Instead, learning rate schedules adjust the learning rate at different points in the training process to improve model performance.

In [None]:
import torch.optim as optim

################################################
################### StepLR #####################
################################################

"""
The learning rate is reduced by a factor (gamma) after a fixed number of epochs (step_size). This is one of the most commonly used schedules, as it gradually decreases the learning rate over time to allow for finer adjustments as training progresses.

Example: Reduce learning rate by a factor of 0.1 every 10 epochs.

When to use: Ideal for longer training runs where the model needs to converge smoothly.
"""

# Example optimizer (Adam in this case)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# StepLR scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 
                                             step_size=10,   # Every 10 epochs, reduce learning rate
                                             gamma=0.1)      # Multiply the learning rate by 0.1 after step_size epochs

################################################
################### ExponentialLR #####################
################################################
"""
The learning rate decays exponentially based on the number of epochs. This type of scheduling reduces the learning rate at an exponentially decreasing rate, providing a smooth decay.

When to use: Typically used in settings where you want gradual decay without abrupt changes.
"""

# ExponentialLR scheduler
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 
                                                   gamma=0.99)  # Decay the learning rate by 1% after every epoch

################################################
############ CosineAnnealingLR  ################
################################################
"""
The learning rate follows a cosine curve, gradually decaying to a minimum learning rate at the end of each cycle. It can also "restart" after completing a cycle.

When to use: Great for training over many epochs (e.g., deep neural networks or large datasets) and allows for smoother decay. The cyclical nature helps avoid getting stuck in local minima.
"""

# CosineAnnealingLR scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 
                                                      T_max=50,    # Number of epochs after which learning rate reaches the minimum
                                                      eta_min=1e-6) # The minimum learning rate value, which the scheduler will approach
################################################
############ ReduceLROnPlateau ################
################################################

"""
The learning rate is reduced when a monitored metric (such as validation loss) stops improving. It adapts the learning rate dynamically based on model performance.

When to use: Best for situations where you want to decrease the learning rate when the model plateaus. For example, if the validation loss stops improving for several epochs, the learning rate will decrease, often helping the model escape local minima.
"""

# ReduceLROnPlateau scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                      mode='min',   # 'min' means we reduce learning rate when validation loss is no longer decreasing
                                                      factor=0.1,   # Reduce the learning rate by a factor of 0.1
                                                      patience=5,   # Number of epochs to wait before reducing the learning rate
                                                      verbose=True) # Print a message when learning rate is reduced

################################################
################# CyclicLR  ####################
################################################
"""
The learning rate is cycled between a minimum and maximum value over multiple iterations, with the idea being to avoid getting stuck in local minima and speed up convergence.

When to use: Best for training models where you want faster convergence and escape from local minima. Useful when you have long training cycles.
"""

# CyclicLR scheduler
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              base_lr=1e-6,    # Minimum learning rate
                                              max_lr=0.1,      # Maximum learning rate
                                              step_size_up=2000, # Number of iterations to increase the learning rate
                                              mode='triangular')  # Triangular mode of learning rate increase/decrease


#### Learning Rate Warm-up

How Learning Rate Warm-up Works
Learning rate warm-up is a technique used to gradually increase the learning rate from a small value to the target value over a set number of iterations or epochs at the beginning of training. This helps avoid large updates in the early stages, which can cause instability or poor convergence.

Instead of starting with a high learning rate, warm-up starts small, allowing the model to make controlled, stable updates. Once the warm-up phase is complete, the learning rate reaches its target, and the model continues training with the chosen learning rate or a scheduled decay. This method ensures smoother and more stable training, especially for deep networks and large models.

In [None]:
import torch
import torch.optim as optim

# Learning rate warm-up parameters. They are defined before the optimizer so
# the optimizer and the warm-up schedule share one target learning rate.
warm_up_epochs = 5  # Number of epochs for the warm-up phase
base_lr = 1e-6      # Origin of the linear ramp (start small to avoid instability)
target_lr = 0.001   # Final learning rate after warm-up (typically your target learning rate)


def linear_warmup_lr(epoch, warm_up_epochs, base_lr, target_lr):
    """Return the linearly warmed-up learning rate for a zero-based epoch.

    The ramp is evaluated at ``(epoch + 1) / warm_up_epochs``, so the first
    epoch already sits one step above ``base_lr`` and the last warm-up epoch
    (``epoch == warm_up_epochs - 1``) lands on ``target_lr``.

    Args:
        epoch: Zero-based index of the current warm-up epoch.
        warm_up_epochs: Total number of warm-up epochs; must be at least 1.
        base_lr: Learning rate at the origin of the ramp.
        target_lr: Learning rate reached at the end of warm-up.

    Returns:
        The learning rate to use during ``epoch``.

    Raises:
        ValueError: If ``warm_up_epochs`` is smaller than 1.
    """
    if warm_up_epochs < 1:
        raise ValueError("warm_up_epochs must be at least 1")
    return base_lr + (target_lr - base_lr) * (epoch + 1) / warm_up_epochs


# Define a simple model (e.g., a linear layer)
model = torch.nn.Linear(10, 1)

# Optimizer setup (using Adam); it is created with the post-warm-up learning
# rate, which the loop below overrides while warm-up is in progress.
optimizer = optim.Adam(model.parameters(), lr=target_lr)

# Warm-up loop: gradually increase the learning rate towards target_lr
for epoch in range(warm_up_epochs):
    # Current learning rate for this epoch (linear interpolation between
    # base_lr and target_lr)
    lr = linear_warmup_lr(epoch, warm_up_epochs, base_lr, target_lr)

    # Push the new value into every parameter group of the optimizer
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # A real run would train for one epoch here (forward pass, backward pass,
    # optimizer.step()); this example only demonstrates the schedule itself.

    print(f'Epoch {epoch + 1}/{warm_up_epochs} | Learning Rate: {lr}')  # Print the current learning rate for each epoch

# After warm-up, continue training with the main learning rate
# You can use a scheduler or leave the learning rate constant if needed.


#### Weight Decay (L2 Regularization)

Why Weight Decay (L2 Regularization) is Needed
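Weight decay, closely related to (and often called) L2 regularization, is a technique used to reduce overfitting in machine learning models, especially deep learning models. It works by adding a penalty to the loss function based on the squared magnitude of the weights. This penalty discourages the model from learning overly large weights, which can cause the model to overfit to the training data. The two terms are exactly equivalent for plain SGD; for adaptive optimizers such as Adam they differ, which is why AdamW applies the decay directly to the weights, decoupled from the gradient-based update.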

How It Works:
During training, the optimizer updates the model's weights to minimize the loss function. Weight decay adds an extra term to the loss function that penalizes large weights, effectively shrinking them over time. This makes the model focus on simpler solutions rather than overfitting to the noise in the training data.

Impact of Weight Decay:
Prevents Overfitting: By adding a penalty for large weights, weight decay helps the model generalize better to unseen data.

Simpler Models: Encourages the model to find solutions with smaller weights, leading to more robust and generalizable models.

Stabilizes Training: Helps the optimizer focus on simpler solutions, preventing extreme weight values that could cause instability during training.

#### Gradient Clipping

Why Gradient Clipping is Needed:
Gradient clipping is a technique used to prevent the exploding gradient problem in deep learning models. This problem occurs when gradients become very large during backpropagation, leading to unstable training, where weights change dramatically in a single update, causing the model to diverge.

Gradient clipping limits the gradients' values, ensuring they do not exceed a certain threshold. This helps stabilize the training process, especially in deep networks or RNNs where the gradients can become very large.

How Gradient Clipping Works:
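During backpropagation, the gradients of the loss function with respect to the model's parameters are computed. Once they are available, and before the optimizer step, gradient clipping checks them against a predefined threshold. With norm-based clipping (used in the example below), if the combined norm of all gradients exceeds the threshold, every gradient is scaled down by the same factor so that the norm no longer exceeds it; with value-based clipping, each gradient element is clamped to the threshold individually. Either way the size of the update stays bounded, which maintains stability in the training process.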

Impact of Gradient Clipping:
Prevents Exploding Gradients: By clipping large gradients, the model avoids drastic weight updates and keeps the training process stable.

Improves Training Stability: It ensures that large gradients do not lead to unstable updates, which is especially useful in deep networks and RNNs.

Allows Use of Larger Learning Rates: With gradient clipping, you can use larger learning rates without risking instability, improving training efficiency.

In [None]:
import torch
import torch.optim as optim

# Define a simple model (e.g., a linear layer)
model = torch.nn.Linear(10, 1)

# Optimizer setup (using Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Synthetic regression batch so the example runs end to end; replace these
# with real inputs and targets. The feature size (10) matches the model input.
data = torch.randn(64, 10)
targets = torch.randn(64, 1)


def some_loss_function(model, data, targets):
    """Return the mean-squared error between ``model(data)`` and ``targets``.

    Stand-in loss for this example; swap in whatever loss the task requires.
    """
    return torch.nn.functional.mse_loss(model(data), targets)


# Training loop (example)
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Zero out gradients from previous step
    loss = some_loss_function(model, data, targets)  # Compute loss

    loss.backward()  # Backpropagate gradients

    # Gradient clipping to avoid exploding gradients. This must run after
    # backward() (the gradients have to exist) and before step() (so the
    # optimizer sees the clipped gradients).
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Rescale gradients if their norm exceeds 1.0

    optimizer.step()  # Update model parameters based on gradients

    print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {loss.item()}')

# Explanation of the code:
# clip_grad_norm_: Computes the norm of all gradients taken together and, if it
#   exceeds max_norm, rescales every gradient in place by the same factor.
#   Gradients whose combined norm is already within the limit are left as is.
# max_norm: The threshold for the combined gradient norm. Gradients are scaled
#   down so that their norm is (approximately) this value if they exceed it.
# optimizer.step(): After clipping, the optimizer uses the modified gradients to update the model's weights.

