# Exercise 1: Custom Activation Functions and Their Impact

In this exercise, you'll implement custom activation functions and compare their performance with standard activations. You'll use the Wine Quality dataset, which is small but provides an interesting regression problem.

## Setup


In [None]:

import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

# Load Wine Quality dataset (red wine). The CSV is semicolon-separated, so
# `sep=';'` is required; this line needs network access to the UCI archive.
wine_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
# Features are every column except the target; the target is the 'quality' column.
X = wine_data.drop('quality', axis=1).values
y = wine_data['quality'].values

# Split and scale the data.
# First hold out 20% of all rows as the test set, then hold out 20% of the
# remaining rows as the validation set. A fixed random_state keeps both
# splits reproducible.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# validation and test splits, so no information from held-out data leaks in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Utility function for plotting
def plot_activation_functions(activation_functions, x_range=(-5, 5)):
    """Draw every supplied activation function on one shared figure.

    Args:
        activation_functions: Mapping from legend label to a callable. Each
            callable is invoked once with a NumPy array of sample points and
            its result is plotted against those points.
        x_range: Indexable pair ``(low, high)`` bounding the 200 evenly
            spaced sample points.
    """
    samples = np.linspace(x_range[0], x_range[1], 200)

    plt.figure(figsize=(12, 4))
    for label in activation_functions:
        activation = activation_functions[label]
        plt.plot(samples, activation(samples), label=label)

    # Decorate the figure once all curves are drawn.
    plt.grid(True)
    plt.legend()
    plt.title("Activation Functions")
    plt.xlabel("x")
    plt.ylabel("f(x)")
    plt.show()



## Part 1: Implementing Custom Activation Functions

Implement the following custom activation functions:

1. Mish: f(x) = x * tanh(softplus(x))
2. Swish: f(x) = x * sigmoid(x)
3. A custom variant of your choice (be creative!)


In [None]:

def custom_mish(x):
    """Mish activation: f(x) = x * tanh(softplus(x)).

    Exercise placeholder: it currently returns ``None``. Until it is
    implemented, the test loop below fails when it calls ``.numpy()`` on
    the result.
    """
    # TODO: Implement the Mish activation function
    # Hint: Use tf.math.softplus and tf.math.tanh
    return None

def custom_swish(x):
    """Swish activation: f(x) = x * sigmoid(x).

    Exercise placeholder: it currently returns ``None``. Until it is
    implemented, the test loop below fails when it calls ``.numpy()`` on
    the result.
    """
    # TODO: Implement the Swish activation function
    # Hint: Use tf.math.sigmoid
    return None

def custom_variant(x):
    """Student-designed activation function.

    Exercise placeholder: it currently returns ``None``. The callers in this
    notebook invoke ``.numpy()`` on the result and plot it against the input,
    so the implementation should return a tensor.
    """
    # TODO: Implement your own activation function variant
    # Be creative! Consider combining existing functions or creating something new
    return None

# Test your implementations
# Maps the legend/print label to the callable under test.
activation_functions = {
    "Mish": custom_mish,
    "Swish": custom_swish,
    "Custom Variant": custom_variant
}

# Create test input
test_input = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
print("Test outputs:")
for name, fn in activation_functions.items():
    # `.numpy()` requires each function to return a tensor; the unimplemented
    # placeholders return None, so this line raises until they are filled in.
    print(f"{name}: {fn(test_input).numpy()}")

# Plot the activation functions
plot_activation_functions(activation_functions)




## Part 2: Creating Models with Custom Activations

Create a function that builds a model using a given activation function:



In [None]:

def create_model(activation_fn, input_shape=[11]):
    """Build a regression network that uses ``activation_fn`` in its hidden layers.

    Exercise placeholder: it currently returns ``None``.

    Args:
        activation_fn: Callable applied as the hidden-layer activation.
        input_shape: Shape of one input sample. The default ``[11]`` is
            presumably the number of feature columns in ``X`` -- confirm
            against ``X_train_scaled.shape[1]``.
    """
    # TODO: Implement a model with:
    # - 3 Dense layers (64, 32, 1 neurons)
    # - Custom activation for hidden layers
    # - No activation for output layer
    # - He initialization for weights
    return None

# Create models with different activations
# ReLU is included alongside the three custom functions as the reference point.
activations_to_test = {
    "ReLU": tf.nn.relu,
    "Mish": custom_mish,
    "Swish": custom_swish,
    "Custom": custom_variant
}

# One freshly built model per activation, keyed by the same label.
models = {name: create_model(fn) for name, fn in activations_to_test.items()}



## Part 3: Training and Comparison

Train each model and compare their performance:


In [None]:

def train_and_evaluate(model, name):
    """Train ``model`` and report its timing, history and test score.

    Exercise placeholder: it currently returns ``None``.

    Args:
        model: Model to compile, train and evaluate.
        name: Label of the activation function the model was built with.
    """
    # TODO: Implement training and evaluation
    # - Compile model with appropriate loss and metrics
    # - Train for 20 epochs
    # - Record training time and history
    # - Evaluate on test set
    # Return training time, history, and test score
    return None

# Dictionary to store results, keyed by activation label.
results = {}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining model with {name} activation:")
    # Stores whatever train_and_evaluate returns (None until it is implemented).
    results[name] = train_and_evaluate(model, name)

# TODO: Create a comparison DataFrame with:
# - Training time
# - Final training loss
# - Final validation loss
# - Test score




## Part 4: Visualization and Analysis

Create visualizations to compare the performance of different activation functions:


In [None]:


def plot_training_curves(results):
    """Plot loss curves for every trained model.

    Exercise placeholder: the body is currently a no-op.

    Args:
        results: The ``results`` dictionary filled by the training loop, one
            entry per activation label.
    """
    # TODO: Create plots comparing:
    # - Training loss over time
    # - Validation loss over time
    # - Training vs validation loss for each activation
    pass

# Create visualizations (does nothing until plot_training_curves is implemented)
plot_training_curves(results)

# TODO: Calculate and display additional metrics like:
# - Training speed (examples/second)
# - Number of parameters
# - Memory usage




## Part 5: Analysis Questions

1. Which activation function performed best in terms of:
   - Final test error (this is a regression task, so compare the loss/error metric rather than accuracy)?
   - Training speed?
   - Convergence stability?

2. Why do you think your custom activation function performed the way it did?

3. What are the tradeoffs between the different activation functions you tested?


# Exercise 2: Momentum and Learning Rate Interaction Study

In this exercise, we'll explore how momentum and learning rate interact during training. We'll create a systematic study of different combinations and visualize their effects on model training.

## Setup and Imports



In [81]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.datasets import fashion_mnist
import pandas as pd

# Load and preprocess Fashion MNIST dataset
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
# Convert to float32 and divide by 255.0 to rescale the pixel values.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Take a subset of data for faster experimentation.
# Only the training arrays are truncated; the test arrays keep their full size.
n_samples = 10000
X_train = X_train[:n_samples]
y_train = y_train[:n_samples]



## Part 1: Training Infrastructure



In [82]:
def create_model(seed=42):
    """Build the small dense classifier used throughout the parameter study.

    TensorFlow's global random seed is set first, so every call with the same
    ``seed`` starts from the same seed state.

    Args:
        seed: Value passed to ``tf.random.set_seed``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model: Flatten over 28x28
        inputs, Dense(128, relu), Dense(64, relu), Dense(10, softmax).
    """
    tf.random.set_seed(seed)

    layers = tf.keras.layers
    # Layers are all constructed first and only then handed to Sequential.
    stack = [layers.Flatten(input_shape=[28, 28])]
    for units in (128, 64):
        stack.append(layers.Dense(units, activation='relu'))
    stack.append(layers.Dense(10, activation='softmax'))
    return tf.keras.Sequential(stack)


class TrainingMonitor(tf.keras.callbacks.Callback):
    """Record the loss reported after every training batch and its relative change.

    Attributes:
        batch_losses: The ``logs['loss']`` value seen at the end of each batch.
        loss_changes: ``abs((current - previous) / previous)`` for each batch,
            used as a proxy for gradient behaviour. It always has the same
            length as ``batch_losses``; the first entry is ``0.0``.
    """

    def __init__(self):
        super().__init__()
        self.batch_losses = []
        self.loss_changes = []  # Track relative loss changes instead of gradients

    def on_train_batch_end(self, batch, logs=None):
        """Append the current loss and its relative change from the previous batch.

        Args:
            batch: Index of the batch within the current epoch (unused).
            logs: Metric dictionary supplied by Keras. When it is missing or
                has no ``'loss'`` entry, the batch is skipped so both lists
                stay aligned.
        """
        # NOTE(review): Keras may report a running average over the epoch here
        # rather than the raw per-batch loss -- confirm for the installed version.
        if not logs or 'loss' not in logs:
            return

        current_loss = logs['loss']
        previous_loss = self.batch_losses[-1] if self.batch_losses else None
        self.batch_losses.append(current_loss)

        # Relative change is undefined for the first batch and when the
        # previous loss is exactly zero. Record 0.0 in both cases instead of
        # dividing by zero.
        if previous_loss is None or previous_loss == 0:
            self.loss_changes.append(0.0)
        else:
            self.loss_changes.append(abs((current_loss - previous_loss) / previous_loss))




## Part 2: Training Function



In [None]:
def train_model_with_params(learning_rate, momentum, use_nesterov=False):
    """Trains model with specific learning rate and momentum settings.

    Incomplete until Student Task 3 is done: ``optimizer`` is not defined in
    this function, so ``model.compile`` raises ``NameError`` as written.

    Args:
        learning_rate: Learning rate intended for the SGD optimizer.
        momentum: Momentum intended for the SGD optimizer.
        use_nesterov: Whether the optimizer should use Nesterov momentum.

    Returns:
        A dict with the Keras history (``'history'``), the per-batch values
        recorded by ``TrainingMonitor`` (``'batch_losses'``,
        ``'loss_changes'``), and the last-epoch training loss and accuracy
        (``'final_loss'``, ``'final_accuracy'``).
    """
    model = create_model()
    
    # STUDENT TASK 3: Create SGD optimizer with given parameters
    # (bind it to the name `optimizer`, which model.compile below expects)
    
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    monitor = TrainingMonitor()
    
    # Train for a small number of epochs.
    # validation_split=0.2 makes Keras hold out part of X_train for validation.
    history = model.fit(
        X_train, y_train,
        epochs=5,
        batch_size=32,
        validation_split=0.2,
        callbacks=[monitor],
        verbose=0
    )
    
    return {
        'history': history.history,
        'batch_losses': monitor.batch_losses,
        'loss_changes': monitor.loss_changes,
        # Both "final" values are training metrics from the last epoch,
        # not validation metrics.
        'final_loss': history.history['loss'][-1],
        'final_accuracy': history.history['accuracy'][-1]
    }

def run_parameter_study(learning_rates, momentums):
    """Runs training with different combinations of learning rates and momentums.

    Exercise placeholder: it only creates an empty ``results`` list and
    implicitly returns ``None``.

    Args:
        learning_rates: Learning-rate values to test.
        momentums: Momentum values to test.
    """
    results = []
    # STUDENT TASK 4: Create nested loops to test all combinations
    # Include both standard momentum and Nesterov momentum
    



## Part 3: Visualization Functions



In [None]:
def create_heatmaps(results_df):
    """Creates heatmaps for accuracy and stability across parameter combinations.

    Exercise placeholder: it only creates the 2x2 figure; the four panels
    listed below are still to be drawn.

    Args:
        results_df: Table of study results. Its columns are not fixed by this
            code -- presumably one row per (learning rate, momentum, Nesterov)
            combination; confirm against run_parameter_study.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 15))

    # STUDENT TASK 5: Create four heatmaps
    
    # Standard momentum accuracy
    
    # Nesterov momentum accuracy
    
    # Standard momentum stability (using loss changes)
    
    # Nesterov momentum stability
    



## Part 4: Running the Experiment



In [None]:
# Define parameter ranges: 4 learning rates x 4 momentum values.
learning_rates = [0.001, 0.01, 0.1, 0.5]
momentums = [0.0, 0.5, 0.9, 0.99]

# Run the study (to be filled in: call run_parameter_study)

# Create visualizations (to be filled in: call create_heatmaps)

# Print best configurations


# Exercise 3: Adaptive Learning Rate Emergency

In this exercise, you'll implement a custom callback that monitors training stability and automatically adjusts the learning rate when problems are detected. This represents a real-world scenario where you need to rescue training that's becoming unstable.

## Setup and Imports



In [94]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import fashion_mnist
import logging

# Configure logging: INFO level on the root logger, plus a module-level logger
# that the emergency callback below uses for its messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load and preprocess Fashion MNIST
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
# Convert to float32 and divide by 255.0 to rescale the pixel values.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0



## Part 1: Training Monitor Callback



In [None]:
class TrainingEmergencyCallback(tf.keras.callbacks.Callback):
    """Watch the training loss batch by batch and trigger a learning-rate rescue.

    After every batch the callback records the loss and learning rate, checks
    for loss spikes, NaN/Inf values and a steadily increasing loss, and calls
    ``adjust_learning_rate`` once problems have been seen on ``patience``
    consecutive batches.

    Args:
        patience: Number of consecutive problematic batches tolerated before
            ``adjust_learning_rate`` is invoked.
        loss_spike_threshold: A loss larger than this multiple of the recent
            baseline counts as a spike.
        min_lr: Lower bound that the learning-rate adjustment must respect.

    Attributes:
        loss_history: Loss recorded at the end of every batch.
        lr_history: Learning rate recorded at the end of every batch.
    """

    def __init__(self,
                 patience=5,
                 loss_spike_threshold=1.5,
                 min_lr=1e-6):
        super().__init__()
        self.patience = patience
        self.loss_spike_threshold = loss_spike_threshold
        self.min_lr = min_lr

        self.loss_history = []
        self.lr_history = []

    def on_train_begin(self, logs=None):
        # STUDENT TASK 1: Initialize monitoring variables
        # Keep track of consecutive problems and best loss
        # (on_batch_end reads and updates self.consecutive_problems and
        # self.best_loss, so both must exist before the first batch ends)
        pass

    def check_training_problems(self, logs):
        """Record the current loss and learning rate and list detected problems.

        Args:
            logs: Metric dictionary for the batch; must contain ``'loss'``.

        Returns:
            A list of problem descriptions, empty when nothing was detected.
        """
        current_loss = logs['loss']
        current_lr = tf.keras.backend.get_value(self.model.optimizer.learning_rate)

        # Snapshot the earlier losses BEFORE recording the current one, so the
        # spike baseline is built from previous batches only. Including the
        # current loss would compare it against itself.
        previous_losses = self.loss_history[-5:]

        # Store history
        self.loss_history.append(current_loss)
        self.lr_history.append(current_lr)

        problems = []

        # Check for loss spikes: baseline is the mean of the last five
        # previous losses, or the most recent one while fewer are available.
        if previous_losses:
            if len(previous_losses) >= 5:
                baseline_loss = np.mean(previous_losses)
            else:
                baseline_loss = previous_losses[-1]
            if current_loss > baseline_loss * self.loss_spike_threshold:
                problems.append("Loss spike detected")

        # Check for NaN/Inf
        if np.isnan(current_loss) or np.isinf(current_loss):
            problems.append("NaN/Inf values detected")

        # Check for consistently increasing loss: each of the last five
        # recorded losses (current one included) is larger than its predecessor.
        if len(self.loss_history) >= 5:
            recent = self.loss_history[-5:]
            if all(earlier < later for earlier, later in zip(recent, recent[1:])):
                problems.append("Consistently increasing loss")

        return problems

    def adjust_learning_rate(self, problems):
        """Adjusts learning rate based on detected problems.

        Exercise placeholder: it currently changes nothing and returns ``False``.

        Args:
            problems: List of problem descriptions from ``check_training_problems``.
        """
        current_lr = tf.keras.backend.get_value(self.model.optimizer.learning_rate)

        if problems:
            # STUDENT TASK 4: Implement learning rate adjustment
            # Reduce learning rate if there are problems
            # Make sure new lr isn't below min_lr
            pass
        return False

    def on_batch_end(self, batch, logs=None):
        """Run the problem checks for this batch and apply the patience rule.

        Args:
            batch: Index of the batch within the current epoch (unused).
            logs: Metric dictionary supplied by Keras. The batch is ignored
                when it is missing or has no ``'loss'`` entry.
        """
        if not logs or 'loss' not in logs:
            return

        problems = self.check_training_problems(logs)

        if problems:
            self.consecutive_problems += 1
            logger.info("Training problems detected: %s", problems)

            if self.consecutive_problems >= self.patience:
                logger.warning("Exceeded patience - adjusting learning rate")
                self.adjust_learning_rate(problems)
                # Start a fresh patience window after an adjustment.
                self.consecutive_problems = 0
        else:
            self.consecutive_problems = 0
            current_loss = logs['loss']
            if current_loss < self.best_loss:
                self.best_loss = current_loss





## Part 2: Training Function



In [None]:
def create_model():
    """Build the deep, wide classifier used to provoke unstable training.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model: Flatten over 28x28
        inputs, three Dense(512, relu) layers, then Dense(10, softmax).
    """
    layers = tf.keras.layers
    # Layers are all constructed first and only then handed to Sequential.
    stack = [layers.Flatten(input_shape=[28, 28])]
    for _ in range(3):
        stack.append(layers.Dense(512, activation='relu'))
    stack.append(layers.Dense(10, activation='softmax'))
    return tf.keras.Sequential(stack)

def train_with_emergency_monitoring(initial_lr=0.1, epochs=10, patience=3):
    """Trains model with emergency monitoring.

    Incomplete until Student Task 5 is done: ``history`` and ``emergency_cb``
    are not defined in this function, so the return statement raises
    ``NameError`` as written.

    Args:
        initial_lr: Starting learning rate intended for the optimizer.
        epochs: Number of epochs intended for training.
        patience: Patience intended for the emergency callback.

    Returns:
        The pair ``(history, emergency_cb)``.
    """
    model = create_model()
    
    # STUDENT TASK 5: Create optimizer and compile model
    
    # Create callback (bind it to the name `emergency_cb`)
    
    # Train model (bind the result to the name `history`)
    
    
    return history, emergency_cb



## Part 3: Visualization



In [None]:
def plot_training_metrics(callback):
    """Show the recorded loss and learning-rate series in two stacked panels.

    Args:
        callback: Object exposing ``loss_history`` and ``lr_history``
            sequences, one value per recorded batch.
    """
    figure, panels = plt.subplots(2, 1, figsize=(12, 12))
    loss_panel, lr_panel = panels

    # Upper panel: training loss.
    loss_panel.plot(callback.loss_history)
    loss_panel.set_title('Training Loss')
    loss_panel.set_ylabel('Loss')
    loss_panel.grid(True)

    # Lower panel: learning rate, on a log scale.
    lr_panel.plot(callback.lr_history)
    lr_panel.set_title('Learning Rate')
    lr_panel.set_ylabel('Learning Rate')
    lr_panel.set_yscale('log')
    lr_panel.grid(True)

    plt.tight_layout()
    plt.show()

# Rebind the module-level `logger` to the root logger and raise its level to
# WARNING. The callback looks `logger` up at call time, so its INFO messages
# are suppressed and only the warnings are shown.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Run training (requires train_with_emergency_monitoring to be completed first)
history, callback = train_with_emergency_monitoring(patience=5)
plot_training_metrics(callback)