# week 4
## activation functions
### Part 3: Investigate the vanishing gradient problem with the sigmoid and tanh activation functions

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import make_moons

# Step 1: Generate a simple classification dataset
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
y = y.reshape(-1, 1)  # OneHotEncoder expects a 2-D column of labels

encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)  # One-hot encode labels -> shape (n_samples, 2)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage). The row partition depends only on the sample count and
# random_state, so it is the same split as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Learn mean/std from training rows only
X_test = scaler.transform(X_test)        # Re-use the training statistics
X = scaler.transform(X)                  # Keep the full matrix standardized as well

# Step 2: Define activation functions and their derivatives
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is computed in a numerically stable way: ``exp`` is only ever
    evaluated on non-positive arguments, so very negative inputs no longer
    overflow (the naive formula evaluates ``exp(-x)`` with a huge positive
    argument and emits an overflow warning).

    Args:
        x: Scalar or array-like of real values.

    Returns:
        A NumPy array (0-d for scalar input) with values in ``[0, 1]`` and the
        same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def sigmoid_derivative(x):
    """Return the sigmoid derivative, given the sigmoid *output*.

    ``x`` is expected to be an activation value ``a = sigmoid(z)`` (this is how
    ``MLP.backward`` calls it), not the pre-activation ``z``. For such a value
    the derivative with respect to ``z`` is ``a * (1 - a)``.

    Args:
        x: Scalar or NumPy array of sigmoid outputs.

    Returns:
        ``x * (1 - x)``, element-wise.
    """
    return x * (1 - x)

def relu(x):
    """Return the rectified linear unit ``max(0, x)`` element-wise.

    Args:
        x: Scalar or NumPy array.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    return np.maximum(0, x)

def relu_derivative(x):
    """Return the ReLU derivative: ``1.0`` where ``x > 0``, else ``0.0``.

    ``MLP.backward`` passes the hidden activation ``relu(z)``; that value is
    positive exactly where ``z`` is positive, so the result is the same as
    evaluating the derivative on ``z``.

    Args:
        x: Scalar or array-like. Plain Python numbers are accepted as well
            (they are converted with ``np.asarray`` first, because a Python
            ``bool`` has no ``astype`` method).

    Returns:
        A float NumPy array (0-d for scalar input) of zeros and ones.
    """
    return (np.asarray(x) > 0).astype(float)

def tanh(x):
    """Return the hyperbolic tangent of ``x`` element-wise (wraps ``np.tanh``).

    Args:
        x: Scalar or NumPy array.

    Returns:
        ``np.tanh(x)``, with values in ``[-1, 1]``.
    """
    return np.tanh(x)

def tanh_derivative(x):
    """Return the tanh derivative, given the tanh *output*.

    ``x`` is the activation value ``a = tanh(z)``, not the pre-activation
    ``z``. This follows the same convention as ``sigmoid_derivative`` and
    matches ``MLP.backward``, which calls the derivative with ``self.a1``.
    For such a value the derivative with respect to ``z`` is ``1 - a ** 2``.

    Applying ``np.tanh`` to the argument again would evaluate
    ``1 - tanh(tanh(z)) ** 2``, which is not the gradient of the hidden layer.

    Args:
        x: Scalar or NumPy array of tanh outputs.

    Returns:
        ``1 - x ** 2``, element-wise.
    """
    return 1 - x ** 2

# Step 3: Define the MLP class with different activation functions
class MLP:
    """Single-hidden-layer perceptron trained with mini-batch gradient descent.

    The hidden layer uses a selectable activation (sigmoid, ReLU or tanh); the
    output layer always uses a sigmoid. The weight update in ``backward`` is
    the gradient-descent step for the squared error
    ``0.5 * sum((y - output) ** 2)`` summed over the mini-batch.
    """

    def __init__(self, input_size, hidden_size, output_size, activation='sigmoid', learning_rate=0.1,
                 epochs=1000, batch_size=8):
        """Store the hyper-parameters and randomly initialise the parameters.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
            activation: Hidden activation: ``'sigmoid'``, ``'relu'`` or ``'tanh'``.
            learning_rate: Step size applied to every parameter update.
            epochs: Number of passes over the training data in ``train``.
            batch_size: Number of samples per mini-batch.

        Raises:
            ValueError: If ``activation`` is not one of the supported names.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

        # Choose activation function
        if activation == 'sigmoid':
            self.activation = sigmoid
            self.activation_derivative = sigmoid_derivative
        elif activation == 'relu':
            self.activation = relu
            self.activation_derivative = relu_derivative
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_derivative = tanh_derivative
        else:
            raise ValueError("Invalid activation function. Choose 'sigmoid', 'relu', or 'tanh'.")

        # Initialize weights (standard normal) and biases (zeros)
        self.W1 = np.random.randn(input_size, hidden_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size)
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        """Run a forward pass and cache the intermediate values.

        Stores ``z1``, ``a1``, ``z2`` and ``a2`` on the instance; ``backward``
        reads ``a1`` from there.

        Args:
            X: Input matrix with one sample per row.

        Returns:
            The output-layer activations ``a2``.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.activation(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = sigmoid(self.z2)  # Output layer remains sigmoid for classification
        return self.a2

    def backward(self, X, y, output):
        """Back-propagate the error of one mini-batch and update the parameters.

        Must be called right after ``forward`` on the same ``X``, because it
        relies on the cached hidden activation ``self.a1``.

        Args:
            X: Mini-batch inputs that produced ``output``.
            y: Target values, same shape as ``output``.
            output: Result of ``forward(X)``.

        Returns:
            The mean absolute error of this mini-batch, measured before the
            update.
        """
        error = y - output
        d_output = error * sigmoid_derivative(output)
        # Both deltas are computed before any weight is modified, so the
        # hidden delta uses the same W2 as the forward pass.
        error_hidden = np.dot(d_output, self.W2.T)
        # The derivative helper is evaluated on the hidden activation a1.
        d_hidden = error_hidden * self.activation_derivative(self.a1)

        # Update weights and biases ('+=' because error is y - output)
        self.W2 += np.dot(self.a1.T, d_output) * self.learning_rate
        self.b2 += np.sum(d_output, axis=0, keepdims=True) * self.learning_rate
        self.W1 += np.dot(X.T, d_hidden) * self.learning_rate
        self.b1 += np.sum(d_hidden, axis=0, keepdims=True) * self.learning_rate

        return np.mean(np.abs(error))

    def train(self, X, y):
        """Train for ``self.epochs`` epochs of shuffled mini-batches.

        After each epoch the training error is appended to ``self.losses``.
        The recorded value is the mean absolute error averaged over *all*
        mini-batches of the epoch (weighted by batch size), not just the error
        of the last mini-batch, so the curve reflects the whole training set.

        Args:
            X: Training inputs, one sample per row.
            y: Training targets, one row per sample.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot train on an empty dataset.")

        self.losses = []
        for epoch in range(self.epochs):
            indices = np.random.permutation(n_samples)
            X_shuffled, y_shuffled = X[indices], y[indices]

            weighted_loss = 0.0
            for i in range(0, n_samples, self.batch_size):  # Mini-batch processing
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]
                output = self.forward(X_batch)
                batch_loss = self.backward(X_batch, y_batch, output)
                # The last batch may be smaller, so weight by its size.
                weighted_loss += batch_loss * X_batch.shape[0]

            loss = weighted_loss / n_samples
            self.losses.append(loss)
            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss:.4f}')

    def predict(self, X):
        """Return the index of the largest output unit for every row of ``X``."""
        return np.argmax(self.forward(X), axis=1)

# Experiment with different activation functions
activations = ['sigmoid', 'relu', 'tanh']
accuracies = {}  # activation name -> test accuracy in percent

# The true test labels do not depend on the model, so decode them once.
y_test_labels = np.argmax(y_test, axis=1)

for activation in activations:
    print(f'Experimenting with Activation Function: {activation}')
    mlp = MLP(input_size=2, hidden_size=5, output_size=2, activation=activation, learning_rate=0.1,
              epochs=1000, batch_size=8)
    mlp.train(X_train, y_train)

    # Step 5: Plot convergence of training error (one curve per activation)
    plt.plot(mlp.losses, label=f'Activation: {activation}')

    # Evaluate the trained model on the held-out test set
    predictions = mlp.predict(X_test)
    accuracy = np.mean(predictions == y_test_labels)
    accuracies[activation] = accuracy * 100
    print(f'Test Accuracy with {activation} activation -> {accuracy * 100:.2f}%')

# All curves share a single figure, so label and show it once after the loop.
plt.xlabel('Epochs')
plt.ylabel('Training Error')
plt.title('Convergence of Training Error with Different Activation Functions')
plt.legend()
plt.show()

# Investigate Vanishing Gradient Problem
print("Investigating Vanishing Gradient Problem...")
for activation in ['sigmoid', 'tanh']:
    mlp = MLP(input_size=2, hidden_size=10, output_size=2, activation=activation, learning_rate=0.1,
              epochs=1000, batch_size=8)
    mlp.train(X_train, y_train)

    # Report the gradient, not the weights: np.mean(np.abs(mlp.W1)) is the
    # average weight magnitude and says nothing about how much signal reaches
    # the first layer. Recompute the back-propagated deltas on the training
    # set with the same formulas MLP.backward uses, without updating anything.
    output = mlp.forward(X_train)  # also refreshes mlp.a1 for the full set
    d_output = (y_train - output) * sigmoid_derivative(output)
    d_hidden = np.dot(d_output, mlp.W2.T) * mlp.activation_derivative(mlp.a1)
    grad_W1 = np.dot(X_train.T, d_hidden) / X_train.shape[0]  # mean per-sample gradient

    avg_gradient = np.mean(np.abs(grad_W1))
    print(f'Average Gradient Magnitude for {activation}: {avg_gradient:.6f}')

# Print final accuracy comparison (values in `accuracies` are percentages)
for activation, acc in accuracies.items():
    print(f'Final Test Accuracy with {activation} activation: {acc:.2f}%')