In [10]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
# Load the Iris dataset and unpack the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

In [12]:
# Hold out 30% of the samples for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Standardize the features (mean=0, variance=1).
# The scaler is fitted on the training split only and then applied to both
# splits, so no validation statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [14]:
# Define the Softmax function
def softmax(logits):
    """Turn each row of a 2-D array of scores into a probability distribution.

    Parameters
    ----------
    logits : ndarray of shape (n_samples, n_classes)
        Raw class scores; the normalization runs along axis 1.

    Returns
    -------
    ndarray of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    row_max = np.max(logits, axis=1, keepdims=True)
    numerators = np.exp(logits - row_max)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators

In [15]:
# Define the cost function
def cross_entropy_loss(y, probs, eps=1e-15):
    """Mean cross-entropy (negative log-likelihood) of the true classes.

    Parameters
    ----------
    y : array-like of int, length m
        True class index of every sample; used as a column index into `probs`.
    probs : ndarray of shape (m, n_classes)
        Predicted class probabilities, one row per sample.
    eps : float, optional
        Lower bound applied to the selected probabilities before the log.
        A probability that underflowed to exactly 0 would otherwise yield
        an infinite loss (and a divide-by-zero warning).

    Returns
    -------
    float
        Average of -log(p_true) over the m samples.
    """
    labels = np.asarray(y)
    m = labels.shape[0]
    # Probability the model assigned to the correct class of each sample
    true_class_probs = probs[np.arange(m), labels]
    log_likelihood = -np.log(np.maximum(true_class_probs, eps))
    return np.sum(log_likelihood) / m

In [16]:
# Define the gradient of the cost function
def compute_gradient(X, y, probs):
    """Gradient of the mean cross-entropy loss with respect to the weights.

    Parameters
    ----------
    X : ndarray of shape (m, n_features)
        Input features, one row per sample.
    y : array-like of int, length m
        True class index of every sample, each in [0, n_classes).
    probs : ndarray of shape (m, n_classes)
        Predicted class probabilities for the same samples.

    Returns
    -------
    ndarray of shape (n_features, n_classes)
        X.T @ (probs - one_hot(y)) / m
    """
    m = X.shape[0]
    labels = np.asarray(y)
    # The number of classes is given by the width of `probs`. Deriving it from
    # max(y) + 1 breaks whenever the batch happens not to contain the highest
    # class: the one-hot matrix then has fewer columns than `probs`.
    residual = np.array(probs, dtype=float)  # copy, the caller's array is left untouched
    # probs - one_hot(y): subtract 1 at each sample's true-class column
    residual[np.arange(m), labels] -= 1.0
    grad = np.dot(X.T, residual) / m
    return grad

In [17]:
# Implement Batch Gradient Descent with Early Stopping
def softmax_regression(X_train, y_train, X_val, y_val, learning_rate=0.01, max_epochs=1000, patience=5):
    """Fit softmax regression with full-batch gradient descent and early stopping.

    Every epoch performs one gradient step on the whole training set and then
    measures the cross-entropy loss on the validation set. Training stops once
    the validation loss has failed to improve for `patience` consecutive
    epochs, or after `max_epochs` epochs.

    Parameters
    ----------
    X_train : ndarray of shape (n_train, n_features)
        Training features. No intercept column is added here.
    y_train : ndarray of int, length n_train
        Training labels, used as class indices by the loss/gradient helpers.
    X_val, y_val : validation features and labels, same layout as the
        training pair.
    learning_rate : float
        Step size of each gradient update.
    max_epochs : int
        Upper bound on the number of gradient steps.
    patience : int
        Number of consecutive non-improving epochs tolerated before stopping.

    Returns
    -------
    ndarray of shape (n_features, n_classes)
        The weights that achieved the lowest validation loss.
    """
    n_classes = len(np.unique(y_train))
    n_features = X_train.shape[1]
    theta = np.zeros((n_features, n_classes))  # Initialize parameters

    # `theta` is updated in place (`-=`) below, so the best weights must be
    # stored as a copy. A plain `best_theta = theta` would alias the same
    # array and silently follow every later update.
    best_theta = theta.copy()
    best_val_loss = np.inf
    no_improvement = 0

    for epoch in range(max_epochs):
        # Compute probabilities
        logits = np.dot(X_train, theta)
        probs = softmax(logits)

        # Compute gradient and update parameters
        grad = compute_gradient(X_train, y_train, probs)
        theta -= learning_rate * grad

        # Compute validation loss (with the freshly updated parameters)
        val_logits = np.dot(X_val, theta)
        val_probs = softmax(val_logits)
        val_loss = cross_entropy_loss(y_val, val_probs)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_theta = theta.copy()  # snapshot, see note above
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 100 == 0:
            # `probs` was computed before this epoch's update, so the reported
            # training loss is the loss at the start of the epoch.
            train_loss = cross_entropy_loss(y_train, probs)
            print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    return best_theta

In [18]:
# Train the model: step size 0.1, at most 1000 epochs, stop after 5 epochs
# without an improvement of the validation loss
theta = softmax_regression(
    X_train,
    y_train,
    X_val,
    y_val,
    learning_rate=0.1,
    max_epochs=1000,
    patience=5,
)

# Evaluate the model on the validation set
def predict(X, theta):
    """Return the most probable class index for every row of `X`.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    theta : ndarray of shape (n_features, n_classes)
        Weight matrix as returned by `softmax_regression`.

    Returns
    -------
    ndarray of int, length n_samples
        Index of the highest-probability class of each sample.
    """
    class_probs = softmax(np.dot(X, theta))
    return np.argmax(class_probs, axis=1)

# Accuracy = share of validation samples whose predicted class equals the label
y_pred = predict(X_val, theta)
is_correct = y_pred == y_val
accuracy = np.mean(is_correct)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Epoch 0: Train Loss = 1.0986, Val Loss = 0.9975
Epoch 100: Train Loss = 0.3926, Val Loss = 0.3904
Epoch 200: Train Loss = 0.3599, Val Loss = 0.3707
Epoch 300: Train Loss = 0.3451, Val Loss = 0.3606
Epoch 400: Train Loss = 0.3365, Val Loss = 0.3546
Epoch 500: Train Loss = 0.3309, Val Loss = 0.3508
Epoch 600: Train Loss = 0.3270, Val Loss = 0.3482
Epoch 700: Train Loss = 0.3240, Val Loss = 0.3463
Epoch 800: Train Loss = 0.3216, Val Loss = 0.3449
Epoch 900: Train Loss = 0.3197, Val Loss = 0.3438
Validation Accuracy: 86.67%
