In [1]:
# https://archive.ics.uci.edu/dataset/320/student+performance
# student performance dataset from UCI Machine Learning Repository

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Sigmoid Activation function and its derivative

def logistic(x):
    return 1 / (1 + np.exp(-x))

def logistic_derivative(x):
    return np.multiply(x, (1 - x))


In [4]:
# Single layer forward propagation

def single_layer_forward_propagation(W, B, A_prev):
    Z = np.dot(W, A_prev) + B
    return logistic(Z)


In [5]:
# Set parameters in the parameter dictionary

def set_parameters(parameters, l, W, B):
    parameters["W" + str(l + 1)] = W
    parameters["b" + str(l + 1)] = B
    return parameters

In [6]:
# Get parameters from the parameter dictionary

def get_parameters(parameters, l):
    return parameters["W" + str(l + 1)], parameters["b" + str(l + 1)]


In [7]:
# Forward propagation through the entire neural network
  
def vectorized_forward_propagation(X, parameters):
    activation_cache = {}
    L = len(parameters) // 2  # Number of layers in the network and each layer has weights, so divide by 2
    A = np.transpose(X)
    activation_cache["A_input0"] = A
    for l in range(L):
        W, B = get_parameters(parameters, l)
        A = single_layer_forward_propagation(W, B, A)
        activation_cache["A_input" + str(l + 1)] = A
    return activation_cache

In [8]:
# Backward propagation through the entire network to update weights 
"""
Perform backward propagation through the entire network. 
Args:
X : Input for the training data.
y : True labels for the training data.
activation_cache: A dictionary storing outputs for each layer.
parameters: Dictionary containing weights and biases for each layer.
alpha: The learning rate used to update the parameters.
W : Sample weights.
Returns:
The updated weight after one iteration.
"""
def vectorized_backward_propagation(X, y, activation_cache, parameters, alpha, W):
    m = X.shape[0]
    L = len(parameters) // 2
    A_last = activation_cache["A_input" + str(L)]
    error = A_last - y
    for l in reversed(range(L)):
        A_prev = activation_cache["A_input" + str(l)]
        W_layer, B_layer = get_parameters(parameters, l)
        delta = error * logistic_derivative(activation_cache["A_input" + str(l + 1)])
        dW_layer = np.dot(delta, A_prev.T) / m
        dB_layer = np.sum(delta, axis=1, keepdims=True) / m
        W_layer -= alpha * dW_layer
        B_layer -= alpha * dB_layer
        parameters = set_parameters(parameters, l, W_layer, B_layer)
        if l > 0:
            error = np.dot(W_layer.T, delta)
    return parameters

In [9]:
# Initialization for weights using Xavier/He

def get_a(n_prev, n_next):
    return np.sqrt(2 / (n_prev + n_next))


In [10]:
# Initialize weights for all layers in neural network

def initialize_weights(layer_dimensions):
    parameters = {}
    for l in range(1, len(layer_dimensions)):
        a = get_a(layer_dimensions[l-1], layer_dimensions[l])
        parameters["W" + str(l)] = np.random.uniform(-a, a, size=(layer_dimensions[l], layer_dimensions[l-1]))
        parameters["b" + str(l)] = np.zeros((layer_dimensions[l], 1))
    return parameters

In [11]:
# Train a weak learner (neural network) using the given weights.

def train_weak_learner(X, y, W, alpha, epochs, layer_dimensions):

    # Initialize the weights according to the layer dimensions
    parameters = initialize_weights(layer_dimensions)
    for epoch in range(epochs):
        activation_cache = vectorized_forward_propagation(X, parameters)
        parameters = vectorized_backward_propagation(X, y.T, activation_cache, parameters, alpha, W)
    return parameters


In [12]:
# Forward pass through the trained neural network where, X: Input data points to make prediction 
# parameters: Trained parameters (weights) of the neural network.

def forward(X, parameters):
    activation_cache = vectorized_forward_propagation(X, parameters)
    A_last = activation_cache["A_input" + str(len(parameters) // 2)]
    return A_last

In [13]:
def AdaBoostM2(X, y, T, alpha, epochs, layer_dimensions):
    # AdaBoost.M2 algorithm for binary classification.
    
    N = len(X)
    W = np.ones(N) / N  # Initialize sample weights
    weak_learners = []  # List to store weak learners
    learner_weights = []  # List to store weights of weak learners
    
    for t in range(T):
        # Train a weak learner with current sample weights
        parameters = train_weak_learner(X, y, W, alpha, epochs, layer_dimensions)
        predicted_labels = forward(X, parameters).flatten()

        # Calculate Pseudo-loss or error for current weak  learner
        pseudo_loss = np.sum(W * (y != (predicted_labels > 0.4).astype(int))) / 2
        print(f"Iteration {t}, Pseudo-loss: {pseudo_loss:.4f}")

        # If the error is too high, stop early
        if pseudo_loss >= 0.4:
            break
        # Calculate weight for current weak learner 
        learner_weight = pseudo_loss / (1 - pseudo_loss)
        print(f"Iteration {t}, Beta: {learner_weight:.4f}")

        # Update sample weights based on performance
        W *= np.exp(0.4 * (1 - (y == (predicted_labels > 0.4).astype(int))))
        W /= np.sum(W)
        print(f"Iteration {t}, Weights: {W}")

        # Store the weak learner parameters and its weight
        weak_learners.append(parameters)
        learner_weights.append(np.log(1 / learner_weight))
        
    # Return the list of weak learners and weights of them.
    return weak_learners, learner_weights

In [14]:
#Predict the class labels for a given set of data points.

def predict(X, weak_learners, weights):
    M = len(X) # Number of data points
    y_pred = np.zeros(M)
    for i in range(M):
        weighted_sum = np.sum([forward(X[i].reshape(1, -1), wl).flatten() * w for wl, w in zip(weak_learners, weights)])
        y_pred[i] = (weighted_sum > 0.4).astype(int)
    return y_pred


In [15]:
def hyperparameter_tuning(X, y, T, alpha_values, epoch_values, layer_configs):

    # Initialize variables to store the best hyperparameters
    best_alpha = None
    best_epochs = None
    best_layer_dimensions = None
    best_accuracy = 0
    
    # Split data into training and testing sets
    split_index = int(0.6 * len(X))
    X_train, X_val = X[:split_index], X[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]

    # Iterate through all combinations of hyperparameters
    for alpha in alpha_values:
        for epochs in epoch_values:
            for layer_dimensions in layer_configs:
                weak_learners, weights = AdaBoostM2(X_train, y_train, T, alpha, epochs, layer_dimensions)
                y_pred_val = predict(X_val, weak_learners, weights)
                accuracy_val = np.mean(y_pred_val == y_val)
                print(f"Alpha: {alpha}, Epochs: {epochs}, Layers: {layer_dimensions}, Validation Accuracy: {accuracy_val:.4f}")
                if accuracy_val > best_accuracy:
                    best_alpha = alpha
                    best_epochs = epochs
                    best_layer_dimensions = layer_dimensions
                    best_accuracy = accuracy_val
    
    return best_alpha, best_epochs, best_layer_dimensions


In [16]:
# Load the Student Performance dataset
data = pd.read_csv("student.csv")

# Select target variable (G3) 
# Convert categorical features to numerical for processing
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    data[col] = pd.factorize(data[col])[0]

# Define features (X) and target (y)
X = data.drop(columns=['G3']).values # G3 is final grade as per the dataset information
y = data['G3'].values

# Normalize the features values 
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Binarize target: Pass/Fail based on a threshold (e.g., G3 >= 10 is Pass)
y = (y >= 10).astype(int)


In [17]:
# Define the Hyperparameters to tune
alpha_values = [0.01, 0.05]
epoch_values = [20, 50]
layer_configs = [
    [X.shape[1], 16, 1],          # A simple architecture with 1 hidden layer
    [X.shape[1], 32, 16, 1],      # A moderately complex architecture with 2 hidden layers
]

T = 20  # Number of boosting iterations for weak learners

# Perform hyperparameter tuning to find best combination
best_alpha, best_epochs, best_layer_dimensions = hyperparameter_tuning(X, y, T, alpha_values, epoch_values, layer_configs)
print(f"Best Alpha: {best_alpha}, Best Epochs: {best_epochs}, Best Layer Configuration: {best_layer_dimensions}")

Iteration 0, Pseudo-loss: 0.0411
Iteration 0, Beta: 0.0429
Iteration 0, Weights: [0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.0036859  0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.0036859  0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.0036859  0.0

In [18]:
# Split the dataset into training and testing sets
split_index = int(0.6 * len(X)) # 60% training, 40% testing
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Train AdaBoost.M2 with the best hyperparameters
weak_learners, weights = AdaBoostM2(X_train, y_train, T, best_alpha, best_epochs, best_layer_dimensions)


Iteration 0, Pseudo-loss: 0.0411
Iteration 0, Beta: 0.0429
Iteration 0, Weights: [0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.0036859  0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.0036859  0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073 0.00247073
 0.00247073 0.00247073 0.00247073 0.0036859  0.0

In [19]:
# Evaluate performance on training  set
y_pred_train = predict(X_train, weak_learners, weights)
accuracy_train = np.mean(y_pred_train == y_train)
print(f"Training Accuracy: {accuracy_train * 100:.2f}%")



Training Accuracy: 91.77%


In [20]:
# Evaluate performance on testing set
y_pred_test = predict(X_test, weak_learners, weights)
accuracy_test = np.mean(y_pred_test == y_test)
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")


Test Accuracy: 73.85%
