This project is sourced in large part from the University of Waterloo's CS480/680 Introduction to Machine Learning course.

# Upload files in Google Colab
If you are running this Jupyter Notebook on Google Colab, uncomment and run this cell to upload the data files (train_inputs.csv, train_targets.csv, test_inputs.csv, test_targets.csv) to the Colab virtual machine. You will be prompted to select the files that you would like to upload.

If you are running this Jupyter Notebook on your computer, you can delete or ignore this cell.

In [None]:
# from google.colab import files
# uploaded = files.upload()
# %ls

# Import libraries 
Do not use any other Python library.

numpy - Linear algebra library for handling vectors and matrices, collectively processed as numpy arrays.

matplotlib - Graphing library for visualizing data and results.

sklearn - Machine learning library from which we will source some of our datasets. 

time - Simple library for timing code.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import fetch_openml
from time import time

# Uncomment below to allow matplotlib to display interactive plots
# %matplotlib widget

# Load Datasets

These functions load data for classification into RAM for python to use. Some load from files on the device, while some download data from the internet.

Inputs:

*   **plot**: boolean for whether to plot the data points

Outputs:

*   **train_inputs**: numpy array of N training data points x M features
*   **train_labels**: numpy array of N training labels
*   **test_inputs**: numpy array of N' test data points x M features
*   **test_labels**: numpy array of N' test labels

In [None]:
def load_synthetic_excel_data(plot=True, data_dir='classification-datasets/synthetic-one-dimension'):
    """Load the synthetic one-dimensional dataset from CSV files and subsample it.

    Reads train_inputs.csv, train_targets.csv, test_inputs.csv and
    test_targets.csv from ``data_dir``, then draws 20 training points and
    100 test points. The draw uses ``np.random.choice`` with its default
    settings (sampling with replacement) after seeding NumPy's global random
    generator with 42, so the returned subsets are reproducible.

    Inputs:
        plot: boolean for whether to plot the sampled data points.
        data_dir: directory containing the four CSV files. The default is the
            original hard-coded location; pass '.' when the files sit in the
            working directory (for example after uploading them on Colab).

    Outputs:
        train_inputs: numpy array of 20 training data points x 1 feature.
        train_targets: numpy array of 20 integer training labels.
        test_inputs: numpy array of 100 test data points x 1 feature.
        test_targets: numpy array of 100 integer test labels.
    """
    # Local import so this function does not depend on the notebook's import cell.
    import os

    def read_csv(file_name):
        return np.genfromtxt(os.path.join(data_dir, file_name), delimiter=',')

    # Inputs are stored as a flat column; reshape so they are N x 1 matrices.
    train_inputs = read_csv('train_inputs.csv').reshape(-1, 1)
    train_targets = read_csv('train_targets.csv').astype(int)
    test_inputs = read_csv('test_inputs.csv').reshape(-1, 1)
    test_targets = read_csv('test_targets.csv').astype(int)

    train_sample_count = 20
    test_sample_count = 100
    # Seed once, right before sampling: reading the files consumes no random
    # numbers, so a second, earlier seed call would be redundant.
    np.random.seed(42)
    train_samples = np.random.choice(train_inputs.shape[0], train_sample_count)
    test_samples = np.random.choice(test_inputs.shape[0], test_sample_count)
    train_inputs = train_inputs[train_samples]
    train_targets = train_targets[train_samples]
    test_inputs = test_inputs[test_samples]
    test_targets = test_targets[test_samples]

    if plot:
        plt.figure(figsize=(10, 5))
        plt.scatter(train_inputs, train_targets, color='blue', label='Train Data')
        plt.scatter(test_inputs, test_targets, color='red', label='Test Data')
        plt.xlabel('Input Feature')
        plt.ylabel('Target Class')
        plt.title('Synthetic One-Dimensional Data')
        plt.legend()
        plt.grid()
        plt.show()

    return train_inputs, train_targets, test_inputs, test_targets

def load_scikit_cancer_data(plot=True):
    """Load, standardize and subsample scikit-learn's breast cancer dataset.

    The first 100 rows form the test split and the remaining rows the
    training split. Features are standardized with the mean and standard
    deviation of the training split only, and the same statistics are applied
    to the test split so that both live in the same feature space and no
    test-set information leaks into preprocessing. Finally 20 training points
    and 100 test points are drawn with ``np.random.choice`` (default settings,
    i.e. with replacement) after seeding NumPy's global generator with 42.

    Inputs:
        plot: boolean for whether to plot the first two features of the
            sampled training points, coloured by class.

    Outputs:
        train_inputs: numpy array of 20 training data points x M features.
        train_targets: numpy array of 20 training labels.
        test_inputs: numpy array of 100 test data points x M features.
        test_targets: numpy array of 100 test labels.
    """
    # Load the breast cancer diagnostic dataset from scikit-learn
    cancer = load_breast_cancer()
    inputs = cancer.data
    targets = cancer.target
    test_inputs = inputs[:100]
    test_targets = targets[:100]
    train_inputs = inputs[100:]
    train_targets = targets[100:]

    # Normalize inputs using statistics of the training split only.
    feature_mean = np.mean(train_inputs, axis=0)
    feature_std = np.std(train_inputs, axis=0)
    # A constant feature has zero spread; leave it centred instead of dividing by zero.
    feature_std[feature_std == 0] = 1.0
    train_inputs = (train_inputs - feature_mean) / feature_std
    test_inputs = (test_inputs - feature_mean) / feature_std

    train_sample_count = 20
    test_sample_count = 100
    np.random.seed(42)
    train_samples = np.random.choice(train_inputs.shape[0], train_sample_count)
    test_samples = np.random.choice(test_inputs.shape[0], test_sample_count)
    train_inputs = train_inputs[train_samples]
    train_targets = train_targets[train_samples]
    test_inputs = test_inputs[test_samples]
    test_targets = test_targets[test_samples]

    if plot:
        plt.figure(figsize=(10, 5))
        plt.scatter(train_inputs[:, 0], train_inputs[:, 1], c=train_targets, cmap='viridis', edgecolors='k', s=20)
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.title('Breast Cancer Diagnostic Data')
        plt.colorbar(label='Target Class')
        plt.grid()
        plt.show()

    return train_inputs, train_targets, test_inputs, test_targets

def load_scikit_49_digits_data(plot=True):
    """Load a binary 4-versus-9 subset of scikit-learn's optical digits data.

    Only images of the digits 4 and 9 are kept; 4 is relabelled as class 0 and
    9 as class 1. The first 100 kept rows form the test split, the rest the
    training split. Pixel values are divided by 16, and 20 training / 100 test
    points are drawn with ``np.random.choice`` (default settings) after
    seeding NumPy's global generator with 42.

    Inputs:
        plot: boolean for whether to show the first ten sampled training images.

    Outputs:
        train_inputs: numpy array of 20 training data points x 64 features.
        train_targets: numpy array of 20 training labels (0 for a 4, 1 for a 9).
        test_inputs: numpy array of 100 test data points x 64 features.
        test_targets: numpy array of 100 test labels (0 for a 4, 1 for a 9).
    """
    dataset = load_digits()

    # Keep only the 4's and 9's, then map 4 -> 0 and 9 -> 1.
    keep = np.isin(dataset.target, (4, 9))
    pixels = dataset.data[keep]
    classes = dataset.target[keep]
    classes = (classes == 9).astype(classes.dtype)

    # The first `holdout` rows are the test split; pixels are divided by 16.
    holdout = 100
    test_inputs, train_inputs = pixels[:holdout] / 16, pixels[holdout:] / 16
    test_targets, train_targets = classes[:holdout], classes[holdout:]

    # Reproducible subsample: training indices are drawn before test indices.
    np.random.seed(42)
    train_pick = np.random.choice(train_inputs.shape[0], 20)
    test_pick = np.random.choice(test_inputs.shape[0], 100)
    train_inputs, train_targets = train_inputs[train_pick], train_targets[train_pick]
    test_inputs, test_targets = test_inputs[test_pick], test_targets[test_pick]

    if not plot:
        return train_inputs, train_targets, test_inputs, test_targets

    # Show the first ten sampled training digits as 8 x 8 images.
    inputs_reshaped = train_inputs.reshape(-1, 8, 8)
    plt.figure(figsize=(10, 5))
    for i in range(10):
        plt.subplot(3, 4, i + 1)
        plt.imshow(inputs_reshaped[i], cmap='gray')
        plt.title(f'Handwritten {4 if train_targets[i]==0 else 9}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

    return train_inputs, train_targets, test_inputs, test_targets

def load_scikit_mini_digits_data(plot=True):
    """Load a small ten-class sample of scikit-learn's optical digits data.

    The first 100 rows form the test split and the rest the training split.
    Pixel values are divided by 16, and 100 training / 100 test points are
    drawn with ``np.random.choice`` (default settings) after seeding NumPy's
    global generator with 42.

    Inputs:
        plot: boolean for whether to show the first ten sampled training images.

    Outputs:
        train_inputs: numpy array of 100 training data points x 64 features.
        train_targets: numpy array of 100 training labels (digits 0-9).
        test_inputs: numpy array of 100 test data points x 64 features.
        test_targets: numpy array of 100 test labels (digits 0-9).
    """
    dataset = load_digits()
    pixels, classes = dataset.data, dataset.target

    # The first `holdout` rows are the test split; pixels are divided by 16.
    holdout = 100
    test_inputs, train_inputs = pixels[:holdout] / 16, pixels[holdout:] / 16
    test_targets, train_targets = classes[:holdout], classes[holdout:]

    # Reproducible subsample: training indices are drawn before test indices.
    np.random.seed(42)
    train_pick = np.random.choice(train_inputs.shape[0], 100)
    test_pick = np.random.choice(test_inputs.shape[0], 100)
    train_inputs, train_targets = train_inputs[train_pick], train_targets[train_pick]
    test_inputs, test_targets = test_inputs[test_pick], test_targets[test_pick]

    if not plot:
        return train_inputs, train_targets, test_inputs, test_targets

    # Show the first ten sampled training digits as 8 x 8 images.
    inputs_reshaped = train_inputs.reshape(-1, 8, 8)
    plt.figure(figsize=(10, 5))
    for i in range(10):
        plt.subplot(3, 4, i + 1)
        plt.imshow(inputs_reshaped[i], cmap='gray')
        plt.title(f'Handwritten {train_targets[i]}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

    return train_inputs, train_targets, test_inputs, test_targets

# Function: softmax

This function implements the vectorized logistic softmax function.

Input:

*   **input**: vector of inputs (N x K numpy array of floats)

Output:
*   **output**: vector of outputs (N x K numpy array of floats)

In [None]:
def softmax(input):
    """Apply the softmax function along the last axis in a numerically stable way.

    Inputs:
        input: scores to normalize (N x K numpy array of floats; a single
            length-K vector is also accepted).

    Outputs:
        output: array of the same shape whose entries along the last axis are
            non-negative and sum to 1.
    """
    scores = np.asarray(input, dtype=float)
    # Softmax is invariant to subtracting a per-row constant; subtracting the
    # row maximum makes the largest exponent exp(0) = 1, so np.exp cannot overflow.
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exponentials = np.exp(shifted)
    output = exponentials / np.sum(exponentials, axis=-1, keepdims=True)
    return output

# Function: predict_logistic_regression

This function uses a vector of weights to make predictions for a set of inputs.  
The prediction for each data point is a distribution over the labels. 
Assume that there is one column for each class.

Inputs:
*   **inputs**: matrix of input data points for which we want to make a prediction (numpy array of N data points x M+1 features)
*   **weights**: vector of weights (numpy array of M+1 x K weights)

Output:
*   **predicted_probabilities**: matrix of predicted probabilities (numpy array of N data points x K labels)

In [None]:
def predict_logistic_regression(inputs, weights):
    """Predict a distribution over the classes for every data point.

    Inputs:
        inputs: data points to predict for (numpy array of N data points x
            M+1 features).
        weights: model weights, one column per class (numpy array of M+1 x K).

    Outputs:
        predicted_probabilities: numpy array of N data points x K classes;
            each row is the softmax of that point's linear scores.
    """
    # Compute the linear combination of inputs and weights
    linear_scores = np.asarray(inputs) @ np.asarray(weights)
    # Apply the softmax function to get predicted probabilities
    predicted_probabilities = softmax(linear_scores)
    return predicted_probabilities

# Function eval_logistic_regression

This function evaluates a set of predictions by computing the negative log probabilities of the labels and the accuracy (percentage of correctly predicted labels).  Assume that there are only two possible labels {0,1}.  A data point is correctly labeled when the probability of the target label is >= 0.5.

Inputs:
*   **inputs**: matrix of input data points for which we will evaluate the predictions (numpy array of N data points x M+1 features)
*   **weights**: vector of weights (numpy array of M+1 x K weights)
*   **labels**: vector of target labels associated with the inputs (numpy array of N x K labels)

Outputs:
*   **neg_log_prob**: negative log probability of the set of predictions (float)
*   **accuracy**: fraction of correctly labeled data points, between 0 and 1 (float)

In [None]:
def eval_logistic_regression(inputs, weights, labels):
    """Evaluate a logistic regression model on a labelled set of data points.

    Inputs:
        inputs: data points to evaluate on (numpy array of N data points x
            M+1 features).
        weights: model weights, one column per class (numpy array of M+1 x K).
        labels: target labels, either one-hot (numpy array of N x K) or
            0-indexed integer class ids (numpy array of N).

    Outputs:
        neg_log_prob: negative log probability of the target labels, summed
            over all data points (float).
        accuracy: fraction of correctly labelled data points, in [0, 1]
            (float). A point counts as correct when its target class is at
            least as probable as every other class; with two classes this is
            the "probability of the target label >= 0.5" rule.

    Raises:
        ValueError: if ``inputs`` contains no data points.
    """
    inputs = np.asarray(inputs)
    weights = np.asarray(weights)
    labels = np.asarray(labels)
    if inputs.shape[0] == 0:
        raise ValueError("eval_logistic_regression needs at least one data point.")

    # Compute the linear combination of inputs and weights
    linear_scores = inputs @ weights

    # Log-softmax via the log-sum-exp trick: shifting by the row maximum keeps
    # np.exp from overflowing, and working in log space avoids log(0) when a
    # probability underflows.
    shifted = linear_scores - np.max(linear_scores, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Accept integer class ids by expanding them to one-hot rows.
    if labels.ndim == 1:
        one_hot = np.eye(weights.shape[1])[labels.astype(int)]
    else:
        one_hot = labels

    neg_log_prob = float(-np.sum(one_hot * log_probs))

    # Compute accuracy from the predicted probabilities
    probs = np.exp(log_probs)
    target_probs = np.sum(one_hot * probs, axis=1)
    accuracy = float(np.mean(target_probs >= np.max(probs, axis=1)))
    return neg_log_prob, accuracy

# Function: initialize_weights

This function initializes the weights uniformly at random in the interval [-0.01,0.01]

Input:
*   **n_features**: # of weights to be initialized per class (integer)
*   **n_classes**: # of classes to classify into (integer, default 2)

Output:
*   **random_weights**: vector of weights (M x K numpy array of floats)

In [None]:
def initialize_weights(n_features, n_classes=2):
    """Draw initial weights uniformly at random from the interval [-0.01, 0.01].

    The values come from NumPy's global random generator, so seeding it
    beforehand makes the initialization reproducible.

    Inputs:
        n_features: number of weights per class (integer).
        n_classes: number of classes to classify into (integer).

    Outputs:
        random_weights: numpy array of n_features x n_classes floats.
    """
    random_weights = np.random.uniform(-0.01, 0.01, size=(n_features, n_classes))
    return random_weights

# Function train_logistic_regression_gradient_descent

This function optimizes a set of weights for logistic regression based on a training set using the gradient descent method.

Inputs:
*   **train_inputs**: matrix of input training points (numpy array of N data points x M+1 features)
*   **train_labels**: vector of labels associated with the inputs (numpy array of N labels)
*   **eta_hyperparam**: learning rate for the gradient descent optimization (scalar)
*   **lambda_hyperparam**: lambda hyperparameter used to adjust the overall degree of regularization (scalar)
*   **lasso_ridge_ratio**: hyperparameter used to adjust the amount of lasso (L1) regularization versus ridge (L2) regularization (scalar 0-1)
*   **num_iters**: number of iterations of the gradient descent algorithm to perform (integer)

Output:
*   **weights**: vector of weights that have been optimized (numpy array of M+1 x K weights)



In [None]:
def train_logistic_regression_gradient_descent(train_inputs, train_labels, eta_hyperparam=0.01, lambda_hyperparam=0.0, lasso_ridge_ratio=1.0, num_iters=1000):
    """Fit multinomial logistic regression weights by batch gradient descent.

    Each iteration follows the gradient of the average cross-entropy over the
    training points plus an elastic-net penalty
    ``lambda * (ratio * |W| + (1 - ratio) * 0.5 * W**2)``.

    NOTE(review): two conventions here are assumptions to confirm against the
    course material. (1) ``lasso_ridge_ratio = 1`` is treated as pure lasso
    (L1) and ``0`` as pure ridge (L2). (2) The bias weights are left out of
    both penalties; the bias is taken to be the first feature (row 0 of the
    weights), matching the "bias term at the front" layout used elsewhere in
    this file.

    Inputs:
        train_inputs: training points (numpy array of N data points x M+1
            features, bias column first).
        train_labels: labels for the inputs, either one-hot (numpy array of
            N x K) or 0-indexed integer class ids (numpy array of N).
        eta_hyperparam: learning rate for gradient descent (scalar).
        lambda_hyperparam: overall regularization strength (scalar).
        lasso_ridge_ratio: share of the penalty given to L1 versus L2
            (scalar in [0, 1]).
        num_iters: number of gradient descent iterations (integer).

    Outputs:
        weights: optimized weights (numpy array of M+1 x K).

    Raises:
        ValueError: if ``train_inputs`` contains no data points.
    """
    inputs = np.asarray(train_inputs, dtype=float)
    labels = np.asarray(train_labels)
    n_points, n_features = inputs.shape
    if n_points == 0:
        raise ValueError("train_logistic_regression_gradient_descent needs at least one data point.")

    # Accept integer class ids by expanding them to one-hot rows; use at
    # least two classes so a single-class sample still yields a valid model.
    if labels.ndim == 1:
        class_ids = labels.astype(int)
        n_classes = max(2, int(class_ids.max()) + 1)
        one_hot = np.eye(n_classes)[class_ids]
    else:
        one_hot = labels.astype(float)
        n_classes = one_hot.shape[1]

    l1_strength = lambda_hyperparam * lasso_ridge_ratio
    l2_strength = lambda_hyperparam * (1.0 - lasso_ridge_ratio)

    # Initialize weights
    weights = initialize_weights(n_features, n_classes)
    for _ in range(num_iters):
        # Compute the predicted probabilities
        probabilities = predict_logistic_regression(inputs, weights)
        # Compute the gradient of the average cross-entropy
        gradient = inputs.T @ (probabilities - one_hot) / n_points
        # Lasso uses the subgradient sign(w); ridge uses the gradient w.
        penalty = l1_strength * np.sign(weights) + l2_strength * weights
        # Do not regularize the bias term (first row).
        penalty[0, :] = 0.0
        # Update the weights
        weights = weights - eta_hyperparam * (gradient + penalty)
    return weights

# Cross validation functions

These functions perform k-fold cross validation to search for optimal hyperparameters.

Inputs:
*   **inputs**: matrix of input points (numpy array of N data points by M+1 features)
*   **labels**: vector of labels associated with the inputs (numpy array of N labels)
*   **k_folds**: # of folds in cross-validation (integer)
*   **lasso_ridge_ratio**: hyperparameter used to adjust the amount of lasso (L1) regularization versus ridge (L2) regularization (scalar 0-1)
*   **num_iterations**: number of iterations of the gradient descent algorithm to perform (integer)

Outputs:
*   **neg_log_probabilities**: dict of hyperparameter to negative log probabilities for the corresponding hyperparameter (float)
*   **accuracies**: dict of hyperparameter to average accuracy for the corresponding hyperparameter (float)

# Function cross_validation_eta

This function performs k-fold cross validation to determine the best eta hyperparameter for logistic regression.

Additional Inputs:
*   **eta_hyperparams**: list of learning rate hyperparameters where each hyperparameter is a different eta value (list of floats)
*   **lambda_hyperparam**: lambda hyperparameter used to adjust the overall degree of regularization (scalar)

# Function cross_validation_lambda

This function performs k-fold cross validation to determine the best lambda hyperparameter for logistic regression.

Additional Inputs:
*   **eta_hyperparam**: learning rate for the gradient descent optimization (scalar)
*   **lambda_hyperparams**: list of lambda hyperparameter used to adjust the overall degree of regularization (list of floats)

In [None]:
def _k_fold_cross_validation(inputs, labels, k_folds, candidates, num_iterations):
    """Score each candidate hyperparameter setting with k-fold cross validation.

    ``candidates`` is a list of ``(hyperparam, train_kwargs)`` pairs, where
    ``train_kwargs`` are the keyword arguments handed to
    ``train_logistic_regression_gradient_descent`` for that setting. The data
    is shuffled once (NumPy's global random generator), so every candidate is
    scored on the same folds. Returns two dicts keyed by ``hyperparam``: the
    validation negative log probability and the validation accuracy, each
    averaged over the folds.
    """
    inputs = np.asarray(inputs)
    labels = np.asarray(labels)
    n_points = inputs.shape[0]
    if not 2 <= k_folds <= n_points:
        raise ValueError("k_folds must be between 2 and the number of data points.")

    # Expand integer class ids to one-hot once, using the full label set, so
    # every fold agrees on the number of classes.
    if labels.ndim == 1:
        class_ids = labels.astype(int)
        labels = np.eye(max(2, int(class_ids.max()) + 1))[class_ids]

    # Shuffle the data
    shuffled_indices = np.random.permutation(n_points)
    folds = np.array_split(shuffled_indices, k_folds)

    neg_log_probabilities = {}
    accuracies = {}
    # Perform k-fold cross-validation
    for hyperparam, train_kwargs in candidates:
        fold_neg_log_probs = []
        fold_accuracies = []
        for fold in range(k_folds):
            # Split the data into training and validation sets
            val_indices = folds[fold]
            train_indices = np.concatenate(folds[:fold] + folds[fold + 1:])
            val_inputs, val_labels = inputs[val_indices], labels[val_indices]
            # Train the model
            weights = train_logistic_regression_gradient_descent(
                inputs[train_indices],
                labels[train_indices],
                num_iters=num_iterations,
                **train_kwargs,
            )
            # Evaluate on the validation set
            neg_log_prob, accuracy = eval_logistic_regression(val_inputs, weights, val_labels)
            fold_neg_log_probs.append(neg_log_prob)
            fold_accuracies.append(accuracy)
        neg_log_probabilities[hyperparam] = float(np.mean(fold_neg_log_probs))
        accuracies[hyperparam] = float(np.mean(fold_accuracies))
    return neg_log_probabilities, accuracies


def cross_validation_eta(inputs, labels, k_folds, eta_hyperparams, lambda_hyperparam, lasso_ridge_ratio, num_iterations):
    """Run k-fold cross validation over candidate learning rates.

    Inputs:
        inputs: input points (numpy array of N data points x M+1 features).
        labels: labels for the inputs (one-hot N x K array or N integer ids).
        k_folds: number of folds (integer, 2 <= k_folds <= N).
        eta_hyperparams: learning rates to compare (list of floats).
        lambda_hyperparam: regularization strength used for every run (scalar).
        lasso_ridge_ratio: L1 versus L2 share of the penalty (scalar 0-1).
        num_iterations: gradient descent iterations per training run (integer).

    Outputs:
        neg_log_probabilities: dict mapping each eta to its validation
            negative log probability averaged over the folds.
        accuracies: dict mapping each eta to its validation accuracy averaged
            over the folds.

    Raises:
        ValueError: if k_folds is smaller than 2 or larger than N.
    """
    candidates = [
        (
            eta_hyperparam,
            {
                'eta_hyperparam': eta_hyperparam,
                'lambda_hyperparam': lambda_hyperparam,
                'lasso_ridge_ratio': lasso_ridge_ratio,
            },
        )
        for eta_hyperparam in eta_hyperparams
    ]
    return _k_fold_cross_validation(inputs, labels, k_folds, candidates, num_iterations)


def cross_validation_lambda(inputs, labels, k_folds, eta_hyperparam, lambda_hyperparams, lasso_ridge_ratio, num_iterations):
    """Run k-fold cross validation over candidate regularization strengths.

    Inputs:
        inputs: input points (numpy array of N data points x M+1 features).
        labels: labels for the inputs (one-hot N x K array or N integer ids).
        k_folds: number of folds (integer, 2 <= k_folds <= N).
        eta_hyperparam: learning rate used for every run (scalar).
        lambda_hyperparams: regularization strengths to compare (list of floats).
        lasso_ridge_ratio: L1 versus L2 share of the penalty (scalar 0-1).
        num_iterations: gradient descent iterations per training run (integer).

    Outputs:
        neg_log_probabilities: dict mapping each lambda to its validation
            negative log probability averaged over the folds.
        accuracies: dict mapping each lambda to its validation accuracy
            averaged over the folds.

    Raises:
        ValueError: if k_folds is smaller than 2 or larger than N.
    """
    candidates = [
        (
            lambda_hyperparam,
            {
                'eta_hyperparam': eta_hyperparam,
                'lambda_hyperparam': lambda_hyperparam,
                'lasso_ridge_ratio': lasso_ridge_ratio,
            },
        )
        for lambda_hyperparam in lambda_hyperparams
    ]
    return _k_fold_cross_validation(inputs, labels, k_folds, candidates, num_iterations)

# Functions: plot_eta_vs_metrics and plot_lambda_vs_metrics

Functions that plot the negative log probabilities and accuracies obtained by cross validation against the eta (learning rate) or lambda (regularization) hyperparameter values of logistic regression

Inputs:
*   **neg_log_probabilities**: vector of negative log probabilities for the corresponding hyperparameters (numpy array of floats)
*   **accuracies**: vector of fractions of correct predictions for the corresponding hyperparameters (numpy array of floats)
*   **hyperparams**: list of hyperparameters where each hyperparameter is a different eta or lambda value (list of floats)

# Function: plot_decision_boundary

Function that plots the decision boundary for 1D input data

Inputs:
*   **inputs**: matrix of input points (numpy array of N data points by M+1 features)
*   **labels**: vector of labels associated with the inputs (numpy array of N data points by K labels)
*   **weights**: vector of trained weights (numpy array of M+1 x K weights)

In [None]:
def plot_eta_vs_metrics(neg_log_probabilities,accuracies,hyperparams):
    """Plot negative log probability and accuracy against the learning rate eta.

    Inputs:
        neg_log_probabilities: negative log probability per hyperparameter,
            either a sequence aligned with ``hyperparams`` or a dict keyed by
            hyperparameter value (the form the cross validation functions are
            documented to return).
        accuracies: accuracy per hyperparameter, in the same two accepted forms.
        hyperparams: eta values to show on the x-axis (list of floats).
    """
    hyperparams = list(hyperparams)
    panels = (
        ('negative log probability', neg_log_probabilities),
        ('accuracy', accuracies),
    )
    plt.figure(figsize=(10, 5))
    for position, (y_label, values) in enumerate(panels, start=1):
        # A dict is reordered to follow `hyperparams`, so x and y stay aligned.
        if isinstance(values, dict):
            values = [values[hyperparam] for hyperparam in hyperparams]
        plt.subplot(1, 2, position)
        plt.plot(hyperparams, values)
        plt.ylabel(y_label)
        plt.xlabel('eta')
        plt.grid()
    plt.show()

def plot_lambda_vs_metrics(neg_log_probabilities,accuracies,hyperparams):
    """Plot negative log probability and accuracy against the regularization strength lambda.

    Inputs:
        neg_log_probabilities: negative log probability per hyperparameter,
            either a sequence aligned with ``hyperparams`` or a dict keyed by
            hyperparameter value (the form the cross validation functions are
            documented to return).
        accuracies: accuracy per hyperparameter, in the same two accepted forms.
        hyperparams: lambda values to show on the x-axis (list of floats).
    """
    hyperparams = list(hyperparams)
    panels = (
        ('negative log probability', neg_log_probabilities),
        ('accuracy', accuracies),
    )
    plt.figure(figsize=(10, 5))
    for position, (y_label, values) in enumerate(panels, start=1):
        # A dict is reordered to follow `hyperparams`, so x and y stay aligned.
        if isinstance(values, dict):
            values = [values[hyperparam] for hyperparam in hyperparams]
        plt.subplot(1, 2, position)
        plt.plot(hyperparams, values)
        plt.ylabel(y_label)
        plt.xlabel('lambda')
        plt.grid()
    plt.show()

def plot_decision_boundary(inputs, labels, weights):
    """Plot the predicted probability of class 1 for one-dimensional inputs.

    Draws the model's probability of class 1 (second weight column, since
    classes are 0-indexed) as a curve over the feature range [-5, 15] and
    overlays the given data points at their predicted probability, coloured
    by their true class.

    Inputs:
        inputs: input points (numpy array of N data points x 2 columns: the
            bias column of ones followed by the single feature).
        labels: true labels, either one-hot (numpy array of N x K) or
            0-indexed integer class ids (numpy array of N).
        weights: trained weights (numpy array of 2 x K, with K >= 2).
    """
    inputs = np.asarray(inputs)
    # Check that the input is a bias column plus exactly one feature
    if inputs.ndim != 2 or inputs.shape[1] != 2:
        print("plot_decision_boundary only accepts 1-dimensional input + bias.")
        return

    labels = np.asarray(labels)
    # Colour by the class id; one-hot rows are collapsed to the index of their 1.
    true_classes = labels if labels.ndim == 1 else np.argmax(labels, axis=1)

    probs = predict_logistic_regression(inputs, weights)
    x_values = np.linspace(-5, 15, 100)
    curve_inputs = np.column_stack((np.ones(x_values.shape), x_values))
    # Column 1 holds the probabilities for class 1 (column 0 is class 0).
    y_values = predict_logistic_regression(curve_inputs, weights)[:, 1]

    # 2d plot
    plt.figure()
    # Draw logistic curve under the data
    plt.plot(x_values, y_values, color='red', label='Logistic Curve', zorder=1)
    plt.scatter(inputs[:, 1], probs[:, 1], c=true_classes, cmap='viridis', edgecolors='k', s=20, zorder=2)
    plt.xlabel('Feature 1')
    plt.ylabel('Predicted Probability of Class 1')
    plt.colorbar(label='True Class')
    plt.title('Test Data with Predictions')
    # Show the curve's label and render the figure, as the other plot helpers do.
    plt.legend()
    plt.show()

# Main Logistic Regression code

Use this section to train and test your models. You will do the following:

Load data.

Use k-fold cross validation to find the best lambda value for logistic regression.

Plot the negative log probabilities for different lambda values.

Test logistic regression on full training data with the best lambda value.

In [None]:
# This is one way you can test multiple at once, but implement it however you like
# Template driver: run every algorithm on every dataset.
# NOTE(review): `datasets` and `algorithms` are not defined in the visible part
# of this notebook; they must be dicts created before this cell runs. Judging
# by the lookups below, each dataset entry needs the keys "load", "lambda" and
# "etas", and each algorithm entry the keys "train", "validation" and
# "regularization" - confirm against the cell that defines them.
for dataset_name, dataset_info in datasets.items():
    for alg_name, alg in algorithms.items():
        # Unpack the per-dataset settings.
        load_dataset = dataset_info["load"]
        # NOTE(review): the variable name below is misspelled
        # ("hyperpraram"); code added later must use the same spelling or the
        # name should be corrected everywhere at once.
        lambda_hyperpraram = dataset_info["lambda"]
        eta_hyperparams = dataset_info["etas"]
        # Unpack the per-algorithm settings.
        run_train = alg["train"]
        run_validation = alg["validation"]
        regularization = alg["regularization"]
        
        # The remaining steps are left to be implemented:
        # Convert classes to one-hot encoding. You may assume labels are 0-indexed integers
        # Add bias term (1) at the front of each data point
        # Run cross validation
        # Find best and worst parameters
        # Plot results
        # train and evaluate with best parameter