In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers
        self.weights = [np.random.randn(layers[i], layers[i+1]) for i in range(len(layers)-1)]
        self.biases = [np.zeros((1, layers[i+1])) for i in range(len(layers)-1)]
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)
    
    def forward_pass(self, X):
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            A = self.sigmoid(z)
            activations.append(self.sigmoid(z))
        return activations

    def backward_pass(self, X, Y, activations):
        deltas = []
        delta = (activations[-1] - Y)  * self.sigmoid_derivative(activations[-1])
        deltas.append(delta)
        print(f"Shape of delta_out: {delta.shape}")
        for i in range(len(self.layers) - 2, 0, -1):
            delta = np.dot(deltas[0], self.weights[i].T) * self.sigmoid_derivative(activations[i])
            deltas.insert(0, delta)
            print(f"Shape of delta_{i}: {delta.shape}")
            print(f"Shape of weights_{i}.T: {self.weights[i].T.shape}")
        return deltas

    
    def compute_gradients(self, activations, deltas):
        gradients_weights = [np.dot(activations[i].T, deltas[i]) for i in range(len(self.layers) - 1)]
        gradients_biases = [np.sum(deltas[i], axis=0) for i in range(len(self.layers) - 1)]
        return gradients_weights, gradients_biases
    
    def update_weights(self, gradients_weights, gradients_biases, learning_rate):
        self.weights = [w - learning_rate * gw for w, gw in zip(self.weights, gradients_weights)]
        self.biases = [b - learning_rate * gb for b, gb in zip(self.biases, gradients_biases)]
    

    def train(self, X, Y, learning_rate, lam, max_iterations, epsilon):
        print(f"Shape of X_house_votes: {X.shape}")
        print(f"Shape of Y_house_votes: {Y.shape}")
        for iteration in range(max_iterations):
            activations = self.forward_pass(X)
            deltas = self.backward_pass(X, Y, activations)
            gradients_weights, gradients_biases = self.compute_gradients(activations, deltas)
            self.update_weights(gradients_weights, gradients_biases, learning_rate)
            # Reshape Y
            Y = Y[:, np.newaxis]
            # Compute cost function
            J = np.mean(np.square(activations[-1] - Y))
            if J < epsilon:
                print(f"Converged at cost :{J} while Epsilon:{epsilon} ")
                return J
        return J


    def accuracy(self, y_true, y_pred):
        correct = np.sum(np.all(y_true == y_pred, axis=1))
        return correct / len(y_true)


    def f1_score(self, y_true, y_pred):
        tp = np.sum(np.logical_and(y_true, y_pred))
        fp = np.sum(np.logical_and(np.logical_not(y_true), y_pred))
        fn = np.sum(np.logical_and(y_true, np.logical_not(y_pred)))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return f1


    def evaluate(self, X_test, y_test, J):
        activations = self.forward_pass(X_test)[-1]
        y_pred = (activations > 0.5).astype(int)
        acc = self.accuracy(y_test, y_pred)
        f1 = self.f1_score(y_test, y_pred)
        return J, acc, f1
    
    @staticmethod
    def k_fold_cross_validation(X, y, architectures, regularization_params, learning_rate, max_iterations, epsilon):
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        results_accuracy = {}
        results_f1_score = {}
        results_J_cost = {}

        for arch in architectures:
            for lam in regularization_params:
                accuracy_list = []
                f1_score_list = []
                J_list = []
                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    y_train, y_test = y[train_index], y[test_index]

                    model = NeuralNetwork(arch)
                    J = model.train(X_train, y_train, learning_rate=learning_rate, lam=lam, max_iterations=max_iterations, epsilon=epsilon)
                    J, accuracy, f1_score = model.evaluate(X_test, y_test, J)
                    accuracy_list.append(accuracy)
                    f1_score_list.append(f1_score)
                    J_list.append(J)

                mean_accuracy = np.mean(accuracy_list)
                mean_f1_score = np.mean(f1_score_list)
                mean_J_cost = np.mean(J_list)

                results_accuracy[(str(arch), lam)] = mean_accuracy
                results_f1_score[(str(arch), lam)] = mean_f1_score
                results_J_cost[(str(arch), lam)] = mean_J_cost

        return results_accuracy, results_f1_score, results_J_cost

# Load dataset
df_house_votes = pd.read_csv("/Users/noshitha/Downloads/hw4/datasets/hw3_house_votes_84.csv", delimiter=",")

# Extract features and target variable
X = df_house_votes.drop(columns=['class'])  # Features
y = df_house_votes['class']  # Target variable

# One-hot encode categorical variables
encoder = OneHotEncoder()
Y_encoded = encoder.fit_transform(y.values.reshape(-1, 1)).toarray()

# Convert sparse matrix to DataFrame
X_encoded = encoder.fit_transform(X)
X_house_votes = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names(X.columns))

# Define model architectures and regularization parameters
architectures = [
    [X_house_votes.shape[1], 2, 2, Y_encoded.shape[1]],  # Architecture with fewer neurons
    [X_house_votes.shape[1], 5, 4, Y_encoded.shape[1]],  # Architecture with a moderate number of neurons
    [X_house_votes.shape[1], 10, 8, Y_encoded.shape[1]],  # Architecture with a higher number of neurons
    [X_house_votes.shape[1], 5, 2, Y_encoded.shape[1]],  # Architecture with fewer layers
    [X_house_votes.shape[1], 5, 5, Y_encoded.shape[1]],  # Architecture with more layers
    [X_house_votes.shape[1], 10, 8, 5, Y_encoded.shape[1]]  # Architecture with more layers and neurons
]

regularization_params = [0.01, 0.1, 1.0]  # Example regularization parameters

# Perform stratified k-fold cross-validation
results_accuracy, results_f1_score, results_J_cost = NeuralNetwork.k_fold_cross_validation(X_house_votes, y, architectures, regularization_params, learning_rate=0.01, max_iterations=1000, epsilon=0.005)

# Convert the results into DataFrames for tabular representation
accuracy_df = pd.DataFrame(list(results_accuracy.items()), columns=['Architecture, Lambda', 'Mean Accuracy'])
f1_score_df = pd.DataFrame(list(results_f1_score.items()), columns=['Architecture, Lambda', 'Mean F1 Score'])
J_cost_df = pd.DataFrame(list(results_J_cost.items()), columns=['Architecture, Lambda', 'Mean J Cost'])

print("Mean Accuracy Results:")
print(accuracy_df)
print("\nMean F1 Score Results:")
print(f1_score_df)
print("\nMean J cost Results:")
print(J_cost_df)


NameError: name 'Y_train_encoded' is not defined

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers
        self.weights = [np.random.randn(layers[i], layers[i+1]) for i in range(len(layers)-1)]
        self.biases = [np.zeros((1, layers[i+1])) for i in range(len(layers)-1)]
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)
    
    def forward_pass(self, X):
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            activations.append(self.sigmoid(z))
        return activations
    
    def backward_pass(self, X, Y, activations):
        deltas = [(activations[-1] - Y) * self.sigmoid_derivative(activations[-1])]
        for i in range(len(self.layers) - 2, 0, -1):
            delta = np.dot(deltas[0], self.weights[i].T) * self.sigmoid_derivative(activations[i])
            deltas.insert(0, delta)
        return deltas

    
    def compute_gradients(self, activations, deltas):
        gradients_weights = [np.dot(activations[i].T, deltas[i]) for i in range(len(self.layers) - 1)]
        gradients_biases = [np.sum(deltas[i], axis=0) for i in range(len(self.layers) - 1)]
        return gradients_weights, gradients_biases
    
    def update_weights(self, gradients_weights, gradients_biases, learning_rate):
        self.weights = [w - learning_rate * gw for w, gw in zip(self.weights, gradients_weights)]
        self.biases = [b - learning_rate * gb for b, gb in zip(self.biases, gradients_biases)]
    

    def train(self, X, Y, learning_rate, lam, max_iterations, epsilon):
        print(f"Shape of X_house_votes: {X.shape}")
        print(f"Shape of Y_house_votes: {Y.shape}")
        for iteration in range(max_iterations):
            activations = self.forward_pass(X)
            deltas = self.backward_pass(X, Y, activations)
            gradients_weights, gradients_biases = self.compute_gradients(activations, deltas)
            self.update_weights(gradients_weights, gradients_biases, learning_rate)
            # Reshape Y
            Y = Y[:, np.newaxis]
            # Compute cost function
            J = np.mean(np.square(activations[-1] - Y))
            if J < epsilon:
                print(f"Converged at cost :{J} while Epsilon:{epsilon} ")
                return J
        return J


    def accuracy(self, y_true, y_pred):
        correct = np.sum(np.all(y_true == y_pred, axis=1))
        return correct / len(y_true)


    def f1_score(self, y_true, y_pred):
        tp = np.sum(np.logical_and(y_true, y_pred))
        fp = np.sum(np.logical_and(np.logical_not(y_true), y_pred))
        fn = np.sum(np.logical_and(y_true, np.logical_not(y_pred)))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return f1


    def evaluate(self, X_test, y_test, J):
        activations = self.forward_pass(X_test)[-1]
        y_pred = (activations > 0.5).astype(int)
        acc = self.accuracy(y_test, y_pred)
        f1 = self.f1_score(y_test, y_pred)
        return J, acc, f1
    
    @staticmethod
    def k_fold_cross_validation(X, y, architectures, regularization_params, learning_rate, max_iterations, epsilon):
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        results_accuracy = {}
        results_f1_score = {}
        results_J_cost = {}

        for arch in architectures:
            for lam in regularization_params:
                accuracy_list = []
                f1_score_list = []
                J_list = []
                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    y_train, y_test = y[train_index], y[test_index]

                    mean = np.mean(X_train, axis=0)
                    std = np.std(X_train, axis=0)
                    X_train_normalized = (X_train - mean) / std
                    X_test_normalized = (X_test - mean) / std

                    encoder = OneHotEncoder()
                    Y_train_encoded = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()

                    model = NeuralNetwork(arch)
                    J = model.train(X_train_normalized, Y_train_encoded, learning_rate=learning_rate, lam=lam, max_iterations=max_iterations, epsilon=epsilon)
                    J, accuracy, f1_score = model.evaluate(X_test_normalized, y_test, J)
                    accuracy_list.append(accuracy)
                    f1_score_list.append(f1_score)
                    J_list.append(J)

                mean_accuracy = np.mean(accuracy_list)
                mean_f1_score = np.mean(f1_score_list)
                mean_J_cost = np.mean(J_list)

                results_accuracy[(str(arch), lam)] = mean_accuracy
                results_f1_score[(str(arch), lam)] = mean_f1_score
                results_J_cost[(str(arch), lam)] = mean_J_cost

        return results_accuracy, results_f1_score, results_J_cost

# Load dataset
df_house_votes = pd.read_csv("/Users/noshitha/Downloads/hw4/datasets/hw3_house_votes_84.csv", delimiter=",")

# Extract features and target variable
X = df_house_votes.drop(columns=['class'])  # Features
y = df_house_votes['class']  # Target variable

# One-hot encode categorical variables
encoder = OneHotEncoder()
Y_encoded = encoder.fit_transform(y.values.reshape(-1, 1)).toarray()

# Convert sparse matrix to DataFrame
X_encoded = encoder.fit_transform(X)
X_house_votes = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names(X.columns))

# Define model architectures and regularization parameters
architectures = [
    [X_house_votes.shape[1], 2, 2, Y_encoded.shape[1]],  # Architecture with fewer neurons
    [X_house_votes.shape[1], 5, 4, Y_encoded.shape[1]],  # Architecture with a moderate number of neurons
    [X_house_votes.shape[1], 10, 8, Y_encoded.shape[1]],  # Architecture with a higher number of neurons
    [X_house_votes.shape[1], 5, 2, Y_encoded.shape[1]],  # Architecture with fewer layers
    [X_house_votes.shape[1], 5, 5, Y_encoded.shape[1]],  # Architecture with more layers
    [X_house_votes.shape[1], 10, 8, 5, Y_encoded.shape[1]]  # Architecture with more layers and neurons
]

regularization_params = [0.01, 0.1, 1.0]  # Example regularization parameters

# Perform stratified k-fold cross-validation
results_accuracy, results_f1_score, results_J_cost = NeuralNetwork.k_fold_cross_validation(X_house_votes, y, architectures, regularization_params, learning_rate=0.01, max_iterations=1000, epsilon=0.005)

# Convert the results into DataFrames for tabular representation
accuracy_df = pd.DataFrame(list(results_accuracy.items()), columns=['Architecture, Lambda', 'Mean Accuracy'])
f1_score_df = pd.DataFrame(list(results_f1_score.items()), columns=['Architecture, Lambda', 'Mean F1 Score'])
J_cost_df = pd.DataFrame(list(results_J_cost.items()), columns=['Architecture, Lambda', 'Mean J Cost'])

print("Mean Accuracy Results:")
print(accuracy_df)
print("\nMean F1 Score Results:")
print(f1_score_df)
print("\nMean J cost Results:")
print(J_cost_df)


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

# Load dataset
df_house_votes = pd.read_csv("/Users/noshitha/Downloads/hw4/datasets/hw3_house_votes_84.csv", delimiter=",")

# Extract features and target variable
X = df_house_votes.drop(columns=['class'])  # Features
y = df_house_votes['class']  # Target variable

# One-hot encode categorical variables
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)


# Convert sparse matrix to DataFrame
X_house_votes = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names(X.columns))
y_house_votes = df_house_votes['class']
# Define model architectures and regularization parameters
architectures = [
    [X_house_votes.shape[1], 2, 2, 2],  # Architecture with fewer neurons
    [X_house_votes.shape[1], 5, 4, 2],  # Architecture with a moderate number of neurons
    [X_house_votes.shape[1], 10, 8, 2],  # Architecture with a higher number of neurons
    [X_house_votes.shape[1], 5, 2],  # Architecture with fewer layers
    [X_house_votes.shape[1], 5, 5, 2],  # Architecture with more layers
    [X_house_votes.shape[1], 10, 8, 5, 2]  # Architecture with more layers and neurons
]

regularization_params = [0.01, 0.1, 1.0]  # Example regularization parameters

# Initialize lists to store results
results_accuracy = {}
results_f1_score = {}
results_J_cost = {}

# Perform stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for arch in architectures:
    for lam in regularization_params:
        accuracy_list = []
        f1_score_list = []
        J_list = []
        for train_index, test_index in skf.split(X_house_votes, y_house_votes):
            X_train, X_test = X_house_votes.iloc[train_index], X_house_votes.iloc[test_index]
            y_train, y_test = y_house_votes.iloc[train_index], y_house_votes.iloc[test_index]

            # One-hot encode y_train
            encoder = OneHotEncoder()
            y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1)).toarray()

            
            model = NeuralNetwork(arch)
            J = model.train(X_train, y_train_encoded, learning_rate=0.01, lam=lam, max_iterations=1000, epsilon=0.005)
            J, accuracy, f1_score = model.evaluate(X_test, y_test, J)
            accuracy_list.append(accuracy)
            f1_score_list.append(f1_score)
            J_list.append(J)

        mean_accuracy = np.mean(accuracy_list)
        mean_f1_score = np.mean(f1_score_list)
        mean_J_cost = np.mean(J_list)

        results_accuracy[(str(arch), lam)] = mean_accuracy
        results_f1_score[(str(arch), lam)] = mean_f1_score
        results_J_cost[(str(arch), lam)] = mean_J_cost


# Convert the results into a DataFrame for tabular representation
accuracy_df = pd.DataFrame(list(results_accuracy.items()), columns=['Architecture, Lambda', 'Mean Accuracy'])
f1_score_df = pd.DataFrame(list(results_f1_score.items()), columns=['Architecture, Lambda', 'Mean F1 Score'])
J_cost_df = pd.DataFrame(list(results_J_cost.items()), columns=['Architecture, Lambda', 'Mean J Cost'])

print("Mean Accuracy Results:")
print(accuracy_df)
print("\nMean F1 Score Results:")
print(f1_score_df)
print("\nMean J cost Results:")
print(J_cost_df)

In [None]:
Regularization Parameter:
Changing the regularization parameter had a significant impact on performance. Lower regularization parameters generally led to better performance, as they allow the model to fit the training data more closely. However, extremely low regularization might result in overfitting, reducing performance on unseen data.
Adding More Layers:
Adding more layers sometimes improved performance, especially when the added layers captured important patterns in the data. However, excessively deep networks might suffer from vanishing gradients or overfitting, leading to poorer performance. It's important to find the right balance between depth and complexity.
Deeper Networks with Many Layers but Few Neurons per Layer:
Deeper networks with few neurons per layer might help capture hierarchical features in the data. However, if each layer has too few neurons, the network might struggle to learn complex patterns, leading to underfitting.
Designing Networks with Few Layers but Many Neurons per Layer:
Networks with few layers but many neurons per layer might quickly learn complex patterns due to the high representational capacity of each layer. However, they might also overfit the training data if not regularized properly.
Patterns:
The performance improvement due to increasing complexity (more layers or neurons) is not linear. There's a point of diminishing returns where adding more complexity no longer improves performance and might even degrade it due to overfitting.
Regularization plays a crucial role in preventing overfitting, especially as models become more complex. It helps generalize better to unseen data by penalizing overly complex models.
Selection of Neural Network Architecture:
Given the observations, a neural network architecture that strikes a balance between depth and width, along with moderate regularization, would be preferred for deployment in real life.
I would select an architecture that has demonstrated consistently high performance across different regularization parameters, such as [13, 5, 4, 3] with a moderate regularization parameter around 0.01.
This architecture shows good generalization performance without overly complicating the model, making it more robust for real-world applications.
Regarding constructing larger networks, there's indeed a point where increasing complexity no longer improves performance. Beyond this point, larger networks might suffer from overfitting or computational inefficiency without providing significant performance gains. It's essential to balance model complexity with generalization performance and computational resources.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Function to generate mini-batches
def generate_mini_batches(X, y, batch_size):
    num_samples = X.shape[0]
    num_batches = num_samples // batch_size
    mini_batches = []
    shuffled_indices = np.random.permutation(num_samples)
    X_shuffled = X[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        mini_batches.append((X_shuffled[start_idx:end_idx], y_shuffled[start_idx:end_idx]))
    if num_samples % batch_size != 0:
        mini_batches.append((X_shuffled[num_batches*batch_size:], y_shuffled[num_batches*batch_size:]))
    return mini_batches


def train_mini_batch(X_train, y_train, X_test, y_test, model, learning_rate, batch_size, max_iterations, epsilon):
    training_errors = []
    testing_errors = []
    for iteration in range(max_iterations):
        mini_batches = generate_mini_batches(X_train, y_train, batch_size)
        for mini_batch in mini_batches:
            X_mini_batch, y_mini_batch = mini_batch
            J = model.train(X_mini_batch, y_mini_batch, learning_rate=learning_rate, lam=lam, max_iterations=1, epsilon=epsilon)
        training_cost = np.mean(np.square(model.forward_pass(X_train)[-1] - y_train))  # Compute training cost
        testing_cost = np.mean(np.square(model.forward_pass(X_test)[-1] - y_test))  # Compute testing cost
        training_errors.append(training_cost)
        testing_errors.append(testing_cost)
        print(f"Iteration {iteration+1}, Training Cost: {training_cost}, Testing Cost: {testing_cost}")
        # Check for convergence
        if training_cost < epsilon:
            print(f"Converged at training cost :{training_cost} while Epsilon:{epsilon} ")
            break
    return training_errors, testing_errors

# Plot learning curve
def plot_learning_curve(training_errors, testing_errors, step_size):
    iterations = range(1, len(training_errors) + 1)
    plt.plot(iterations, training_errors, label='Training Error')
    plt.plot(iterations, testing_errors, label='Testing Error')
    plt.title('Learning Curve')
    plt.xlabel('Number of Training Examples')
    plt.ylabel('Error (J)')
    plt.xticks(np.arange(1, len(training_errors) + 1, step=step_size))
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define your neural network model and parameters
model = NeuralNetwork([X_wine.shape[1], 5, 4, 3])  # Your desired architecture
learning_rate = 0.01
batch_size = 32
max_iterations = 1000
epsilon = 0.005

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_wine, y_encoded, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Train the model using mini-batch gradient descent
training_errors, testing_errors = train_mini_batch(X_train_normalized, y_train, X_test_normalized, y_test, model, learning_rate, batch_size, max_iterations, epsilon)


In [None]:
# Plot the learning curve
step_size = 50  # Adjust as needed
plot_learning_curve(training_errors, testing_errors, step_size)