In [54]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers
        self.weights = [np.random.randn(layers[i], layers[i+1]) for i in range(len(layers)-1)]
        self.biases = [np.zeros((1, layers[i+1])) for i in range(len(layers)-1)]
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)
    
    def forward_pass(self, X):
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            activations.append(self.sigmoid(z))
        return activations
    
    def backward_pass(self, X, Y, activations):
        deltas = [(activations[-1] - Y) * self.sigmoid_derivative(activations[-1])]
        for i in range(len(self.layers) - 2, 0, -1):
            delta = np.dot(deltas[0], self.weights[i].T) * self.sigmoid_derivative(activations[i])
            deltas.insert(0, delta)
        return deltas
    
    
    def compute_gradients(self, activations, deltas):
        gradients_weights = [np.dot(activations[i].T, deltas[i]) for i in range(len(self.layers) - 1)]
        gradients_biases = [np.sum(deltas[i], axis=0) for i in range(len(self.layers) - 1)]
        return gradients_weights, gradients_biases
    
    def update_weights(self, gradients_weights, gradients_biases, learning_rate):
        self.weights = [w - learning_rate * gw for w, gw in zip(self.weights, gradients_weights)]
        self.biases = [b - learning_rate * gb for b, gb in zip(self.biases, gradients_biases)]
    
    def train(self, X, Y, learning_rate, lam, max_iterations, epsilon):
        for iteration in range(max_iterations):
            activations = self.forward_pass(X)
            deltas = self.backward_pass(X, Y, activations)
            gradients_weights, gradients_biases = self.compute_gradients(activations, deltas)
            self.update_weights(gradients_weights, gradients_biases, learning_rate)
            # Compute cost function
            J = np.mean(np.square(activations[-1] - Y))
            #print(f"Iteration {iteration+1}, Cost: {J}")
            # Check for convergence
            if J < epsilon:
                print(f"Converged at cost :{J} while Epsilon:{epsilon} and Iteration: {iteration+1} ")
                return J
        return J
            

    def accuracy(self, y_true, y_pred):
        correct = np.sum(np.all(y_true == y_pred, axis=1))
        return correct / len(y_true)


    def f1_score(self, y_true, y_pred):
        tp = np.sum(np.logical_and(y_true, y_pred))
        fp = np.sum(np.logical_and(np.logical_not(y_true), y_pred))
        fn = np.sum(np.logical_and(y_true, np.logical_not(y_pred)))
        precision = (tp) / (tp + fp) if (tp + fp) > 0 else 0
        recall = (tp) / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return f1


    def evaluate(self, X_test, y_test, J):
        activations = self.forward_pass(X_test)[-1]
        y_pred = (activations > 0.5).astype(int)
        acc = self.accuracy(y_test, y_pred)
        f1 = self.f1_score(y_test, y_pred)
        return J, acc, f1
    
    def k_fold_cross_validation(X, y, architectures, regularization_params, learning_rate, max_iterations, epsilon):
        results_accuracy = {}
        results_f1_score = {}
        results_J_cost = {}
        
        num_splits = 10
        fold_size = len(X) // num_splits

        for arch in architectures:
            for lam in regularization_params:
                accuracy_list = []
                f1_score_list = []
                J_list = []
                
                for i in range(num_splits):
                    start = i * fold_size
                    end = (i + 1) * fold_size
                    
                    X_train = pd.concat([X[:start], X[end:]])
                    y_train = np.concatenate([y[:start], y[end:]])
                    X_test = X[start:end]
                    y_test = y[start:end]


                    model = NeuralNetwork(arch)
                    J = model.train(X_train, y_train, learning_rate=learning_rate, lam=lam, max_iterations=max_iterations, epsilon=epsilon)
                    J, accuracy, f1_score = model.evaluate(X_test, y_test, J)
                    accuracy_list.append(accuracy)
                    f1_score_list.append(f1_score)
                    J_list.append(J)

                mean_accuracy = np.mean(accuracy_list)
                mean_f1_score = np.mean(f1_score_list)
                mean_J_cost   = np.mean(J_list)

                results_accuracy[(str(arch), lam)] = mean_accuracy
                results_f1_score[(str(arch), lam)] = mean_f1_score
                results_J_cost[(str(arch), lam)] = mean_J_cost

        return results_accuracy, results_f1_score, results_J_cost


df_titanic = pd.read_csv("/Users/noshitha/Downloads/final_project/titanic.csv", delimiter=",")

# Shuffle the dataset
df_titanic_shuffle = shuffle(df_titanic)

# Split the dataset into features and target variable
X_df = df_titanic_shuffle.drop(columns=['Survived'])

# Convert categorical columns to categorical data type
categorical_cols = ['Pclass', 'Siblings/Spouses Aboard', 'Parents/Children Aboard']
for col in categorical_cols:
    X_df[col] = X_df[col].astype('category')

# Encoding on X:
X_categorical = pd.get_dummies(X_df[['Pclass', 'Sex', 'Siblings/Spouses Aboard', 'Parents/Children Aboard']])
X_categorical['Name'] = X_df['Name']  # Add Name column to X_categorical
X_numerical = X_df[['Name', 'Age', 'Fare']]

X_numerical_normal = X_numerical[['Age', 'Fare']]
mean = np.mean(X_numerical_normal, axis=0)
std = np.std(X_numerical_normal, axis=0)
X_numerical_normalized = (X_numerical_normal - mean) / std

# Combine normalized numerical columns with categorical columns
X_combined = pd.concat([X_numerical[['Name']], X_numerical_normalized], axis=1)

merged_df = pd.merge(X_categorical, X_combined, on='Name')
X = merged_df.drop(columns=['Name'])
y = df_titanic_shuffle['Survived']

# Normalize data
y_resized = y.values.reshape(-1, 1)

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the target variable
y_encoded = encoder.fit_transform(y_resized)

# Define model architectures and regularization parameters
architectures = [ 
      [X.shape[1], 16, y_encoded.shape[1]],
      [X.shape[1], 32, y_encoded.shape[1]],
      [X.shape[1], 16, 8, y_encoded.shape[1]]
#     [X.shape[1], 16, 8, y_encoded.shape[1]],
#     [X.shape[1], 32, 16, 8, y_encoded.shape[1]],
#     [X.shape[1], 64, 32, 16, 8, y_encoded.shape[1]]
]

regularization_params = [0.01, 0.10, 0.25, 0.50, 0.75] 


# Initialize lists to store results
results_accuracy = {}
results_f1_score = {}
results_J_cost = {}

# Perform stratified k-fold cross-validation
results_accuracy, results_f1_score, results_J_cost = NeuralNetwork.k_fold_cross_validation(X, y_encoded, architectures, regularization_params, learning_rate=0.01 , max_iterations=3500, epsilon=0.005)

# Convert the results into a DataFrame for tabular representation
accuracy_df = pd.DataFrame(list(results_accuracy.items()), columns=['Architecture, Lambda', 'Mean Accuracy'])
f1_score_df = pd.DataFrame(list(results_f1_score.items()), columns=['Architecture, Lambda', 'Mean F1 Score'])
J_cost_df = pd.DataFrame(list(results_J_cost.items()), columns=['Architecture, Lambda', 'Mean J Cost'])

print("Mean Accuracy Results for titanic DATASET:")
print(accuracy_df)
print("\nMean F1 Score Results for titanic DATASET:")
print(f1_score_df)

Mean Accuracy Results for titanic DATASET:
      Architecture, Lambda  Mean Accuracy
0      ([21, 16, 2], 0.01)       0.796591
1       ([21, 16, 2], 0.1)       0.806818
2      ([21, 16, 2], 0.25)       0.798864
3       ([21, 16, 2], 0.5)       0.806818
4      ([21, 16, 2], 0.75)       0.792045
5      ([21, 32, 2], 0.01)       0.811364
6       ([21, 32, 2], 0.1)       0.796591
7      ([21, 32, 2], 0.25)       0.803409
8       ([21, 32, 2], 0.5)       0.804545
9      ([21, 32, 2], 0.75)       0.798864
10  ([21, 16, 8, 2], 0.01)       0.795455
11   ([21, 16, 8, 2], 0.1)       0.794318
12  ([21, 16, 8, 2], 0.25)       0.801136
13   ([21, 16, 8, 2], 0.5)       0.801136
14  ([21, 16, 8, 2], 0.75)       0.794318

Mean F1 Score Results for titanic DATASET:
      Architecture, Lambda  Mean F1 Score
0      ([21, 16, 2], 0.01)       0.798600
1       ([21, 16, 2], 0.1)       0.806818
2      ([21, 16, 2], 0.25)       0.800239
3       ([21, 16, 2], 0.5)       0.806818
4      ([21, 16, 2], 0.75)     

In [55]:
J_cost_df

Unnamed: 0,"Architecture, Lambda",Mean J Cost
0,"([21, 16, 2], 0.01)",0.092025
1,"([21, 16, 2], 0.1)",0.092037
2,"([21, 16, 2], 0.25)",0.091338
3,"([21, 16, 2], 0.5)",0.091675
4,"([21, 16, 2], 0.75)",0.092553
5,"([21, 32, 2], 0.01)",0.089125
6,"([21, 32, 2], 0.1)",0.090412
7,"([21, 32, 2], 0.25)",0.090392
8,"([21, 32, 2], 0.5)",0.090946
9,"([21, 32, 2], 0.75)",0.090463


In [52]:
J_cost_df

Unnamed: 0,"Architecture, Lambda",Mean J Cost
0,"([21, 16, 2], 0.01)",0.08806
1,"([21, 16, 2], 0.1)",0.08783
2,"([21, 16, 2], 0.25)",0.08746
3,"([21, 16, 2], 0.5)",0.086518
4,"([21, 16, 2], 0.75)",0.087575


In [50]:
# Merge accuracy and f1_score DataFrames on 'Architecture, Lambda'
merged_df = pd.merge(accuracy_df, f1_score_df, on='Architecture, Lambda')

# Merge the merged DataFrame with J_cost_df on 'Architecture, Lambda'
final_df = pd.merge(merged_df, J_cost_df, on='Architecture, Lambda')

# Rename columns for clarity
merged_df.columns = ['Architecture, Lambda', 'Mean Accuracy', 'Mean F1 Score']
merged_df

Unnamed: 0,"Architecture, Lambda",Mean Accuracy,Mean F1 Score
0,"([21, 16, 2], 0.01)",0.8125,0.813674
1,"([21, 16, 2], 0.1)",0.811364,0.813797
2,"([21, 16, 2], 0.25)",0.806818,0.807927
3,"([21, 16, 2], 0.5)",0.804545,0.804545
4,"([21, 16, 2], 0.75)",0.801136,0.80274


In [None]:
## 2500

In [47]:
# Merge accuracy and f1_score DataFrames on 'Architecture, Lambda'
merged_df = pd.merge(accuracy_df, f1_score_df, on='Architecture, Lambda')

# Merge the merged DataFrame with J_cost_df on 'Architecture, Lambda'
final_df = pd.merge(merged_df, J_cost_df, on='Architecture, Lambda')

# Rename columns for clarity
merged_df.columns = ['Architecture, Lambda', 'Mean Accuracy', 'Mean F1 Score']
merged_df

Unnamed: 0,"Architecture, Lambda",Mean Accuracy,Mean F1 Score
0,"([21, 16, 2], 0.01)",0.815909,0.816999
1,"([21, 16, 2], 0.1)",0.810227,0.814172
2,"([21, 16, 2], 0.25)",0.822727,0.826321
3,"([21, 16, 2], 0.5)",0.801136,0.804181
4,"([21, 16, 2], 0.75)",0.802273,0.804069
5,"([21, 16, 8, 2], 0.01)",0.806818,0.808859
6,"([21, 16, 8, 2], 0.1)",0.810227,0.810708
7,"([21, 16, 8, 2], 0.25)",0.802273,0.802273
8,"([21, 16, 8, 2], 0.5)",0.809091,0.811195
9,"([21, 16, 8, 2], 0.75)",0.811364,0.813102


## TO BE RUN

In [56]:
import numpy as np
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Function to generate mini-batches
def generate_mini_batches(X, y, batch_size):
    num_samples = X.shape[0]
    num_batches = num_samples // batch_size
    mini_batches = []
    shuffled_indices = np.random.permutation(num_samples)
    X_shuffled = X[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        mini_batches.append((X_shuffled[start_idx:end_idx], y_shuffled[start_idx:end_idx]))
    if num_samples % batch_size != 0:
        mini_batches.append((X_shuffled[num_batches*batch_size:], y_shuffled[num_batches*batch_size:]))
    return mini_batches


def train_mini_batch(X_train, y_train, model, learning_rate, batch_size, max_iterations, epsilon):
    training_errors = []
    for iteration in range(max_iterations):
        mini_batches = generate_mini_batches(X_train, y_train, batch_size)
        for mini_batch in mini_batches:
            X_mini_batch, y_mini_batch = mini_batch
            J = model.train(X_mini_batch, y_mini_batch, learning_rate=learning_rate, lam=0.25, max_iterations=2000, epsilon=epsilon)
        training_cost = np.mean(np.square(model.forward_pass(X_train)[-1] - y_train))  # Compute training cost
        training_errors.append(training_cost)
        #print(f"Iteration {iteration+1}, Training Cost: {training_cost}")
#         # Check for convergence
#         if training_cost < epsilon:
#             print(f"Converged at training cost :{training_cost} while Epsilon:{epsilon} ")
#             break
    return training_errors

# Plot learning curve
def plot_learning_curve(training_errors, step_size):
    iterations = range(1, len(training_errors) + 1)
    plt.plot(iterations, training_errors, label='Training Error')
    plt.title('Learning Curve')
    plt.xlabel('Number of Training Samples')
    plt.ylabel('J values')
    plt.xticks(np.arange(1, len(training_errors) + 1, step=step_size))
    plt.legend()
    plt.grid(True)
    plt.show()
    

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Define your neural network model and parameters
model = NeuralNetwork([X.shape[1], 16, y_encoded.shape[1]])  # Your desired architecture
learning_rate = 0.01
batch_size = 200
max_iterations = 2000
epsilon = 0.005


# Standardize features
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X_normalized = (X - mean) / std

print("X_normalized: ",X_normalized)

# Train the model using mini-batch gradient descent
training_errors = train_mini_batch(X_normalized.to_numpy(), y_encoded, model, learning_rate, batch_size, max_iterations, epsilon)


X_normalized:       Pclass_1  Pclass_2  Pclass_3  Sex_female  Sex_male  \
0    1.762521 -0.511601 -1.103404    1.350867 -1.350867   
1   -0.567369 -0.511601  0.906287   -0.740266  0.740266   
2   -0.567369 -0.511601  0.906287    1.350867 -1.350867   
3   -0.567369  1.954649 -1.103404    1.350867 -1.350867   
4   -0.567369 -0.511601  0.906287   -0.740266  0.740266   
..        ...       ...       ...         ...       ...   
882  1.762521 -0.511601 -1.103404    1.350867 -1.350867   
883  1.762521 -0.511601 -1.103404   -0.740266  0.740266   
884  1.762521 -0.511601 -1.103404    1.350867 -1.350867   
885 -0.567369  1.954649 -1.103404    1.350867 -1.350867   
886 -0.567369 -0.511601  0.906287    1.350867 -1.350867   

     Siblings/Spouses Aboard_0  Siblings/Spouses Aboard_1  \
0                     0.684502                  -0.555211   
1                     0.684502                  -0.555211   
2                    -1.460916                   1.801116   
3                     0.684502  

In [None]:
# Plot the learning curve
step_size = 50
plot_learning_curve(training_errors, step_size)

[0.00015101434080728903,
 0.00014564750486419187,
 0.00016243411763689034,
 0.00018351094106995662,
 0.0001413664182069319,
 0.00013322736356195818,
 0.00017575799789746653,
 0.00013948123255833349,
 0.00018315874616125564,
 0.00012870063768131496,
 0.00013795432716112564,
 0.0001393586294228058,
 0.00012318399562895077,
 0.00010988780620345019,
 0.0001030493799800052,
 0.00011024114150019828,
 0.00010598941573792663,
 0.00011083845621076925,
 0.00012560908322939814,
 0.00011637277392867296,
 0.00012536196816982664,
 0.0001250685602092324,
 0.00013334932928829788,
 0.00012595186020250172,
 0.00011520652920928845,
 0.00011331215812835182,
 9.930537494513483e-05,
 0.00010315016289039754,
 0.00010781425867415212,
 0.00010717361955583808,
 0.00014144453398208447,
 9.658112879920119e-05,
 0.00010536519058408826,
 0.0001189808708136688,
 0.00016506114135395007,
 9.39796674996747e-05,
 0.0001189468667140917,
 9.824743200639272e-05,
 0.00011384312583797597,
 0.00010650462579682854,
 0.00012814