In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import time
from tabulate import tabulate

# --- Data ingestion and preprocessing ---
# Read the dataset from a CSV file given by a relative path.
pima_df = pd.read_csv("week_11_data_pima.csv")

# The 'outcome' column is the label; every other column is a feature.
y = pima_df['outcome']
X = pima_df.drop(columns='outcome')

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both the training and the validation rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Experiment grid: each architecture is paired with each requested
# training-set size (architectures vary slowest, sizes fastest).
_DATA_SIZES = (1000, 10000, 100000)
_ARCHITECTURES = (
    ((4,), '1 hidden layer 4 nodes'),
    ((4, 4), '2 hidden layers of 4 nodes each'),
)

configs = [
    {'data_size': size, 'hidden_layers': layers, 'name': label}
    for layers, label in _ARCHITECTURES
    for size in _DATA_SIZES
]

# Single seed shared by the subset sampling and the model initialisation so a
# re-run of this cell reproduces the same table (timings aside).
RANDOM_SEED = 42

# One row per configuration: [data size, name, train error, val error, time].
results = []

for config in configs:
    # The requested size is capped at the number of training rows available,
    # so the subset can never be larger than the training set itself.
    n_available = len(X_train_scaled)
    train_size = min(config['data_size'], n_available)

    if train_size < n_available:
        # Re-create the generator with the same seed for every configuration:
        # configurations that request the same size then train on the *same*
        # rows, so differences between architectures are not confounded by a
        # different random subset.
        rng = np.random.default_rng(RANDOM_SEED)
        indices = rng.choice(n_available, size=train_size, replace=False)
        X_train_subset = X_train_scaled[indices]
        y_train_subset = y_train.iloc[indices]
    else:
        # Requested size >= available rows: train on the full training set.
        X_train_subset = X_train_scaled
        y_train_subset = y_train

    # Create the model for this configuration
    model = MLPClassifier(
        hidden_layer_sizes=config['hidden_layers'],
        activation='relu',  # Using ReLU activation
        solver='adam',
        max_iter=500,
        random_state=RANDOM_SEED,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10
    )

    # Time only the fit call; perf_counter is a monotonic high-resolution
    # clock, so the measurement is not affected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train_subset, y_train_subset)
    execution_time = time.perf_counter() - start_time

    # Make predictions on the rows trained on and on the held-out rows
    train_pred = model.predict(X_train_subset)
    val_pred = model.predict(X_val_scaled)

    # Error rate is reported as 1 - accuracy
    train_error = 1 - accuracy_score(y_train_subset, train_pred)
    val_error = 1 - accuracy_score(y_val, val_pred)

    # NOTE: the first column is the *requested* size; when it exceeds the
    # number of available training rows the model was fit on fewer rows.
    results.append([
        config['data_size'],
        config['name'],
        f"{train_error:.4f}",
        f"{val_error:.4f}",
        f"{execution_time:.3f}s"
    ])

# Render the collected result rows as a grid-style text table and print it.
headers = [
    "Data size",
    "Configuration",
    "Training error",
    "Validation error",
    "Time of execution",
]
table_text = tabulate(results, headers=headers, tablefmt="grid")
print(table_text)

+-------------+---------------------------------+------------------+--------------------+---------------------+
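|   Data size | Configuration                   |   Training error |   Validation error | Time of execution   |
+=============+=================================+==================+====================+=====================+
|        1000 | 1 hidden layer 4 nodes          |           0.238  |             0.2429 | 0.071s              |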
+-------------+---------------------------------+------------------+--------------------+---------------------+
|       10000 | 1 hidden layer 4 nodes          |           0.0128 |             0.0142 | 0.495s              |
+-------------+---------------------------------+------------------+--------------------+---------------------+
|      100000 | 1 hidden layer 4 nodes          |           0.0006 |             0.0006 | 6.255s              |
+-------------+---------------------------------+------------------+--------------------+---------------------+
|        1000 | 2 hidden layers of 4 nodes each |           0.246  |             0.2383 | 0.065s        