In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import os
import pickle


plt.rcParams['figure.dpi'] = 150
plt.rcParams['axes.grid'] = True
plt.style.use('seaborn-v0_8-darkgrid')
np.set_printoptions(suppress=True)


In [11]:
try:
    df = pd.read_csv('diabetes_noisy.csv')
except FileNotFoundError:
    print("Error: 'diabetes_noisy.csv' not found.")
    raise

df = df.dropna().reset_index(drop=True)

In [12]:

y = df['Outcome'].values
X_df = df.drop(columns=['Outcome'])


numeric_cols = X_df.columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols)
])


X = preprocessor.fit_transform(X_df).astype(np.float64)


X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print(f"Positive rate: {y.mean():.4f}\n")

Train: (537, 8), Val: (115, 8), Test: (116, 8)
Positive rate: 0.3490



In [13]:
class LogisticRegression:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features, dtype=np.float64)
        self.bias = 0.0

    def sigmoid(self, z):
        z = np.clip(z, -500, 500)
        return 1.0/(1.0 + np.exp(-z))

    def forward(self, X):
        return self.sigmoid(X @ self.weights + self.bias)

    def compute_loss(self, y_true, y_pred):
        eps = 1e-15
        return -np.mean(y_true*np.log(y_pred+eps) + (1-y_true)*np.log(1-y_pred+eps))

    def compute_gradients(self, X, y_true, y_pred):
        m = X.shape[0]
        err = y_pred - y_true
        dw = (X.T @ err)/m
        db = np.mean(err)
        return dw, db

    def predict(self, X):
        return (self.forward(X) > 0.5).astype(int)

    def evaluate(self, X, y_true):
        y_prob = self.forward(X)
        y_pred = (y_prob > 0.5).astype(int)
        return {
            'loss': self.compute_loss(y_true, y_prob),
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1': f1_score(y_true, y_pred, zero_division=0),
            'roc_auc': roc_auc_score(y_true, y_prob),
        }

In [14]:
class SGDOptimizer:
    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate
    def update(self, w, b, dw, db):
        return w - self.lr*dw, b - self.lr*db

class MomentumOptimizer:
    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.m = momentum
        self.vw = None
        self.vb = 0.0
    def update(self, w, b, dw, db):
        if self.vw is None:
            self.vw = np.zeros_like(w)
        self.vw = self.m*self.vw - self.lr*dw
        self.vb = self.m*self.vb - self.lr*db
        return w + self.vw, b + self.vb

class NesterovOptimizer:
    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.m = momentum
        self.vw = None
        self.vb = 0.0

    def update(self, w, b, dw, db):
        if self.vw is None:
            self.vw = np.zeros_like(w)

        self.vw = self.m * self.vw - self.lr * dw
        self.vb = self.m * self.vb - self.lr * db

        w_next = w + (self.m * self.vw - self.lr * dw)
        b_next = b + (self.m * self.vb - self.lr * db)

        return w_next, b_next

class AdagradOptimizer:
    def __init__(self, learning_rate=0.01, epsilon=1e-8):
        self.lr = learning_rate
        self.eps = epsilon
        self.gs_w = None
        self.gs_b = 0.0
    def update(self, w, b, dw, db):
        if self.gs_w is None:
            self.gs_w = np.zeros_like(w)
        self.gs_w += dw**2
        self.gs_b += db**2
        return (w - self.lr*dw/(np.sqrt(self.gs_w)+self.eps),
                b - self.lr*db/(np.sqrt(self.gs_b)+self.eps))

class RMSpropOptimizer:
    def __init__(self, learning_rate=0.001, decay=0.9, epsilon=1e-8):
        self.lr = learning_rate
        self.decay = decay
        self.eps = epsilon
        self.ms_w = None
        self.ms_b = 0.0
    def update(self, w, b, dw, db):
        if self.ms_w is None:
            self.ms_w = np.zeros_like(w)
        self.ms_w = self.decay*self.ms_w + (1-self.decay)*(dw**2)
        self.ms_b = self.decay*self.ms_b + (1-self.decay)*(db**2)
        return (w - self.lr*dw/(np.sqrt(self.ms_w)+self.eps),
                b - self.lr*db/(np.sqrt(self.ms_b)+self.eps))

class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.b1 = beta1
        self.b2 = beta2
        self.eps = epsilon
        self.mw = None
        self.vw = None
        self.mb = 0.0
        self.vb = 0.0
        self.t = 0
    def update(self, w, b, dw, db):
        if self.mw is None:
            self.mw = np.zeros_like(w)
            self.vw = np.zeros_like(w)
        self.t += 1
        self.mw = self.b1*self.mw + (1-self.b1)*dw
        self.mb = self.b1*self.mb + (1-self.b1)*db
        self.vw = self.b2*self.vw + (1-self.b2)*(dw**2)
        self.vb = self.b2*self.vb + (1-self.b2)*(db**2)
        mw_hat = self.mw/(1 - self.b1**self.t)
        mb_hat = self.mb/(1 - self.b1**self.t)
        vw_hat = self.vw/(1 - self.b2**self.t)
        vb_hat = self.vb/(1 - self.b2**self.t)
        return (w - self.lr*mw_hat/(np.sqrt(vw_hat)+self.eps),
                b - self.lr*mb_hat/(np.sqrt(vb_hat)+self.eps))


In [15]:
@dataclass
class TrainConfig:
    epochs: int = 50
    batch_size: int = 32
    seed: int = 42
    l2_lambda: float = 0.0

def set_seed(seed=42):
    np.random.seed(seed)

def train_epoch(model, X_tr, y_tr, optimizer, cfg: TrainConfig, track_gradients=False):
    n = X_tr.shape[0]
    idx = np.random.permutation(n)
    total_loss = 0.0
    grad_norms = []
    bs = cfg.batch_size

    for i in range(0, n, bs):
        xb = X_tr[idx[i:i+bs]]
        yb = y_tr[idx[i:i+bs]]
        yp = model.forward(xb)
        base_loss = model.compute_loss(yb, yp)
        l2_pen = 0.5 * cfg.l2_lambda * np.dot(model.weights, model.weights)
        total_loss += (base_loss + l2_pen) * len(yb)

        dw, db = model.compute_gradients(xb, yb, yp)
        if cfg.l2_lambda > 0.0:
            dw = dw + cfg.l2_lambda * model.weights

        if track_gradients:
            grad_norms.append(np.linalg.norm(dw))

        model.weights, model.bias = optimizer.update(model.weights, model.bias, dw, db)

    if track_gradients:
        return total_loss / n, np.mean(grad_norms)
    return total_loss / n

def train_model_detailed(model, optimizer, X_tr, y_tr, X_va, y_va, cfg: TrainConfig):
    set_seed(cfg.seed)
    history = {
        'epoch': [], 'train_loss': [], 'val_loss': [],
        'val_acc': [], 'val_f1': [], 'val_roc_auc': [], 'grad_norm': []
    }

    for e in range(cfg.epochs):
        tr_loss, grad_norm = train_epoch(model, X_tr, y_tr, optimizer, cfg, track_gradients=True)
        val = model.evaluate(X_va, y_va)

        history['epoch'].append(e+1)
        history['train_loss'].append(tr_loss)
        history['val_loss'].append(val['loss'])
        history['val_acc'].append(val['accuracy'])
        history['val_f1'].append(val['f1'])
        history['val_roc_auc'].append(val['roc_auc'])
        history['grad_norm'].append(grad_norm)

    return pd.DataFrame(history)

In [16]:
def make_optimizer(name, lr=0.01, momentum=0.9):
    if name == 'SGD': return SGDOptimizer(lr)
    if name == 'Momentum': return MomentumOptimizer(lr, momentum)
    if name == 'Nesterov': return NesterovOptimizer(lr, momentum)
    if name == 'Adagrad': return AdagradOptimizer(lr)
    if name == 'RMSprop': return RMSpropOptimizer(lr)
    if name == 'Adam': return AdamOptimizer(lr)
    raise ValueError(name)


BEST_HPARAMS = {
    'Adam':     {'lr': 0.01,  'momentum': None},
    'Momentum': {'lr': 0.01,  'momentum': 0.95},
    'Nesterov': {'lr': 0.01,  'momentum': 0.95},
    'RMSprop':  {'lr': 0.001, 'momentum': None},
    'SGD':      {'lr': 0.1,   'momentum': None},
    'Adagrad':  {'lr': 0.03,  'momentum': None},
}

OPTIMIZER_NAMES = ['Adam', 'Nesterov', 'Momentum', 'RMSprop', 'SGD', 'Adagrad']


os.makedirs('pima_results', exist_ok=True)


In [17]:
convergence_histories = {}
convergence_times = {}

for name in OPTIMIZER_NAMES:
    print(f"Training {name}...")
    n_features = X_train.shape[1]
    lr = BEST_HPARAMS[name]['lr']
    mom = BEST_HPARAMS[name]['momentum']

    model = LogisticRegression(n_features)
    opt = make_optimizer(name, lr=lr, momentum=(mom if mom is not None else 0.9))
    cfg = TrainConfig(epochs=50, batch_size=32, seed=42)

    start = pd.Timestamp.now()
    hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
    elapsed = (pd.Timestamp.now() - start).total_seconds()

    convergence_histories[name] = hist_df
    convergence_times[name] = elapsed

    test_results = model.evaluate(X_test, y_test)
    print(f"  → F1 @ ep10: {hist_df.loc[9,'val_f1']:.4f}, ep30: {hist_df.loc[29,'val_f1']:.4f}, "
          f"ep50: {hist_df.loc[49,'val_f1']:.4f} | Test F1: {test_results['f1']:.4f} | Time: {elapsed:.2f}s")


fig, axes = plt.subplots(2, 2, figsize=(14, 10))
colors = plt.cm.tab10(range(len(OPTIMIZER_NAMES)))


ax = axes[0, 0]
for i, name in enumerate(OPTIMIZER_NAMES):
    ax.plot(convergence_histories[name]['epoch'],
            convergence_histories[name]['train_loss'],
            label=name, linewidth=2.5, color=colors[i], alpha=0.8)
ax.set_xlabel('Epoch', fontsize=11)
ax.set_ylabel('Training Loss', fontsize=11)
ax.set_title('Training Loss Convergence', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[0, 1]
for i, name in enumerate(OPTIMIZER_NAMES):
    ax.plot(convergence_histories[name]['epoch'],
            convergence_histories[name]['val_loss'],
            label=name, linewidth=2.5, color=colors[i], alpha=0.8)
ax.set_xlabel('Epoch', fontsize=11)
ax.set_ylabel('Validation Loss', fontsize=11)
ax.set_title('Validation Loss Convergence', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[1, 0]
for i, name in enumerate(OPTIMIZER_NAMES):
    ax.plot(convergence_histories[name]['epoch'],
            convergence_histories[name]['val_f1'],
            label=name, linewidth=2.5, color=colors[i], alpha=0.8)
ax.set_xlabel('Epoch', fontsize=11)
ax.set_ylabel('Validation F1 Score', fontsize=11)
ax.set_title('Validation F1 Convergence', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[1, 1]
for i, name in enumerate(OPTIMIZER_NAMES):
    ax.plot(convergence_histories[name]['epoch'],
            convergence_histories[name]['grad_norm'],
            label=name, linewidth=2.5, color=colors[i], alpha=0.8)
ax.set_xlabel('Epoch', fontsize=11)
ax.set_ylabel('Gradient Norm', fontsize=11)
ax.set_title('Gradient Stability', fontsize=12, fontweight='bold')
ax.set_yscale('log')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pima_results/1_convergence_analysis.png', dpi=200, bbox_inches='tight')
plt.close()


for name in OPTIMIZER_NAMES:
    convergence_histories[name].to_csv(
        f'pima_results/convergence_{name}.csv', index=False
    )

print("\n Convergence analysis complete!")


Training Adam...
  → F1 @ ep10: 0.5634, ep30: 0.5352, ep50: 0.5217 | Test F1: 0.5714 | Time: 0.57s
Training Nesterov...
  → F1 @ ep10: 0.5556, ep30: 0.5352, ep50: 0.5217 | Test F1: 0.5714 | Time: 0.55s
Training Momentum...
  → F1 @ ep10: 0.5556, ep30: 0.5352, ep50: 0.5217 | Test F1: 0.5806 | Time: 0.51s
Training RMSprop...
  → F1 @ ep10: 0.5263, ep30: 0.5676, ep50: 0.5676 | Test F1: 0.5538 | Time: 0.54s
Training SGD...
  → F1 @ ep10: 0.5429, ep30: 0.5217, ep50: 0.5217 | Test F1: 0.5625 | Time: 0.65s
Training Adagrad...
  → F1 @ ep10: 0.5867, ep30: 0.5429, ep50: 0.5429 | Test F1: 0.5846 | Time: 0.53s

✓ Convergence analysis complete!


In [19]:
def compute_auc_f1(history_df, max_epoch=50):

    return np.trapz(history_df['val_f1'][:max_epoch], dx=1)

early_stopping_results = []
for name in OPTIMIZER_NAMES:
    hist = convergence_histories[name]


    final_f1 = hist['val_f1'].iloc[-1]
    target_f1 = 0.95 * final_f1

    qualified_epochs = hist[hist['val_f1'] >= target_f1]['epoch']
    early_stop_epoch = qualified_epochs.iloc[0] if not qualified_epochs.empty else 50


    auc_f1 = compute_auc_f1(hist, 50)

    early_stopping_results.append({
        'Optimizer': name,
        'Final F1': final_f1,
        'F1 @ Epoch 10': hist.loc[9, 'val_f1'],
        'F1 @ Epoch 20': hist.loc[19, 'val_f1'],
        'Early Stop Epoch': early_stop_epoch,
        'AUC F1 (0-50)': auc_f1,
        'Time (sec)': convergence_times[name]
    })
    print(f"{name:9s}: Early stop @ ep {early_stop_epoch:2d}, AUC={auc_f1:.2f}, Final F1={final_f1:.4f}")

df_early_stop = pd.DataFrame(early_stopping_results)


fig, axes = plt.subplots(1, 3, figsize=(16, 5))


ax = axes[0]
bars = ax.bar(df_early_stop['Optimizer'], df_early_stop['Early Stop Epoch'],
              color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax.set_ylabel('Epochs to 95% of Final F1', fontsize=11)
ax.set_title('Early Stopping Comparison', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
for i, bar in enumerate(bars):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}', ha='center', va='bottom', fontsize=10, fontweight='bold')


ax = axes[1]
bars = ax.bar(df_early_stop['Optimizer'], df_early_stop['AUC F1 (0-50)'],
              color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax.set_ylabel('Area Under F1 Curve', fontsize=11)
ax.set_title('Overall Convergence Quality', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
for i, bar in enumerate(bars):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')


ax = axes[2]
bars = ax.bar(df_early_stop['Optimizer'], df_early_stop['Time (sec)'],
              color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax.set_ylabel('Training Time (seconds)', fontsize=11)
ax.set_title('Computational Efficiency', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
for i, bar in enumerate(bars):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('pima_results/2_early_stopping_analysis.png', dpi=200, bbox_inches='tight')
plt.close()

df_early_stop.to_csv('pima_results/early_stopping_results.csv', index=False)
print("\n Early stopping analysis complete!")


  return np.trapz(history_df['val_f1'][:max_epoch], dx=1)


Adam     : Early stop @ ep  1, AUC=26.18, Final F1=0.5217
Nesterov : Early stop @ ep  1, AUC=26.13, Final F1=0.5217
Momentum : Early stop @ ep  1, AUC=26.14, Final F1=0.5217
RMSprop  : Early stop @ ep  1, AUC=27.64, Final F1=0.5676
SGD      : Early stop @ ep  1, AUC=26.43, Final F1=0.5217
Adagrad  : Early stop @ ep  1, AUC=27.35, Final F1=0.5429

✓ Early stopping analysis complete!


In [20]:
lr_multipliers = [0.1, 0.5, 1.0, 2.0, 5.0]
lr_sensitivity_results = []

for name in OPTIMIZER_NAMES:
    print(f"Testing {name}...")
    base_lr = BEST_HPARAMS[name]['lr']
    mom = BEST_HPARAMS[name]['momentum']

    for mult in lr_multipliers:
        lr = base_lr * mult
        n_features = X_train.shape[1]
        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(mom if mom is not None else 0.9))
        cfg = TrainConfig(epochs=30, batch_size=32, seed=42)

        hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)

        lr_sensitivity_results.append({
            'Optimizer': name,
            'LR Multiplier': mult,
            'Learning Rate': lr,
            'Final Val F1': hist_df['val_f1'].iloc[-1],
            'Final Val Loss': hist_df['val_loss'].iloc[-1]
        })

    print(f"  → LR range: {base_lr*0.1:.5f} to {base_lr*5:.5f}")

df_lr_sens = pd.DataFrame(lr_sensitivity_results)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))


ax = axes[0]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_lr_sens[df_lr_sens['Optimizer'] == name]
    ax.plot(data['LR Multiplier'], data['Final Val F1'],
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Learning Rate Multiplier (× optimal)', fontsize=11)
ax.set_ylabel('Final Validation F1', fontsize=11)
ax.set_title('Learning Rate Sensitivity: F1 Score', fontsize=12, fontweight='bold')
ax.set_xscale('log')
ax.set_xticks(lr_multipliers)
ax.set_xticklabels(['0.1×', '0.5×', '1×', '2×', '5×'])
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[1]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_lr_sens[df_lr_sens['Optimizer'] == name]
    ax.plot(data['LR Multiplier'], data['Final Val Loss'],
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Learning Rate Multiplier (× optimal)', fontsize=11)
ax.set_ylabel('Final Validation Loss', fontsize=11)
ax.set_title('Learning Rate Sensitivity: Loss', fontsize=12, fontweight='bold')
ax.set_xscale('log')
ax.set_xticks(lr_multipliers)
ax.set_xticklabels(['0.1×', '0.5×', '1×', '2×', '5×'])
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pima_results/3_lr_sensitivity.png', dpi=200, bbox_inches='tight')
plt.close()

df_lr_sens.to_csv('pima_results/lr_sensitivity_results.csv', index=False)
print("\n Learning rate sensitivity analysis complete!")


Testing Adam...
  → LR range: 0.00100 to 0.05000
Testing Nesterov...
  → LR range: 0.00100 to 0.05000
Testing Momentum...
  → LR range: 0.00100 to 0.05000
Testing RMSprop...
  → LR range: 0.00010 to 0.00500
Testing SGD...
  → LR range: 0.01000 to 0.50000
Testing Adagrad...
  → LR range: 0.00300 to 0.15000

✓ Learning rate sensitivity analysis complete!


In [21]:
def add_label_noise(y, noise_rate=0.15, seed=42):
    np.random.seed(seed)
    y_noisy = y.copy()
    n_flip = int(len(y) * noise_rate)
    flip_idx = np.random.choice(len(y), n_flip, replace=False)
    y_noisy[flip_idx] = 1 - y_noisy[flip_idx]
    return y_noisy

noise_rates = [0.0, 0.10, 0.20]
noise_results = []

for noise_rate in noise_rates:
    print(f"\nTesting with {noise_rate*100:.0f}% label noise...")
    y_train_noisy = add_label_noise(y_train, noise_rate, seed=42)

    for name in OPTIMIZER_NAMES:
        n_features = X_train.shape[1]
        lr = BEST_HPARAMS[name]['lr']
        mom = BEST_HPARAMS[name]['momentum']

        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(mom if mom is not None else 0.9))
        cfg = TrainConfig(epochs=40, batch_size=32, seed=42)

        hist_df = train_model_detailed(model, opt, X_train, y_train_noisy, X_val, y_val, cfg)
        test_results = model.evaluate(X_test, y_test)

        noise_results.append({
            'Optimizer': name,
            'Noise Rate': noise_rate,
            'Val F1': hist_df['val_f1'].iloc[-1],
            'Test F1': test_results['f1'],
            'Test Accuracy': test_results['accuracy']
        })

        print(f"  {name:9s}: Val F1={hist_df['val_f1'].iloc[-1]:.4f}, Test F1={test_results['f1']:.4f}")

df_noise = pd.DataFrame(noise_results)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))


ax = axes[0]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_noise[df_noise['Optimizer'] == name]
    ax.plot(data['Noise Rate']*100, data['Test F1'],
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Label Noise Rate (%)', fontsize=11)
ax.set_ylabel('Test F1 Score', fontsize=11)
ax.set_title('Robustness to Label Noise', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[1]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_noise[df_noise['Optimizer'] == name]
    baseline_f1 = data[data['Noise Rate'] == 0.0]['Test F1'].values[0]
    degradation = [(baseline_f1 - row['Test F1']) for _, row in data.iterrows()]
    ax.plot(data['Noise Rate']*100, degradation,
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Label Noise Rate (%)', fontsize=11)
ax.set_ylabel('F1 Score Degradation', fontsize=11)
ax.set_title('Performance Degradation vs Noise', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pima_results/4_noise_robustness.png', dpi=200, bbox_inches='tight')
plt.close()

df_noise.to_csv('pima_results/noise_robustness_results.csv', index=False)
print("\n Noise robustness analysis complete!")



Testing with 0% label noise...
  Adam     : Val F1=0.5429, Test F1=0.5625
  Nesterov : Val F1=0.5634, Test F1=0.5625
  Momentum : Val F1=0.5634, Test F1=0.5625
  RMSprop  : Val F1=0.5867, Test F1=0.5588
  SGD      : Val F1=0.5217, Test F1=0.5806
  Adagrad  : Val F1=0.5429, Test F1=0.5625

Testing with 10% label noise...
  Adam     : Val F1=0.5278, Test F1=0.5758
  Nesterov : Val F1=0.5278, Test F1=0.5538
  Momentum : Val F1=0.5278, Test F1=0.5538
  RMSprop  : Val F1=0.5600, Test F1=0.6176
  SGD      : Val F1=0.5278, Test F1=0.5538
  Adagrad  : Val F1=0.5070, Test F1=0.5538

Testing with 20% label noise...
  Adam     : Val F1=0.5000, Test F1=0.5846
  Nesterov : Val F1=0.5000, Test F1=0.5846
  Momentum : Val F1=0.5000, Test F1=0.5846
  RMSprop  : Val F1=0.4789, Test F1=0.6087
  SGD      : Val F1=0.5000, Test F1=0.5625
  Adagrad  : Val F1=0.4789, Test F1=0.5758

✓ Noise robustness analysis complete!


In [22]:
batch_sizes = [16, 32, 64, 128, 256]
batch_results = []

for bs in batch_sizes:
    print(f"\nTesting with batch size {bs}...")
    for name in OPTIMIZER_NAMES:
        n_features = X_train.shape[1]
        lr = BEST_HPARAMS[name]['lr']
        mom = BEST_HPARAMS[name]['momentum']

        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(mom if mom is not None else 0.9))
        cfg = TrainConfig(epochs=40, batch_size=bs, seed=42)

        start = pd.Timestamp.now()
        hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
        elapsed = (pd.Timestamp.now() - start).total_seconds()

        test_results = model.evaluate(X_test, y_test)

        batch_results.append({
            'Optimizer': name,
            'Batch Size': bs,
            'Val F1': hist_df['val_f1'].iloc[-1],
            'Test F1': test_results['f1'],
            'Time (sec)': elapsed
        })

        print(f"  {name:9s}: Test F1={test_results['f1']:.4f}, Time={elapsed:.2f}s")

df_batch = pd.DataFrame(batch_results)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))


ax = axes[0]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_batch[df_batch['Optimizer'] == name]
    ax.plot(data['Batch Size'], data['Test F1'],
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Batch Size', fontsize=11)
ax.set_ylabel('Test F1 Score', fontsize=11)
ax.set_title('Batch Size Sensitivity: F1 Score', fontsize=12, fontweight='bold')
ax.set_xscale('log', base=2)
ax.set_xticks(batch_sizes)
ax.set_xticklabels([str(bs) for bs in batch_sizes])
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)


ax = axes[1]
for i, name in enumerate(OPTIMIZER_NAMES):
    data = df_batch[df_batch['Optimizer'] == name]
    ax.plot(data['Batch Size'], data['Time (sec)'],
            marker='o', label=name, linewidth=2.5, markersize=8,
            color=colors[i], alpha=0.8)
ax.set_xlabel('Batch Size', fontsize=11)
ax.set_ylabel('Training Time (seconds)', fontsize=11)
ax.set_title('Batch Size Sensitivity: Time', fontsize=12, fontweight='bold')
ax.set_xscale('log', base=2)
ax.set_xticks(batch_sizes)
ax.set_xticklabels([str(bs) for bs in batch_sizes])
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pima_results/5_batch_sensitivity.png', dpi=200, bbox_inches='tight')
plt.close()

df_batch.to_csv('pima_results/batch_sensitivity_results.csv', index=False)
print("\n Batch size sensitivity analysis complete!")



Testing with batch size 16...
  Adam     : Test F1=0.5625, Time=0.48s
  Nesterov : Test F1=0.5714, Time=0.45s
  Momentum : Test F1=0.5625, Time=0.48s
  RMSprop  : Test F1=0.5625, Time=0.46s
  SGD      : Test F1=0.5806, Time=0.44s
  Adagrad  : Test F1=0.5938, Time=0.43s

Testing with batch size 32...
  Adam     : Test F1=0.5625, Time=0.43s
  Nesterov : Test F1=0.5625, Time=0.40s
  Momentum : Test F1=0.5625, Time=0.40s
  RMSprop  : Test F1=0.5588, Time=0.40s
  SGD      : Test F1=0.5806, Time=0.38s
  Adagrad  : Test F1=0.5625, Time=0.41s

Testing with batch size 64...
  Adam     : Test F1=0.5625, Time=0.35s
  Nesterov : Test F1=0.5625, Time=0.54s
  Momentum : Test F1=0.5625, Time=0.52s
  RMSprop  : Test F1=0.5217, Time=0.52s
  SGD      : Test F1=0.5806, Time=0.58s
  Adagrad  : Test F1=0.5312, Time=0.54s

Testing with batch size 128...
  Adam     : Test F1=0.5846, Time=0.36s
  Nesterov : Test F1=0.5714, Time=0.34s
  Momentum : Test F1=0.5714, Time=0.34s
  RMSprop  : Test F1=0.5634, Time=0

In [23]:
summary_data = []
for name in OPTIMIZER_NAMES:

    hist = convergence_histories[name]
    n_features = X_train.shape[1]
    lr = BEST_HPARAMS[name]['lr']
    mom = BEST_HPARAMS[name]['momentum']
    model = LogisticRegression(n_features)
    opt = make_optimizer(name, lr=lr, momentum=(mom if mom is not None else 0.9))
    cfg = TrainConfig(epochs=50, batch_size=32, seed=42)


    _ = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
    test_results = model.evaluate(X_test, y_test)


    early_stop = df_early_stop[df_early_stop['Optimizer'] == name]['Early Stop Epoch'].values[0]
    auc_f1 = df_early_stop[df_early_stop['Optimizer'] == name]['AUC F1 (0-50)'].values[0]
    time_taken = convergence_times[name]


    lr_std = df_lr_sens[df_lr_sens['Optimizer'] == name]['Final Val F1'].std()


    noise_data = df_noise[df_noise['Optimizer'] == name]
    noise_drop = noise_data[noise_data['Noise Rate'] == 0.0]['Test F1'].values[0] - \
                 noise_data[noise_data['Noise Rate'] == 0.20]['Test F1'].values[0]

    summary_data.append({
        'Optimizer': name,
        'Test F1': test_results['f1'],
        'Test Accuracy': test_results['accuracy'],
        'Test ROC-AUC': test_results['roc_auc'],
        'Early Stop Epoch': early_stop,
        'AUC F1 (Convergence)': auc_f1,
        'Time (sec)': time_taken,
        'LR Sensitivity (σ)': lr_std,
        'Noise Robustness (Δ)': noise_drop
    })

df_summary = pd.DataFrame(summary_data)
df_summary = df_summary.round(4)

print("\nFINAL COMPARISON TABLE:")
print("="*70)
print(df_summary.to_string(index=False))
print("="*70)

df_summary.to_csv('pima_results/FINAL_SUMMARY.csv', index=False)


fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)


ax1 = fig.add_subplot(gs[0, :])
x_pos = np.arange(len(OPTIMIZER_NAMES))
width = 0.25
ax1.bar(x_pos - width, df_summary['Test F1'], width, label='F1 Score', alpha=0.8)
ax1.bar(x_pos, df_summary['Test Accuracy'], width, label='Accuracy', alpha=0.8)
ax1.bar(x_pos + width, df_summary['Test ROC-AUC'], width, label='ROC-AUC', alpha=0.8)
ax1.set_xticks(x_pos)
ax1.set_xticklabels(OPTIMIZER_NAMES)
ax1.set_ylabel('Score', fontsize=11)
ax1.set_title('Final Test Performance Comparison', fontsize=13, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(axis='y', alpha=0.3)


ax2 = fig.add_subplot(gs[1, 0])
bars = ax2.barh(OPTIMIZER_NAMES, df_summary['AUC F1 (Convergence)'], color=colors, alpha=0.7)
ax2.set_xlabel('AUC F1 Score', fontsize=10)
ax2.set_title('Convergence Quality', fontsize=11, fontweight='bold')
ax2.grid(axis='x', alpha=0.3)


ax3 = fig.add_subplot(gs[1, 1])
bars = ax3.barh(OPTIMIZER_NAMES, df_summary['Time (sec)'], color=colors, alpha=0.7)
ax3.set_xlabel('Training Time (sec)', fontsize=10)
ax3.set_title('Computational Efficiency', fontsize=11, fontweight='bold')
ax3.grid(axis='x', alpha=0.3)


ax4 = fig.add_subplot(gs[1, 2])
bars = ax4.barh(OPTIMIZER_NAMES, df_summary['Early Stop Epoch'], color=colors, alpha=0.7)
ax4.set_xlabel('Epochs to 95% Final F1', fontsize=10)
ax4.set_title('Early Stopping Point', fontsize=11, fontweight='bold')
ax4.grid(axis='x', alpha=0.3)


ax5 = fig.add_subplot(gs[2, 0])
bars = ax5.barh(OPTIMIZER_NAMES, df_summary['LR Sensitivity'], color=colors, alpha=0.7)
ax5.set_xlabel('Std Dev of F1 (lower=robust)', fontsize=10)
ax5.set_title('LR Sensitivity', fontsize=11, fontweight='bold')
ax5.grid(axis='x', alpha=0.3)


ax6 = fig.add_subplot(gs[2, 1])
bars = ax6.barh(OPTIMIZER_NAMES, df_summary['Noise Robustness'], color=colors, alpha=0.7)
ax6.set_xlabel('F1 Drop at 20% noise (lower=robust)', fontsize=10)
ax6.set_title('Noise Robustness', fontsize=11, fontweight='bold')
ax6.grid(axis='x', alpha=0.3)


ax7 = fig.add_subplot(gs[2, 2])

composite = (
    (df_summary['Test F1'] - df_summary['Test F1'].min()) / (df_summary['Test F1'].max() - df_summary['Test F1'].min()) * 0.3 +
    (df_summary['AUC F1 (Convergence)'] - df_summary['AUC F1 (Convergence)'].min()) / (df_summary['AUC F1 (Convergence)'].max() - df_summary['AUC F1 (Convergence)'].min()) * 0.25 +
    (1 - (df_summary['Time (sec)'] - df_summary['Time (sec)'].min()) / (df_summary['Time (sec)'].max() - df_summary['Time (sec)'].min())) * 0.15 +
    (1 - (df_summary['LR Sensitivity '] - df_summary['LR Sensitivity '].min()) / (df_summary['LR Sensitivity (σ)'].max() - df_summary['LR Sensitivity (σ)'].min())) * 0.15 +
    (1 - (df_summary['Noise Robustness '] - df_summary['Noise Robustness '].min()) / (df_summary['Noise Robustness (Δ)'].max() - df_summary['Noise Robustness (Δ)'].min())) * 0.15
)
df_summary['Composite Score'] = composite.fillna(0)
df_ranked = df_summary.sort_values('Composite Score', ascending=False)

bars = ax7.barh(df_ranked['Optimizer'], df_ranked['Composite Score'],
                color=colors[:len(df_ranked)], alpha=0.7)
ax7.set_xlabel('Composite Score', fontsize=10)
ax7.set_title('Overall Ranking', fontsize=11, fontweight='bold')
ax7.grid(axis='x', alpha=0.3)
ax7.invert_yaxis()

plt.suptitle('COMPREHENSIVE OPTIMIZER COMPARISON DASHBOARD (PIMA)',
             fontsize=15, fontweight='bold', y=0.995)
plt.savefig('pima_results/COMPREHENSIVE_DASHBOARD.png', dpi=200, bbox_inches='tight')
plt.close()



FINAL COMPARISON TABLE:
Optimizer  Test F1  Test Accuracy  Test ROC-AUC  Early Stop Epoch  AUC F1 (Convergence)  Time (sec)  LR Sensitivity (σ)  Noise Robustness (Δ)
     Adam   0.5714         0.7672        0.8257                 1               26.1828      0.5686              0.0137               -0.0221
 Nesterov   0.5714         0.7672        0.8260                 1               26.1337      0.5534              0.0126               -0.0221
 Momentum   0.5806         0.7759        0.8257                 1               26.1446      0.5091              0.0122               -0.0221
  RMSprop   0.5538         0.7500        0.8283                 1               27.6404      0.5427              0.0105               -0.0499
      SGD   0.5625         0.7586        0.8270                 1               26.4299      0.6536              0.0263                0.0181
  Adagrad   0.5846         0.7672        0.8306                 1               27.3528      0.5311              0.0162    