In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import os
import pickle

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Figure defaults shared by every plot in the notebook.
plt.rcParams.update({
    'figure.dpi': 150,
    'axes.grid': True,
})

# Try the 'seaborn-v0_8-' style name first and fall back to the un-prefixed
# name when matplotlib reports (via OSError) that the first one is unknown.
_DARKGRID_STYLE = 'seaborn-v0_8-darkgrid'
_DARKGRID_STYLE_LEGACY = 'seaborn-darkgrid'
try:
    plt.style.use(_DARKGRID_STYLE)
except OSError:
    plt.style.use(_DARKGRID_STYLE_LEGACY)

In [None]:
# Column names for the Adult census-income table. They are passed via `names=`,
# so pandas treats the first line of the file as data, not as a header.
adult_columns = [
    'age','workclass','fnlwgt','education','education-num',
    'marital-status','occupation','relationship','race','sex',
    'capital-gain','capital-loss','hours-per-week','native-country','income'
]

DATA_PATH = '/content/adult.csv'

# '?' is read as a missing value; any row containing one is dropped.
df = pd.read_csv(
    DATA_PATH,
    names=adult_columns,
    na_values=['?'],
    skipinitialspace=True
)
df = df.dropna().reset_index(drop=True)

# Normalise the label text. rstrip('.') additionally accepts the '>50K.' spelling
# (used by the UCI `adult.test` split); without it such rows would be silently
# counted as negatives. It is a no-op when the file has no trailing dots.
df['income'] = df['income'].str.strip().str.rstrip('.')
y = (df['income'] == '>50K').astype(int).to_numpy()
X_df = df.drop(columns=['income'])

categorical_cols = ['workclass','education','marital-status','occupation',
                   'relationship','race','sex','native-country']
numeric_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']

# Standardise numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all-zeros.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# Split ROW POSITIONS first (70 / 15 / 15, stratified on the label) so that the
# preprocessor can be fitted on training rows only. Fitting it on the full table
# would leak validation/test statistics (means, variances, category sets) into
# training. The partition depends only on the sample count, `y` and
# `random_state`, so it selects the same rows as splitting the feature matrix.
row_ids = np.arange(len(X_df))
train_idx, temp_idx, y_train, y_temp = train_test_split(
    row_ids, y, test_size=0.30, random_state=42, stratify=y
)
val_idx, test_idx, y_val, y_test = train_test_split(
    temp_idx, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

preprocessor.fit(X_df.iloc[train_idx])

# `X` is still the full transformed matrix (kept for any code that uses it),
# but it is now produced by a transformer that only ever saw training rows.
X = preprocessor.transform(X_df).astype(np.float64)
X_train, X_temp = X[train_idx], X[temp_idx]
X_val, X_test = X[val_idx], X[test_idx]

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print(f"Positive rate (>50K): {y.mean():.4f}\n")


In [None]:
class LogisticRegression:
    """Binary logistic regression with a weight vector and a scalar bias.

    The model only holds parameters and computes predictions, loss and
    gradients; the actual parameter update is done by an external optimizer.
    """

    def __init__(self, n_features):
        """Start from all-zero weights (float64, length `n_features`) and zero bias."""
        self.bias = 0.0
        self.weights = np.zeros(n_features, dtype=np.float64)

    def sigmoid(self, z):
        """Logistic function; inputs are clipped to [-500, 500] before `exp`."""
        clipped = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-clipped))

    def forward(self, X):
        """Return the predicted probability for each row of `X`."""
        logits = X @ self.weights + self.bias
        return self.sigmoid(logits)

    def compute_loss(self, y_true, y_pred):
        """Mean binary cross-entropy; `eps` keeps the logarithms away from log(0)."""
        eps = 1e-15
        positive_term = y_true * np.log(y_pred + eps)
        negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
        return -np.mean(positive_term + negative_term)

    def compute_gradients(self, X, y_true, y_pred):
        """Return `(dw, db)`: gradients averaged over the rows of `X`."""
        residual = y_pred - y_true
        grad_w = (X.T @ residual) / X.shape[0]
        grad_b = np.mean(residual)
        return grad_w, grad_b

    def predict(self, X):
        """Hard 0/1 predictions: 1 where the probability is strictly above 0.5."""
        probabilities = self.forward(X)
        return (probabilities > 0.5).astype(int)

    def evaluate(self, X, y_true):
        """Return a dict with loss, accuracy, precision, recall, f1 and roc_auc.

        Threshold metrics use the same strict `> 0.5` cut as `predict`;
        `roc_auc` is computed from the raw probabilities.
        """
        probabilities = self.forward(X)
        labels = (probabilities > 0.5).astype(int)

        report = {
            'loss': self.compute_loss(y_true, probabilities),
            'accuracy': accuracy_score(y_true, labels),
        }
        # These three scorers share the zero_division=0 convention.
        for key, scorer in (('precision', precision_score),
                            ('recall', recall_score),
                            ('f1', f1_score)):
            report[key] = scorer(y_true, labels, zero_division=0)
        report['roc_auc'] = roc_auc_score(y_true, probabilities)
        return report

In [None]:
class SGDOptimizer:
    """Stateless gradient-descent step: parameter minus learning rate times gradient."""

    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate

    def update(self, w, b, dw, db):
        """Return the updated `(w, b)`; the inputs are not modified."""
        new_w = w - self.lr*dw
        new_b = b - self.lr*db
        return new_w, new_b

class MomentumOptimizer:
    """Gradient descent with a velocity term (classical momentum).

    The velocity is `m * velocity - lr * gradient` and is added to the
    parameters. The weight velocity is created lazily on the first update so
    it takes the shape of `w`.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.m = momentum
        self.vw = None   # weight velocity, allocated on first update
        self.vb = 0.0    # bias velocity

    def update(self, w, b, dw, db):
        """Advance both velocities by one step and return the moved `(w, b)`."""
        previous_vw = np.zeros_like(w) if self.vw is None else self.vw
        self.vw = self.m * previous_vw - self.lr * dw
        self.vb = self.m * self.vb - self.lr * db
        return w + self.vw, b + self.vb

class NesterovOptimizer:
    """Momentum variant whose step combines the previous and the new velocity.

    After the velocity update `v_new = m * v_old - lr * grad`, the parameter
    moves by `-m * v_old + (1 + m) * v_new`.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.m = momentum
        self.vw = None   # weight velocity, allocated on first update
        self.vb = 0.0    # bias velocity

    def update(self, w, b, dw, db):
        """Update the stored velocities and return the moved `(w, b)`."""
        # Keep the pre-update velocities: the step needs both old and new values.
        old_vw = np.zeros_like(w) if self.vw is None else self.vw
        old_vb = self.vb

        self.vw = self.m * old_vw - self.lr * dw
        self.vb = self.m * old_vb - self.lr * db

        step_w = -self.m * old_vw + (1 + self.m) * self.vw
        step_b = -self.m * old_vb + (1 + self.m) * self.vb
        return w + step_w, b + step_b

class AdagradOptimizer:
    """Per-parameter step size scaled by the running SUM of squared gradients."""

    def __init__(self, learning_rate=0.01, epsilon=1e-8):
        self.lr = learning_rate
        self.eps = epsilon
        self.gs_w = None   # accumulated squared weight gradients, allocated lazily
        self.gs_b = 0.0    # accumulated squared bias gradient

    def update(self, w, b, dw, db):
        """Accumulate the squared gradients, then return the updated `(w, b)`."""
        if self.gs_w is None:
            self.gs_w = np.zeros_like(w)

        self.gs_w += dw ** 2
        self.gs_b += db ** 2

        # eps is added to the square root, not inside it.
        scale_w = np.sqrt(self.gs_w) + self.eps
        scale_b = np.sqrt(self.gs_b) + self.eps
        new_w = w - self.lr * dw / scale_w
        new_b = b - self.lr * db / scale_b
        return (new_w, new_b)

class RMSpropOptimizer:
    """Per-parameter step size scaled by an exponential MOVING AVERAGE of squared gradients."""

    def __init__(self, learning_rate=0.001, decay=0.9, epsilon=1e-8):
        self.lr = learning_rate
        self.decay = decay
        self.eps = epsilon
        self.ms_w = None   # running mean of squared weight gradients, allocated lazily
        self.ms_b = 0.0    # running mean of squared bias gradient

    def update(self, w, b, dw, db):
        """Refresh the running averages and return the updated `(w, b)`."""
        previous_ms_w = np.zeros_like(w) if self.ms_w is None else self.ms_w
        keep, fresh = self.decay, 1 - self.decay

        self.ms_w = keep * previous_ms_w + fresh * (dw ** 2)
        self.ms_b = keep * self.ms_b + fresh * (db ** 2)

        new_w = w - self.lr * dw / (np.sqrt(self.ms_w) + self.eps)
        new_b = b - self.lr * db / (np.sqrt(self.ms_b) + self.eps)
        return new_w, new_b


class AdamOptimizer:
    """Adam: bias-corrected running means of the gradient and of its square."""

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.b1 = beta1
        self.b2 = beta2
        self.eps = epsilon
        # First (m*) and second (v*) moment estimates; the weight ones are
        # allocated on the first update so they take the shape of `w`.
        self.mw = None
        self.vw = None
        self.mb = 0.0
        self.vb = 0.0
        self.t = 0   # number of updates performed so far

    def update(self, w, b, dw, db):
        """Advance the moment estimates by one step and return the updated `(w, b)`."""
        if self.mw is None:
            self.mw = np.zeros_like(w)
            self.vw = np.zeros_like(w)
        self.t += 1

        # First moments (running mean of the gradient).
        self.mw = self.b1*self.mw + (1-self.b1)*dw
        self.mb = self.b1*self.mb + (1-self.b1)*db
        # Second moments (running mean of the squared gradient).
        self.vw = self.b2*self.vw + (1-self.b2)*(dw**2)
        self.vb = self.b2*self.vb + (1-self.b2)*(db**2)

        # Both estimates start at zero; dividing by (1 - beta**t) removes that bias.
        correction1 = 1 - self.b1**self.t
        correction2 = 1 - self.b2**self.t
        mw_hat, mb_hat = self.mw/correction1, self.mb/correction1
        vw_hat, vb_hat = self.vw/correction2, self.vb/correction2

        new_w = w - self.lr*mw_hat/(np.sqrt(vw_hat)+self.eps)
        new_b = b - self.lr*mb_hat/(np.sqrt(vb_hat)+self.eps)
        return (new_w, new_b)

In [None]:
@dataclass
class TrainConfig:
    """Settings for one training run.

    Raises:
        ValueError: on construction, if `epochs` or `l2_lambda` is negative
            or `batch_size` is not positive.
    """
    epochs: int = 50          # number of passes over the training set
    batch_size: int = 32      # rows per mini-batch
    seed: int = 42            # value passed to set_seed() before training
    l2_lambda: float = 0.0    # L2 penalty strength; 0 disables the penalty

    def __post_init__(self):
        # A negative batch size would make the mini-batch loop run zero times,
        # so an epoch would silently report 0.0 loss and a NaN gradient norm.
        # Reject such values up front.
        if self.batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {self.batch_size}")
        if self.epochs < 0:
            raise ValueError(f"epochs must be non-negative, got {self.epochs}")
        if self.l2_lambda < 0:
            raise ValueError(f"l2_lambda must be non-negative, got {self.l2_lambda}")

def set_seed(seed=42):
    """Seed NumPy's global random state so later `np.random.*` calls are repeatable."""
    np.random.seed(seed=seed)

def train_epoch(model, X_tr, y_tr, optimizer, cfg: TrainConfig, track_gradients=False):
    """Run one shuffled mini-batch pass over the training data.

    The row order comes from the global NumPy RNG. For every mini-batch the
    loss (plus L2 penalty) is recorded, gradients are computed, and
    `model.weights` / `model.bias` are replaced by the optimizer's result.

    Returns:
        The sample-weighted mean loss for the epoch. When `track_gradients`
        is true, a tuple of that loss and the mean L2 norm of the weight
        gradient over all batches.
    """
    n_samples = X_tr.shape[0]
    order = np.random.permutation(n_samples)
    step = cfg.batch_size
    loss_sum = 0.0
    norms = []

    for start in range(0, n_samples, step):
        rows = order[start:start + step]
        x_batch, y_batch = X_tr[rows], y_tr[rows]
        probs = model.forward(x_batch)

        # Weight each batch's loss by its size: the last batch may be shorter.
        data_loss = model.compute_loss(y_batch, probs)
        penalty = 0.5 * cfg.l2_lambda * np.dot(model.weights, model.weights)
        loss_sum += (data_loss + penalty) * len(y_batch)

        grad_w, grad_b = model.compute_gradients(x_batch, y_batch, probs)
        if cfg.l2_lambda > 0.0:
            # Gradient of the L2 penalty; the bias is not regularised.
            grad_w = grad_w + cfg.l2_lambda * model.weights

        if track_gradients:
            norms.append(np.linalg.norm(grad_w))

        model.weights, model.bias = optimizer.update(model.weights, model.bias, grad_w, grad_b)

    epoch_loss = loss_sum / n_samples
    return (epoch_loss, np.mean(norms)) if track_gradients else epoch_loss

def train_model_detailed(model, optimizer, X_tr, y_tr, X_va, y_va, cfg: TrainConfig):
    """Train for `cfg.epochs` epochs and return one DataFrame row per epoch.

    The global RNG is reseeded with `cfg.seed` first. After every epoch the
    model is evaluated on the validation data. Columns of the result: epoch
    (1-based), train_loss, val_loss, val_acc, val_f1, val_roc_auc, grad_norm.
    """
    set_seed(cfg.seed)

    columns = ('epoch', 'train_loss', 'val_loss',
               'val_acc', 'val_f1', 'val_roc_auc', 'grad_norm')
    log = {column: [] for column in columns}

    for epoch_number in range(1, cfg.epochs + 1):
        epoch_loss, epoch_grad_norm = train_epoch(
            model, X_tr, y_tr, optimizer, cfg, track_gradients=True
        )
        metrics = model.evaluate(X_va, y_va)

        # Values listed in the same order as `columns`.
        row = (epoch_number, epoch_loss, metrics['loss'],
               metrics['accuracy'], metrics['f1'], metrics['roc_auc'], epoch_grad_norm)
        for column, value in zip(columns, row):
            log[column].append(value)

    return pd.DataFrame(log)

In [None]:
def make_optimizer(name, lr=0.01, momentum=0.9):
    """Build the optimizer called `name`.

    `momentum` is only forwarded to 'Momentum' and 'Nesterov'; the other
    optimizers receive just the learning rate.

    Raises:
        ValueError: if `name` is not one of the known optimizer names.
    """
    # Constructors are wrapped in lambdas so only the requested one is built.
    factories = (
        ('SGD',      lambda: SGDOptimizer(lr)),
        ('Momentum', lambda: MomentumOptimizer(lr, momentum)),
        ('Nesterov', lambda: NesterovOptimizer(lr, momentum)),
        ('Adagrad',  lambda: AdagradOptimizer(lr)),
        ('RMSprop',  lambda: RMSpropOptimizer(lr)),
        ('Adam',     lambda: AdamOptimizer(lr)),
    )
    for known_name, build in factories:
        if name == known_name:
            return build()
    raise ValueError(name)

# Per-optimizer settings used as the baseline in every experiment below.
# `momentum` is None for optimizers that take no momentum argument.
# NOTE(review): the name suggests these came from an earlier search - confirm.
BEST_HPARAMS = {
    optimizer_name: {'lr': learning_rate, 'momentum': momentum}
    for optimizer_name, learning_rate, momentum in (
        ('Adam',     0.01,  None),
        ('Momentum', 0.01,  0.95),
        ('Nesterov', 0.01,  0.95),
        ('RMSprop',  0.001, None),
        ('SGD',      0.1,   None),
        ('Adagrad',  0.03,  None),
    )
}

# Order in which optimizers are trained, plotted and assigned colours.
OPTIMIZER_NAMES = 'Adam Nesterov Momentum RMSprop SGD Adagrad'.split()



In [None]:
# Train one fresh model per optimizer for 50 epochs, keeping the per-epoch
# history and wall-clock time, then plot four convergence views.
convergence_histories = {}
convergence_times = {}

for name in OPTIMIZER_NAMES:
    print(f"Training {name}...")
    hparams = BEST_HPARAMS[name]
    lr, mom = hparams['lr'], hparams['momentum']
    n_features = X_train.shape[1]

    cfg = TrainConfig(epochs=50, batch_size=32, seed=42)
    model = LogisticRegression(n_features)
    # Optimizers without a momentum setting still get the factory default of 0.9.
    opt = make_optimizer(name, lr=lr, momentum=(0.9 if mom is None else mom))

    start = pd.Timestamp.now()
    hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
    elapsed = (pd.Timestamp.now() - start).total_seconds()

    convergence_histories[name], convergence_times[name] = hist_df, elapsed

    test_results = model.evaluate(X_test, y_test)
    # Row labels 9 / 29 / 49 are epochs 10 / 30 / 50 (the history index is 0-based).
    print(f"  → F1 @ ep10: {hist_df.loc[9,'val_f1']:.4f}, ep30: {hist_df.loc[29,'val_f1']:.4f}, "
          f"ep50: {hist_df.loc[49,'val_f1']:.4f} | Test F1: {test_results['f1']:.4f} | Time: {elapsed:.2f}s")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# One colour per optimizer; later cells reuse `colors` with the same ordering.
colors = plt.cm.tab10(range(len(OPTIMIZER_NAMES)))

# (grid position, history column, y-axis label, title, use log y-axis)
convergence_panels = [
    ((0, 0), 'train_loss', 'Training Loss',       'Training Loss Convergence',   False),
    ((0, 1), 'val_loss',   'Validation Loss',     'Validation Loss Convergence', False),
    ((1, 0), 'val_f1',     'Validation F1 Score', 'Validation F1 Convergence',   False),
    ((1, 1), 'grad_norm',  'Gradient Norm',       'Gradient Stability',          True),
]

for (grid_row, grid_col), column, y_label, title, log_y in convergence_panels:
    ax = axes[grid_row, grid_col]
    for i, name in enumerate(OPTIMIZER_NAMES):
        history = convergence_histories[name]
        ax.plot(history['epoch'], history[column],
                label=name, linewidth=2.5, color=colors[i], alpha=0.8)
    ax.set_xlabel('Epoch', fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    if log_y:
        ax.set_yscale('log')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Validation-F1 change between epoch 1 and epoch 10, per optimizer.
early_f1_gains = {}
for name, hist in convergence_histories.items():
    early_f1_gains[name] = hist.at[9, 'val_f1'] - hist.at[0, 'val_f1']

# Standard deviation of the validation loss over the last 20 epochs.
stability = {}
for name in OPTIMIZER_NAMES:
    stability[name] = convergence_histories[name]['val_loss'].iloc[-20:].std()

In [None]:

def compute_auc_f1(history_df, max_epoch=50):
    """Area under the validation-F1 curve over the first `max_epoch` epochs.

    Args:
        history_df: DataFrame with a 'val_f1' column, one row per epoch.
        max_epoch: number of leading rows to integrate over.

    Returns:
        Trapezoidal-rule integral of 'val_f1' with unit spacing between epochs.
    """
    vals = history_df['val_f1'].iloc[:max_epoch].to_numpy()
    # np.trapezoid only exists from NumPy 2.0 on; earlier releases expose the
    # same routine as np.trapz. `or` short-circuits, so the deprecated name is
    # only looked up when the new one is missing.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return integrate(vals, dx=1)

def find_convergence_epoch(history_df, metric='val_f1', threshold=0.01, patience=3):
    """Return the 1-based epoch at which `metric` first stops moving.

    "Stops moving" means `patience` consecutive epoch-to-epoch absolute
    changes are all below `threshold`; the returned epoch is the one where
    that quiet stretch begins. If there are fewer than `patience + 1` epochs,
    or no such stretch exists, the total number of epochs is returned.
    """
    series = history_df[metric].values
    n_epochs = len(series)
    if n_epochs < patience + 1:
        return n_epochs

    deltas = np.abs(np.diff(series))
    window_starts = range(len(deltas) - patience + 1)
    first_flat = next(
        (s for s in window_starts if (deltas[s:s + patience] < threshold).all()),
        None,
    )
    if first_flat is None:
        return len(history_df)
    return first_flat + 1

# Summarise each optimizer's 50-epoch run from `convergence_histories`.
early_stopping_results = []
for name in OPTIMIZER_NAMES:
    hist = convergence_histories[name]

    final_f1 = hist['val_f1'].iloc[-1]
    early_stop_epoch = find_convergence_epoch(
        hist, metric='val_f1', threshold=0.01, patience=3
    )
    auc_f1 = compute_auc_f1(hist, 50)

    summary_row = {
        'Optimizer': name,
        'Final F1': final_f1,
        'F1 @ Epoch 10': hist.loc[9, 'val_f1'],
        'F1 @ Epoch 20': hist.loc[19, 'val_f1'],
        'Early Stop Epoch': early_stop_epoch,
        'AUC F1 (0-50)': auc_f1,
        'Time (sec)': convergence_times[name],
    }
    early_stopping_results.append(summary_row)
    print(f"{name:9s}: Early stop @ ep {early_stop_epoch:2d}, AUC={auc_f1:.2f}, Final F1={final_f1:.4f}")

df_early_stop = pd.DataFrame(early_stopping_results)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (summary column, y-axis label, title, formatter for the label above each bar)
bar_panels = [
    ('Early Stop Epoch', 'Epochs to convergence (new metric)', 'Early Stopping Comparison',
     lambda height: f'{int(height)}'),
    ('AUC F1 (0-50)', 'Area Under F1 Curve', 'Overall Convergence Quality',
     lambda height: f'{height:.1f}'),
    ('Time (sec)', 'Training Time (seconds)', 'Computational Efficiency',
     lambda height: f'{height:.2f}'),
]

# `colors` comes from the convergence cell and follows OPTIMIZER_NAMES order.
for ax, (column, y_label, title, format_height) in zip(axes, bar_panels):
    bars = ax.bar(df_early_stop['Optimizer'], df_early_stop[column],
                  color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    for bar in bars:
        height = bar.get_height()
        # Print the value centred just above the bar.
        ax.text(bar.get_x() + bar.get_width()/2., height,
                format_height(height), ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()



In [None]:

# Retrain every optimizer for 30 epochs at several multiples of its baseline
# learning rate and record the final validation F1 and loss.
lr_multipliers = [0.1, 0.5, 1.0, 2.0, 5.0]
lr_sensitivity_results = []

for name in OPTIMIZER_NAMES:
    print(f"Testing {name}...")
    hparams = BEST_HPARAMS[name]
    base_lr, mom = hparams['lr'], hparams['momentum']

    for mult in lr_multipliers:
        lr = base_lr * mult
        n_features = X_train.shape[1]

        cfg = TrainConfig(epochs=30, batch_size=32, seed=42)
        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(0.9 if mom is None else mom))

        hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
        last_epoch = hist_df.iloc[-1]

        lr_sensitivity_results.append({
            'Optimizer': name,
            'LR Multiplier': mult,
            'Learning Rate': lr,
            'Final Val F1': last_epoch['val_f1'],
            'Final Val Loss': last_epoch['val_loss'],
        })

    print(f"  → LR range: {base_lr*0.1:.5f} to {base_lr*5:.5f}")

df_lr_sens = pd.DataFrame(lr_sensitivity_results)

# Spread of the final F1 across the tested learning rates, per optimizer.
lr_range_df = (
    df_lr_sens.groupby('Optimizer')['Final Val F1']
    .agg(['max', 'min'])
)
lr_range_df['range'] = lr_range_df['max'] - lr_range_df['min']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (result column, y-axis label, title) for the left and right panel.
lr_panels = [
    ('Final Val F1', 'Final Validation F1', 'Learning Rate Sensitivity: F1 Score'),
    ('Final Val Loss', 'Final Validation Loss', 'Learning Rate Sensitivity: Loss'),
]

for ax, (column, y_label, title) in zip(axes, lr_panels):
    for i, name in enumerate(OPTIMIZER_NAMES):
        data = df_lr_sens[df_lr_sens['Optimizer'] == name]
        ax.plot(data['LR Multiplier'], data[column],
                marker='o', label=name, linewidth=2.5, markersize=8,
                color=colors[i], alpha=0.8)
    ax.set_xlabel('Learning Rate Multiplier (× optimal)', fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xscale('log')
    ax.set_xticks(lr_multipliers)
    ax.set_xticklabels(['0.1×', '0.5×', '1×', '2×', '5×'])
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:

def add_label_noise(y, noise_rate=0.15, seed=42):
    """Return a copy of `y` with a random subset of its 0/1 labels flipped.

    Args:
        y: array of binary labels (0 or 1); it is not modified.
        noise_rate: fraction of labels to flip; `int(len(y) * noise_rate)`
            distinct positions are chosen.
        seed: seed for the private random generator used to pick positions.

    Returns:
        A copy of `y` with the chosen positions replaced by `1 - label`.
    """
    # A private RandomState is used instead of np.random.seed(): it yields the
    # same positions for a given seed, but does not reset the global RNG as a
    # hidden side effect of adding noise.
    rng = np.random.RandomState(seed)
    y_noisy = y.copy()
    n_flip = int(len(y) * noise_rate)
    flip_idx = rng.choice(len(y), n_flip, replace=False)
    y_noisy[flip_idx] = 1 - y_noisy[flip_idx]
    return y_noisy

# Train every optimizer for 40 epochs on training labels with 0%, 10% and 20%
# of them flipped. Validation and test labels stay clean.
noise_rates = [0.0, 0.10, 0.20]
noise_results = []

for noise_rate in noise_rates:
    print(f"\nTesting with {noise_rate*100:.0f}% label noise...")
    y_train_noisy = add_label_noise(y_train, noise_rate, seed=42)

    for name in OPTIMIZER_NAMES:
        hparams = BEST_HPARAMS[name]
        lr, mom = hparams['lr'], hparams['momentum']
        n_features = X_train.shape[1]

        cfg = TrainConfig(epochs=40, batch_size=32, seed=42)
        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(0.9 if mom is None else mom))

        hist_df = train_model_detailed(model, opt, X_train, y_train_noisy, X_val, y_val, cfg)
        test_results = model.evaluate(X_test, y_test)

        noise_results.append({
            'Optimizer': name,
            'Noise Rate': noise_rate,
            'Val F1': hist_df['val_f1'].iloc[-1],
            'Test F1': test_results['f1'],
            'Test Accuracy': test_results['accuracy'],
        })

        print(f"  {name:9s}: Val F1={hist_df['val_f1'].iloc[-1]:.4f}, Test F1={test_results['f1']:.4f}")

df_noise = pd.DataFrame(noise_results)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (y-axis label, title, plot the drop from the 0%-noise run instead of raw F1)
noise_panels = [
    ('Test F1 Score', 'Robustness to Label Noise', False),
    ('F1 Score Degradation', 'Performance Degradation vs Noise', True),
]

for ax, (y_label, title, as_degradation) in zip(axes, noise_panels):
    for i, name in enumerate(OPTIMIZER_NAMES):
        data = df_noise[df_noise['Optimizer'] == name]
        y_values = data['Test F1']
        if as_degradation:
            # Positive values mean the noisy run scored below the clean run.
            baseline_f1 = data.loc[data['Noise Rate'] == 0.0, 'Test F1'].iloc[0]
            degradation = (baseline_f1 - data['Test F1']).tolist()
            y_values = degradation
        ax.plot(data['Noise Rate']*100, y_values,
                marker='o', label=name, linewidth=2.5, markersize=8,
                color=colors[i], alpha=0.8)
    ax.set_xlabel('Label Noise Rate (%)', fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:

# Train every optimizer for 40 epochs at several mini-batch sizes, recording
# final validation F1, test F1 and wall-clock training time.
batch_sizes = [16, 32, 64, 128, 256]
batch_results = []

for bs in batch_sizes:
    print(f"\nTesting with batch size {bs}...")
    for name in OPTIMIZER_NAMES:
        hparams = BEST_HPARAMS[name]
        lr, mom = hparams['lr'], hparams['momentum']
        n_features = X_train.shape[1]

        cfg = TrainConfig(epochs=40, batch_size=bs, seed=42)
        model = LogisticRegression(n_features)
        opt = make_optimizer(name, lr=lr, momentum=(0.9 if mom is None else mom))

        start = pd.Timestamp.now()
        hist_df = train_model_detailed(model, opt, X_train, y_train, X_val, y_val, cfg)
        elapsed = (pd.Timestamp.now() - start).total_seconds()

        test_results = model.evaluate(X_test, y_test)

        batch_results.append({
            'Optimizer': name,
            'Batch Size': bs,
            'Val F1': hist_df['val_f1'].iloc[-1],
            'Test F1': test_results['f1'],
            'Time (sec)': elapsed,
        })

        print(f"  {name:9s}: Test F1={test_results['f1']:.4f}, Time={elapsed:.2f}s")

df_batch = pd.DataFrame(batch_results)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (result column, y-axis label, title) for the left and right panel.
batch_panels = [
    ('Test F1', 'Test F1 Score', 'Batch Size Sensitivity: F1 Score'),
    ('Time (sec)', 'Training Time (seconds)', 'Batch Size Sensitivity: Time'),
]

for ax, (column, y_label, title) in zip(axes, batch_panels):
    for i, name in enumerate(OPTIMIZER_NAMES):
        data = df_batch[df_batch['Optimizer'] == name]
        ax.plot(data['Batch Size'], data[column],
                marker='o', label=name, linewidth=2.5, markersize=8,
                color=colors[i], alpha=0.8)
    ax.set_xlabel('Batch Size', fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    # Batch sizes are powers of two, so a base-2 log axis spaces them evenly.
    ax.set_xscale('log', base=2)
    ax.set_xticks(batch_sizes)
    ax.set_xticklabels([str(bs) for bs in batch_sizes])
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

