In [5]:
# ===== BNCI-only benchmark (fixed safe meta loading) =====
# Requirements: numpy, torch, sklearn, mne, pandas, joblib
import os, math, random, numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import joblib, torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# reproducibility
def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
seed_everything(42)

# ---------- load BNCI preprocessed (safe meta extraction) ----------
BNCI_PREPRO = "preprocessed_BNCI.npz"
if not os.path.exists(BNCI_PREPRO):
    raise FileNotFoundError("preprocessed_BNCI.npz not found. Run BNCI extraction first.")
d = np.load(BNCI_PREPRO, allow_pickle=True)
X = d['X'].astype(np.float32)   # (N, n_ch, n_times)
y = d['y'].astype(int)

# safe meta load (handles 0-d numpy objects)
meta = {}
if 'meta' in d:
    meta_raw = d['meta']
    try:
        if np.ndim(meta_raw) == 0:
            # meta stored as object scalar (e.g. array({}, dtype=object))
            meta = meta_raw.item() if hasattr(meta_raw, 'item') else {}
            if meta is None: meta = {}
        else:
            meta = dict(meta_raw)
    except Exception:
        try:
            meta = dict(meta_raw)
        except Exception:
            meta = {}
else:
    meta = {}

print("Loaded BNCI:", X.shape, "labels:", dict(zip(*np.unique(y, return_counts=True))))
print("BNCI meta keys:", list(meta.keys()))

# ---------- small augmentation helper (on-the-fly) ----------
def random_augment_numpy(epoch):
    # epoch: n_ch x n_times
    e = epoch.copy()
    if np.random.rand() < 0.5:
        e = e + np.random.normal(0, 0.01, e.shape)   # noise
    if np.random.rand() < 0.4:
        shift = np.random.randint(-10, 11)
        e = np.roll(e, shift, axis=1)
    if np.random.rand() < 0.3:
        # channel dropout
        ch = e.shape[0]
        drop_mask = np.random.rand(ch) < 0.05
        e[drop_mask,:] = 0
    return e

# ---------- CSP + classical baseline (k-fold) ----------
use_csp = True
if use_csp:
    from mne.decoding import CSP
    # choose small number of components (<= n_ch)
    n_components = min(8, X.shape[1])
    csp = CSP(n_components=n_components, log=True, norm_trace=False)
    X_csp = csp.fit_transform(X, y)   # shape (N, n_components)
    print("CSP features:", X_csp.shape)
else:
    X_csp = None

classifiers = {
    'LDA': Pipeline([('sc', StandardScaler()), ('clf', LinearDiscriminantAnalysis())]),
    'SVM-rbf': Pipeline([('sc', StandardScaler()), ('clf', SVC(kernel='rbf', C=1, probability=True))]),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'MLP': Pipeline([('sc', StandardScaler()), ('clf', MLPClassifier(hidden_layer_sizes=(100,), max_iter=400))])
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results_cls = {}
if X_csp is not None:
    for name, clf in classifiers.items():
        accs=[]; f1s=[]
        for tr,te in kf.split(X_csp, y):
            clf.fit(X_csp[tr], y[tr])
            p = clf.predict(X_csp[te])
            accs.append(accuracy_score(y[te], p)); f1s.append(f1_score(y[te], p, average='weighted'))
        results_cls[name] = {'acc_mean': np.mean(accs), 'acc_std': np.std(accs), 'f1_mean': np.mean(f1s)}
        print(f"[CSP] {name}: acc {results_cls[name]['acc_mean']:.3f} ± {results_cls[name]['acc_std']:.3f}, f1 {results_cls[name]['f1_mean']:.3f}")

# save CSP artifact
if X_csp is not None:
    joblib.dump({'csp': csp, 'cls_results': results_cls}, 'bnci_classical_artifacts.pkl')

# ---------- EEGNet implementation (small, robust) ----------
class BNCI_Dataset(Dataset):
    def __init__(self, X, y, augment=False):
        self.X = X.astype(np.float32); self.y = y.astype(int)
        self.augment = augment
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        x = self.X[idx]
        if self.augment:
            x = random_augment_numpy(x)
        return torch.tensor(x, dtype=torch.float32), torch.tensor(int(self.y[idx]), dtype=torch.long)

class EEGNet(nn.Module):
    def __init__(self, chans, samples, classes=2, kern_len=64, F1=8, D=2, F2=16, dropout=0.5):
        super().__init__()
        self.first = nn.Sequential(
            nn.Conv2d(1, F1, (1, kern_len), padding=(0, kern_len//2), bias=False),
            nn.BatchNorm2d(F1),
            nn.Conv2d(F1, F1*D, (chans, 1), bias=False),
            nn.BatchNorm2d(F1*D),
            nn.ELU(), nn.AvgPool2d((1,4)), nn.Dropout(dropout)
        )
        self.second = nn.Sequential(
            nn.Conv2d(F1*D, F2, (1, 16), bias=False),
            nn.BatchNorm2d(F2), nn.ELU(), nn.AvgPool2d((1,8)), nn.Flatten()
        )
        with torch.no_grad():
            dummy = torch.zeros(1,1,chans,samples)
            feat = self.first(dummy); feat = self.second(feat)
            hid_dim = feat.shape[1]
        self.classify = nn.Linear(hid_dim, classes)
    def forward(self,x):
        x = x.unsqueeze(1); x = self.first(x); x = self.second(x); return self.classify(x)

# ---------- k-fold EEGNet training ----------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []
fold_idx = 0
for tr_idx, te_idx in kf.split(X, y):
    fold_idx += 1
    print(f"\n=== Fold {fold_idx}/{n_splits} ===")
    Xtr, Xte = X[tr_idx], X[te_idx]
    ytr, yte = y[tr_idx], y[te_idx]
    ds_tr = BNCI_Dataset(Xtr, ytr, augment=True)
    ds_te = BNCI_Dataset(Xte, yte, augment=False)
    loader_tr = DataLoader(ds_tr, batch_size=32, shuffle=True)
    loader_te = DataLoader(ds_te, batch_size=64, shuffle=False)
    chans, samples = X.shape[1], X.shape[2]
    num_classes = int(len(np.unique(y)))
    model = EEGNet(chans, samples, classes=num_classes).to(device)
    opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    loss_fn = nn.CrossEntropyLoss()
    best_acc = 0; best_state = None
    for ep in range(1, 31):  # 30 epochs per fold
        model.train(); losses=[]
        for xb,yb in loader_tr:
            xb,yb = xb.to(device), yb.to(device)
            opt.zero_grad(); logits = model(xb); loss = loss_fn(logits,yb); loss.backward(); opt.step()
            losses.append(loss.item())
        # eval
        model.eval()
        ys=[]; preds=[]
        with torch.no_grad():
            for xb,yb in loader_te:
                xb = xb.to(device)
                logits = model(xb)
                preds.extend(logits.argmax(dim=1).cpu().numpy()); ys.extend(yb.numpy())
        acc = accuracy_score(ys, preds); f1v = f1_score(ys, preds, average='weighted')
        if acc > best_acc:
            best_acc = acc; best_state = model.state_dict()
        if ep==1 or ep%5==0:
            print(f" ep{ep}: tr_loss={np.mean(losses):.4f}, val_acc={acc:.4f}, f1={f1v:.4f}")
    # save best for fold
    if best_state is not None:
        torch.save(best_state, f"eegnet_fold{fold_idx}_best.pth")
    print(f"Fold {fold_idx} best acc: {best_acc:.4f}")
    fold_results.append({'fold':fold_idx, 'best_acc':best_acc})
# summary
print("\n=== EEGNet k-fold results ===")
print(pd.DataFrame(fold_results).set_index('fold'))

# ---------- final summary table (combine CSP/classical + EEGNet mean) ----------
rows=[]
if X_csp is not None:
    for k,v in results_cls.items():
        rows.append({'model': k+' (CSP)', 'acc_mean': v['acc_mean'], 'acc_std': v['acc_std'], 'f1_mean': v['f1_mean']})
# EEGNet fold mean
eeg_mean = np.mean([r['best_acc'] for r in fold_results])
rows.append({'model': 'EEGNet (k-fold)', 'acc_mean': float(eeg_mean), 'acc_std': float(np.std([r['best_acc'] for r in fold_results])), 'f1_mean': None})
df = pd.DataFrame(rows).sort_values('acc_mean', ascending=False).reset_index(drop=True)
print("\n=== Final comparison ===\n", df)
df.to_csv('bnci_benchmark_summary.csv', index=False)
print("Saved bnci_benchmark_summary.csv and fold models eegnet_fold*_best.pth")


Loaded BNCI: (640, 25, 561) labels: {np.int64(0): np.int64(160), np.int64(1): np.int64(160), np.int64(2): np.int64(160), np.int64(3): np.int64(160)}
BNCI meta keys: ['sfreq']
Computing rank from data with rank=None
    Using tolerance 1.9e+02 (2.2e-16 eps * 25 dim * 3.4e+16  max singular value)
    Estimated rank (data): 25
    data: rank 25 computed from 25 data channels with 0 projectors
Reducing data rank from 25 -> 25
Estimating class=0 covariance using EMPIRICAL
Done.
Estimating class=1 covariance using EMPIRICAL
Done.
Estimating class=2 covariance using EMPIRICAL
Done.
Estimating class=3 covariance using EMPIRICAL
Done.
CSP features: (640, 8)
[CSP] LDA: acc 0.261 ± 0.030, f1 0.255
[CSP] SVM-rbf: acc 0.211 ± 0.020, f1 0.196
[CSP] RandomForest: acc 0.250 ± 0.034, f1 0.247




[CSP] MLP: acc 0.223 ± 0.046, f1 0.222

=== Fold 1/5 ===
 ep1: tr_loss=1.4002, val_acc=0.2422, f1=0.2082
 ep5: tr_loss=1.3142, val_acc=0.3359, f1=0.3338
 ep10: tr_loss=1.2207, val_acc=0.3516, f1=0.3438
 ep15: tr_loss=1.1609, val_acc=0.3750, f1=0.3654
 ep20: tr_loss=1.1095, val_acc=0.3984, f1=0.3900
 ep25: tr_loss=0.9879, val_acc=0.4219, f1=0.4216
 ep30: tr_loss=0.9052, val_acc=0.4531, f1=0.4545
Fold 1 best acc: 0.4922

=== Fold 2/5 ===
 ep1: tr_loss=1.3952, val_acc=0.3750, f1=0.3542
 ep5: tr_loss=1.2887, val_acc=0.3672, f1=0.3638
 ep10: tr_loss=1.2177, val_acc=0.3281, f1=0.3130
 ep15: tr_loss=1.1488, val_acc=0.4219, f1=0.4063
 ep20: tr_loss=1.0770, val_acc=0.4297, f1=0.4295
 ep25: tr_loss=1.0209, val_acc=0.4922, f1=0.4905
 ep30: tr_loss=0.9478, val_acc=0.4766, f1=0.4766
Fold 2 best acc: 0.5078

=== Fold 3/5 ===
 ep1: tr_loss=1.3818, val_acc=0.2422, f1=0.1990
 ep5: tr_loss=1.2818, val_acc=0.2656, f1=0.2624
 ep10: tr_loss=1.2132, val_acc=0.2578, f1=0.2431
 ep15: tr_loss=1.1451, val_acc=0

In [7]:
# ===== Stage 2: EEGNet / ShallowConvNet / DeepConvNet benchmark (k-fold) =====
# Paste into BNCI_benchmark.ipynb. Requires: numpy, torch, sklearn, pandas, joblib
import os, random, math, numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# reproducibility
def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
seed_everything(42)

# ---------- Load BNCI preprocessed (safe meta load) ----------
PRE = "preprocessed_BNCI.npz"
assert os.path.exists(PRE), "preprocessed_BNCI.npz not found. Run extraction first."
d = np.load(PRE, allow_pickle=True)
X = d['X'].astype(np.float32)   # shape: (N, n_ch, n_times)
y = d['y'].astype(int)
meta = {}
if 'meta' in d:
    mr = d['meta']
    try:
        meta = mr.item() if np.ndim(mr)==0 else dict(mr)
    except Exception:
        try:
            meta = dict(mr)
        except Exception:
            meta = {}
print("Loaded:", X.shape, "labels:", dict(zip(*np.unique(y, return_counts=True))))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

# ---------- Augmentation ----------
def augment_numpy(epoch):
    # epoch: n_ch x n_times
    e = epoch.copy()
    # gaussian noise
    if np.random.rand() < 0.5:
        e = e + np.random.normal(0, 0.01, e.shape)
    # time shift
    if np.random.rand() < 0.4:
        shift = np.random.randint(-10, 11)
        e = np.roll(e, shift, axis=1)
    # channel dropout
    if np.random.rand() < 0.3:
        ch = e.shape[0]
        drop = np.random.rand(ch) < 0.05
        e[drop,:] = 0
    return e

class BNCI_Dataset(Dataset):
    def __init__(self, X, y, augment=False):
        self.X = X.astype(np.float32); self.y = y.astype(int); self.augment = augment
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        x = self.X[idx]
        if self.augment:
            x = augment_numpy(x)
        return torch.tensor(x, dtype=torch.float32), torch.tensor(int(self.y[idx]), dtype=torch.long)

# ---------- Model definitions ----------
# EEGNet (small/robust)
class EEGNet(nn.Module):
    def __init__(self, chans, samples, classes=2, F1=8, D=2, F2=16, kern_len=64, dropout=0.5):
        super().__init__()
        self.first = nn.Sequential(
            nn.Conv2d(1, F1, (1, kern_len), padding=(0, kern_len//2), bias=False),
            nn.BatchNorm2d(F1),
            nn.Conv2d(F1, F1*D, (chans, 1), bias=False),
            nn.BatchNorm2d(F1*D),
            nn.ELU(), nn.AvgPool2d((1,4)), nn.Dropout(dropout)
        )
        self.second = nn.Sequential(
            nn.Conv2d(F1*D, F2, (1, 16), bias=False),
            nn.BatchNorm2d(F2), nn.ELU(), nn.AvgPool2d((1,8)), nn.Flatten()
        )
        with torch.no_grad():
            dummy = torch.zeros(1,1,chans,samples)
            h = self.first(dummy); h = self.second(h)
            hid = h.shape[1]
        self.classify = nn.Linear(hid, classes)
    def forward(self,x):
        x = x.unsqueeze(1); x = self.first(x); x = self.second(x); return self.classify(x)

# ShallowConvNet (Bandpower-like)
class ShallowConvNet(nn.Module):
    def __init__(self, chans, samples, classes=2, F=40, kern_len=25, dropout=0.5):
        super().__init__()
        self.temporal = nn.Conv2d(1, F, (1, kern_len), padding=(0, kern_len//2), bias=False)
        self.spatial = nn.Conv2d(F, F, (chans, 1), bias=False)
        self.pool = nn.Sequential(nn.ELU(), nn.AvgPool2d((1, 75)), nn.Dropout(dropout))
        # dynamic flatten
        with torch.no_grad():
            dummy = torch.zeros(1,1,chans,samples)
            out = self.pool(self.spatial(self.temporal(dummy)))
            hid = int(torch.prod(torch.tensor(out.shape[1:])))
        self.classify = nn.Linear(hid, classes)
    def forward(self,x):
        x = x.unsqueeze(1)
        x = self.temporal(x)
        x = self.spatial(x)
        x = self.pool(x)
        x = x.flatten(1)
        return self.classify(x)

# DeepConvNet (as in Schirrmeister et al.)
class DeepConvNet(nn.Module):
    def __init__(self, chans, samples, classes=2, dropout=0.5):
        super().__init__()
        self.block1 = nn.Sequential(
            nn.Conv2d(1, 25, (1,5), padding=(0,2), bias=False),
            nn.Conv2d(25, 25, (chans,1), bias=False),
            nn.BatchNorm2d(25),
            nn.ELU(),
            nn.MaxPool2d((1,2)),
            nn.Dropout(dropout)
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(25, 50, (1,5), padding=(0,2), bias=False),
            nn.BatchNorm2d(50), nn.ELU(), nn.MaxPool2d((1,2)), nn.Dropout(dropout)
        )
        self.block3 = nn.Sequential(
            nn.Conv2d(50, 100, (1,5), padding=(0,2), bias=False),
            nn.BatchNorm2d(100), nn.ELU(), nn.MaxPool2d((1,2)), nn.Dropout(dropout)
        )
        self.block4 = nn.Sequential(
            nn.Conv2d(100, 200, (1,5), padding=(0,2), bias=False),
            nn.BatchNorm2d(200), nn.ELU(), nn.MaxPool2d((1,2)), nn.Dropout(dropout)
        )
        with torch.no_grad():
            dummy = torch.zeros(1,1,chans,samples)
            out = self.block1(dummy); out = self.block2(out); out = self.block3(out); out = self.block4(out)
            hid = int(torch.prod(torch.tensor(out.shape[1:])))
        self.classify = nn.Linear(hid, classes)
    def forward(self,x):
        x = x.unsqueeze(1)
        x = self.block1(x); x = self.block2(x); x = self.block3(x); x = self.block4(x)
        x = x.flatten(1)
        return self.classify(x)

# ---------- Training utilities ----------
def train_epoch(model, loader, opt, loss_fn, device):
    model.train(); losses=[]
    for xb,yb in loader:
        xb = xb.to(device, dtype=torch.float32); yb = yb.to(device, dtype=torch.long)
        opt.zero_grad(); logits = model(xb); loss = loss_fn(logits, yb)
        loss.backward(); opt.step()
        losses.append(loss.item())
    return float(np.mean(losses)) if len(losses)>0 else 0.0

def eval_model(model, loader, device):
    model.eval(); ys=[]; preds=[]
    with torch.no_grad():
        for xb,yb in loader:
            xb = xb.to(device, dtype=torch.float32)
            logits = model(xb)
            preds.extend(logits.argmax(dim=1).cpu().numpy()); ys.extend(yb.numpy())
    return np.array(ys), np.array(preds)

# ---------- K-Fold training for each model ----------
models_to_run = ['EEGNet', 'ShallowConvNet', 'DeepConvNet']
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
summary_rows = []

for model_name in models_to_run:
    print("\n===== Training:", model_name, "=====")
    fold = 0
    fold_best_accs = []
    for tr_idx, te_idx in kf.split(X, y):
        fold += 1
        print(f"\n--- {model_name} fold {fold}/{n_splits} ---")
        Xtr, Xte = X[tr_idx], X[te_idx]; ytr, yte = y[tr_idx], y[te_idx]
        ds_tr = BNCI_Dataset(Xtr, ytr, augment=True)
        ds_te = BNCI_Dataset(Xte, yte, augment=False)
        loader_tr = DataLoader(ds_tr, batch_size=32, shuffle=True)
        loader_te = DataLoader(ds_te, batch_size=64, shuffle=False)

        chans, samples = X.shape[1], X.shape[2]
        num_classes = int(len(np.unique(y)))
        # instantiate model
        if model_name == 'EEGNet':
            model = EEGNet(chans, samples, classes=num_classes).to(device)
        elif model_name == 'ShallowConvNet':
            model = ShallowConvNet(chans, samples, classes=num_classes).to(device)
        else:
            model = DeepConvNet(chans, samples, classes=num_classes).to(device)

        opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
        loss_fn = nn.CrossEntropyLoss()
        # FIXED scheduler: removed verbose argument for compatibility
        scheduler = ReduceLROnPlateau(opt, mode='max', factor=0.5, patience=3)

        best_acc = 0.0; best_state = None; stale = 0; patience = 8
        for ep in range(1, 51):   # up to 50 epochs
            tr_loss = train_epoch(model, loader_tr, opt, loss_fn, device)
            ys_val, preds_val = eval_model(model, loader_te, device)
            val_acc = accuracy_score(ys_val, preds_val); val_f1 = f1_score(ys_val, preds_val, average='weighted')
            scheduler.step(val_acc)
            if val_acc > best_acc + 1e-4:
                best_acc = val_acc; best_state = model.state_dict(); stale = 0
            else:
                stale += 1
            if ep==1 or ep%5==0 or stale==0:
                print(f" ep{ep}: tr_loss={tr_loss:.4f}, val_acc={val_acc:.4f}, val_f1={val_f1:.4f} (best {best_acc:.4f})")
            if stale >= patience:
                print(" Early stopping (no val improvement).")
                break

        if best_state is not None:
            fname = f"{model_name}_fold{fold}_best.pth"
            torch.save(best_state, fname)
            print(" Saved best model:", fname)
        fold_best_accs.append(best_acc)
        summary_rows.append({'model': model_name, 'fold': fold, 'best_acc': float(best_acc), 'best_f1': float(val_f1)})

    mean_acc = float(np.mean(fold_best_accs)); std_acc = float(np.std(fold_best_accs))
    print(f"\n>>> {model_name} summary: mean_acc={mean_acc:.4f} ± {std_acc:.4f}")

# ---------- Final comparison ----------
df = pd.DataFrame(summary_rows)
summary_table = df.groupby('model').agg({'best_acc':['mean','std'], 'best_f1':'mean'})
summary_table.columns = ['acc_mean','acc_std','f1_mean']
summary_table = summary_table.reset_index().sort_values('acc_mean', ascending=False)
print("\n=== Final Stage-2 comparison ===\n", summary_table)
summary_table.to_csv('stage2_benchmark_summary.csv', index=False)
print("Saved stage2_benchmark_summary.csv and fold models (*.pth)")


Loaded: (640, 25, 561) labels: {np.int64(0): np.int64(160), np.int64(1): np.int64(160), np.int64(2): np.int64(160), np.int64(3): np.int64(160)}
Device: cuda

===== Training: EEGNet =====

--- EEGNet fold 1/5 ---
 ep1: tr_loss=1.3971, val_acc=0.2891, val_f1=0.2722 (best 0.2891)
 ep2: tr_loss=1.3611, val_acc=0.3047, val_f1=0.2945 (best 0.3047)
 ep3: tr_loss=1.3407, val_acc=0.3281, val_f1=0.3117 (best 0.3281)
 ep5: tr_loss=1.3082, val_acc=0.3594, val_f1=0.3596 (best 0.3594)
 ep8: tr_loss=1.2570, val_acc=0.3672, val_f1=0.3674 (best 0.3672)
 ep10: tr_loss=1.2251, val_acc=0.3281, val_f1=0.3213 (best 0.3672)
 ep12: tr_loss=1.2149, val_acc=0.4141, val_f1=0.4073 (best 0.4141)
 ep13: tr_loss=1.1965, val_acc=0.4297, val_f1=0.4127 (best 0.4297)
 ep14: tr_loss=1.1822, val_acc=0.4531, val_f1=0.4443 (best 0.4531)
 ep15: tr_loss=1.1728, val_acc=0.4141, val_f1=0.3932 (best 0.4531)
 ep18: tr_loss=1.1036, val_acc=0.4766, val_f1=0.4606 (best 0.4766)
 ep20: tr_loss=1.0845, val_acc=0.3984, val_f1=0.3855 (be