In [1]:
import os
import warnings
import math
import json
import random
from copy import deepcopy
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
import polars as pl
from tqdm.auto import tqdm

# Scikit-learn
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold, PredefinedSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score

# SciPy
from scipy.spatial.transform import Rotation as R

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# =============================================================================
# 1) CONFIG & SEED
# =============================================================================
class Config:
    RAW_DIR = Path("/kaggle/input/cmi-detect-behavior-with-sensor-data")
    IDS_DEL_PATH = Path("/kaggle/input/processdata/ids2del_prop_0.2.csv")
    EXPORT_DIR = Path("/kaggle/working/")
    BATCH_SIZE = 64
    PAD_PERCENTILE = 94
    LR_INIT = 3e-4
    WD = 3e-4
    MIXUP_ALPHA = 0.4
    EPOCHS = 220
    PATIENCE = 30
    T_0 = 25
    EMA_DECAY = 0.99
    ACC_COLS = ["acc_x", "acc_y", "acc_z"]
    ROT_COLS = ["rot_w", "rot_x", "rot_y", "rot_z"]
    SEED = 42

os.makedirs(Config.EXPORT_DIR, exist_ok=True)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(Config.SEED)

# =============================================================================
# 2) LOSS / AUGS / UTILITIES
# =============================================================================
class FocalLoss(nn.Module):
    """
    Focal loss cho soft labels (mixup). Tính trên log_softmax.
    """
    def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # inputs: logits (N, C); targets: soft labels (N, C)
        logp = F.log_softmax(inputs, dim=1)
        p = logp.exp()
        ce = -(targets * logp)                   # per-class CE
        focal = self.alpha * (1 - p) ** self.gamma * ce
        loss = focal.sum(dim=1)                  # sum over classes
        return loss.mean() if self.reduction == "mean" else loss.sum()

def handle_quaternion_missing_values(rot_data: np.ndarray) -> np.ndarray:
    rot_cleaned = rot_data.copy()
    for i in range(len(rot_data)):
        row = rot_data[i]
        missing_count = np.isnan(row).sum()
        if missing_count == 0:
            norm = np.linalg.norm(row)
            rot_cleaned[i] = row / norm if norm > 1e-8 else [1.0, 0.0, 0.0, 0.0]
        elif missing_count == 1:
            missing_idx = np.where(np.isnan(row))[0][0]
            valid_values = row[~np.isnan(row)]
            sum_squares = np.sum(valid_values**2)
            if sum_squares <= 1.0:
                missing_value = np.sqrt(max(0, 1.0 - sum_squares))
                if i > 0 and not np.isnan(rot_cleaned[i-1, missing_idx]) and rot_cleaned[i-1, missing_idx] < 0:
                    missing_value = -missing_value
                rot_cleaned[i, missing_idx] = missing_value
            else:
                rot_cleaned[i] = [1.0, 0.0, 0.0, 0.0]
        else:
            rot_cleaned[i] = [1.0, 0.0, 0.0, 0.0]
    return rot_cleaned

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200):
    quat_values = rot_data[:, [1, 2, 3, 0]] if rot_data.shape[1] == 4 else rot_data
    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))
    for i in range(num_samples - 1):
        q_t, q_t_plus_dt = quat_values[i], quat_values[i+1]
        if np.any(np.isnan(q_t)) or np.any(np.isnan(q_t_plus_dt)):
            continue
        try:
            rot_t, rot_t_plus_dt = R.from_quat(q_t), R.from_quat(q_t_plus_dt)
            delta_rot = rot_t.inv() * rot_t_plus_dt
            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            pass
    return angular_vel

def calculate_angular_distance(rot_data):
    quat_values = rot_data[:, [1, 2, 3, 0]] if rot_data.shape[1] == 4 else rot_data
    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)
    for i in range(num_samples - 1):
        q1, q2 = quat_values[i], quat_values[i+1]
        if np.any(np.isnan(q1)) or np.any(np.isnan(q2)):
            continue
        try:
            r1, r2 = R.from_quat(q1), R.from_quat(q2)
            angle = np.linalg.norm((r1.inv() * r2).as_rotvec())
            angular_dist[i] = angle
        except ValueError:
            pass
    return angular_dist

def remove_gravity_from_acc(acc_data, rot_data):
    acc_values = acc_data.values if isinstance(acc_data, pd.DataFrame) else acc_data
    quat_values = rot_data.values if isinstance(rot_data, pd.DataFrame) else rot_data
    quat_scipy = quat_values[:, [1, 2, 3, 0]]
    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    gravity_world = np.array([0, 0, 9.81])
    for i in range(num_samples):
        if np.any(np.isnan(quat_scipy[i])):
            linear_accel[i, :] = acc_values[i, :]
            continue
        try:
            rotation = R.from_quat(quat_scipy[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
            linear_accel[i, :] = acc_values[i, :]
    return linear_accel

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df["acc_mag"] = np.linalg.norm(df[Config.ACC_COLS].values, axis=1)
    df["acc_mag_jerk"] = df.groupby("sequence_id")["acc_mag"].diff().fillna(0)
    df["jerk_x"], df["jerk_y"], df["jerk_z"] = np.gradient(df["acc_x"]), np.gradient(df["acc_y"]), np.gradient(df["acc_z"])
    df["jerk_magnitude"] = np.linalg.norm(df[["jerk_x", "jerk_y", "jerk_z"]].values, axis=1)

    window = 20
    for _, g in df.groupby("sequence_id"):
        df.loc[g.index, "acc_xy_corr"] = g["acc_x"].rolling(window, min_periods=1).corr(g["acc_y"]).fillna(0)
        df.loc[g.index, "acc_xz_corr"] = g["acc_x"].rolling(window, min_periods=1).corr(g["acc_z"]).fillna(0)
        df.loc[g.index, "acc_yz_corr"] = g["acc_y"].rolling(window, min_periods=1).corr(g["acc_z"]).fillna(0)

    df["rot_angle"] = 2 * np.arccos(df["rot_w"].clip(-1, 1))
    df["rot_angle_vel"] = df.groupby("sequence_id")["rot_angle"].diff().fillna(0)

    rot_numpy = df[Config.ROT_COLS].to_numpy()
    angular_vel = calculate_angular_velocity_from_quat(rot_numpy)
    angular_dist = calculate_angular_distance(rot_numpy)
    df[["angular_vel_x", "angular_vel_y", "angular_vel_z"]] = angular_vel
    df["angular_distance"] = angular_dist
    df["angular_vel_magnitude"] = np.linalg.norm(angular_vel, axis=1)

    linear_accel = remove_gravity_from_acc(df[Config.ACC_COLS], df[Config.ROT_COLS])
    df[["acc_x2", "acc_y2", "acc_z2"]] = linear_accel
    df["acc_mag2"] = np.linalg.norm(linear_accel, axis=1)
    df["acc_mag_jerk2"] = df.groupby("sequence_id")["acc_mag2"].diff().fillna(0)
    df["jerk_x2"], df["jerk_y2"], df["jerk_z2"] = np.gradient(df["acc_x2"]), np.gradient(df["acc_y2"]), np.gradient(df["acc_z2"])
    df["jerk_magnitude2"] = np.linalg.norm(df[["jerk_x2", "jerk_y2", "jerk_z2"]].values, axis=1)

    for _, g in df.groupby("sequence_id"):
        df.loc[g.index, "acc_xy_corr2"] = g["acc_x2"].rolling(window, min_periods=1).corr(g["acc_y2"]).fillna(0)
        df.loc[g.index, "acc_xz_corr2"] = g["acc_x2"].rolling(window, min_periods=1).corr(g["acc_z2"]).fillna(0)
        df.loc[g.index, "acc_yz_corr2"] = g["acc_y2"].rolling(window, min_periods=1).corr(g["acc_z2"]).fillna(0)

    df.replace([np.inf, -np.inf], 0, inplace=True)
    df.fillna(0, inplace=True)
    return df

def pad_sequences(sequences: list, maxlen: int, padding: str="pre", truncating: str="pre", dtype: str="float32") -> np.ndarray:
    n_samples = len(sequences)
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break
    x = np.zeros((n_samples, maxlen) + sample_shape, dtype=dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue
        s_np = np.asarray(s, dtype=dtype)
        if truncating == "pre":
            s_np = s_np[-maxlen:]
        else:
            s_np = s_np[:maxlen]
        trunc = s_np.shape[0]
        if padding == "pre":
            x[idx, -trunc:] = s_np
        else:
            x[idx, :trunc] = s_np
    return x

def to_categorical(y, num_classes=None):
    y = np.array(y, dtype="int")
    if not num_classes:
        num_classes = np.max(y) + 1
    return np.eye(num_classes)[y]

def preprocess_left_handed_pd(df_pd: pd.DataFrame, demo_pd: pd.DataFrame) -> pd.DataFrame:
    pl_df = pl.DataFrame(df_pd)
    pl_demo = pl.DataFrame(demo_pd)
    l_subjs = pl_demo.filter(pl.col("handedness") == 0)["subject"].to_list()
    r_tr = pl_df.filter(~pl.col("subject").is_in(l_subjs))
    l_tr = pl_df.filter(pl.col("subject").is_in(l_subjs))
    if l_tr.shape[0] == 0:
        return r_tr.to_pandas()

    l_tr = l_tr.with_columns(-pl.col("acc_y"))
    l_tr = l_tr.with_columns(-pl.col("acc_z"))
    rot_np = l_tr.select(Config.ROT_COLS).to_numpy()
    rot_scipy = rot_np[:, [1, 2, 3, 0]]
    r = R.from_quat(rot_scipy)
    tmp = r.as_euler("xyz")
    tmp[:, 1] = -tmp[:, 1]
    tmp[:, 2] = -tmp[:, 2]
    r = R.from_euler("xyz", tmp)
    tmp = r.as_quat()
    l_tr = l_tr.with_columns(pl.DataFrame(tmp, schema=["rot_x", "rot_y", "rot_z", "rot_w"]))
    pl_df2 = pl.concat([r_tr, l_tr]).sort(by="row_id")
    return pl_df2.to_pandas()

# =============================================================================
# 3) AUGMENTATIONS
# =============================================================================
class SignalTransform:
    def __init__(self, p=0.5): self.p = p
    def __call__(self, y: np.ndarray):
        if np.random.rand() < self.p: return self.apply(y)
        return y
    def apply(self, y: np.ndarray): raise NotImplementedError

class TimeStretch(SignalTransform):
    def __init__(self, max_rate=1.5, min_rate=0.5, p=0.5):
        super().__init__(p); self.max_rate, self.min_rate = max_rate, min_rate
    def apply(self, x: np.ndarray):
        rate = np.random.uniform(self.min_rate, self.max_rate)
        L = x.shape[0]; L_new = int(L / rate)
        orig_idx = np.linspace(0, L - 1, L)
        new_idx = np.linspace(0, L - 1, L_new)
        stretched = np.stack([np.interp(new_idx, orig_idx, x[:, i]) for i in range(x.shape[1])], axis=1)
        if L_new < L:
            padded = np.zeros((L, x.shape[1]), dtype=stretched.dtype)
            padded[-L_new:, :] = stretched
            return padded
        return stretched[-L:, :]

class TimeShift(SignalTransform):
    def __init__(self, p=0.5, max_shift_pct=0.25):
        super().__init__(p); self.max_shift_pct = max_shift_pct
    def apply(self, x: np.ndarray):
        L = x.shape[0]
        shift = np.random.randint(-int(L * self.max_shift_pct), int(L * self.max_shift_pct) + 1)
        augmented = np.roll(x, shift, axis=0)
        if shift > 0: augmented[:shift, :] = 0
        elif shift < 0: augmented[shift:, :] = 0
        return augmented

class SignalPermutation(SignalTransform):
    def __init__(self, p=0.5):
        super().__init__(p)
    def apply(self, x: np.ndarray):
        augmented = x.copy()
        # acc: cột 0..2
        perm_acc = np.random.permutation(3)
        augmented[:, 0:3] = augmented[:, perm_acc]
        # gyro: ví dụ ở cột 15..17
        if x.shape[1] > 17:
            perm_gyro = np.random.permutation(3)
            original = augmented[:, 15:18].copy()
            augmented[:, 15:18] = original[:, perm_gyro]
        return augmented

class Compose:
    def __init__(self, transforms: list): self.transforms = transforms
    def __call__(self, y: np.ndarray):
        for trns in self.transforms: y = trns(y)
        return y


# --- Pyramid Temporal Feature (PTF) ---
class PyramidTemporalFeature(nn.Module):
    """
    Module trích xuất đặc trưng theo nhiều tần suất thời gian khác nhau.
    Tương tự FPN nhưng dùng nhiều Conv1D kernel size khác nhau để nắm bắt pattern ngắn & dài.
    """
    def __init__(self, in_channels, out_channels, kernel_sizes=[3, 5, 7]):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=k, padding=k//2),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(inplace=True)
            )
            for k in kernel_sizes
        ])
        # Gộp lại các scale
        self.fuse = nn.Conv1d(out_channels * len(kernel_sizes), out_channels, kernel_size=1)

    def forward(self, x):
        # x: [B, C, T]
        features = [branch(x) for branch in self.branches]  # nhiều scale thời gian
        concat = torch.cat(features, dim=1)  # ghép theo kênh
        out = self.fuse(concat)              # hợp nhất lại
        return out                           # [B, out_channels, T]
# =============================================================================
# 4) DATASET
# =============================================================================
class MixupDataset(Dataset):
    def __init__(self, X, y, transforms, alpha=0.2, mode="train"):
        self.X, self.y = X, y
        self.transforms = transforms
        self.alpha, self.mode = alpha, mode
    def __len__(self): return len(self.X)
    def __getitem__(self, idx):
        X, y = self.X[idx].copy(), self.y[idx].copy()
        if self.mode == "train":
            if self.alpha > 0:
                lam = np.random.beta(self.alpha, self.alpha)
                idx2 = np.random.randint(len(self.X))
                X = lam * X + (1 - lam) * self.X[idx2]
                y = lam * y + (1 - lam) * self.y[idx2]
            X = self.transforms(X)
        return torch.FloatTensor(X), torch.FloatTensor(y)

# =============================================================================
# 5) MODEL
# =============================================================================
class SEBlock(nn.Module):
    def __init__(self, channel, reduction=8):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel), nn.Sigmoid())
    def forward(self, x):
        b, c, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)

class ResidualSEBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop=0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding="same", bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding="same", bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.se = SEBlock(out_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop)
        self.pool = nn.MaxPool1d(2)
        self.shortcut = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm1d(out_channels)
        ) if in_channels != out_channels else nn.Identity()
    def forward(self, x):
        identity = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))
        out = self.dropout(self.pool(self.relu(out + identity)))
        return out

class AttentionLayer(nn.Module):
    def __init__(self, hidden_dim): super().__init__(); self.attn = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Tanh())
    def forward(self, x):
        weights = F.softmax(self.attn(x).squeeze(-1), dim=1).unsqueeze(-1)
        return torch.sum(x * weights, dim=1)

class CrossAttention(nn.Module):
    def __init__(self, feature_dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.feature_dim = feature_dim
        self.num_heads = num_heads
        self.head_dim = feature_dim // num_heads
        assert feature_dim % num_heads == 0, "feature_dim must be divisible by num_heads"
        self.q_linear = nn.Linear(feature_dim, feature_dim, bias=False)
        self.k_linear = nn.Linear(feature_dim, feature_dim, bias=False)
        self.v_linear = nn.Linear(feature_dim, feature_dim, bias=False)
        self.out_linear = nn.Linear(feature_dim, feature_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(feature_dim)
    def forward(self, query_branch, key_value_branches):
        B, T, C = query_branch.shape
        Q = self.q_linear(query_branch)
        all_kv = torch.cat(key_value_branches, dim=1)
        K = self.k_linear(all_kv)
        V = self.v_linear(all_kv)
        Q = Q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        output = self.out_linear(attn_output)
        output = self.layer_norm(query_branch + output)
        return output

class IMUCrossAttentionFusion(nn.Module):
    def __init__(self, feature_dim=256, num_heads=8, dropout=0.1):
        super().__init__()
        self.cross_attn1 = CrossAttention(feature_dim, num_heads, dropout)
        self.cross_attn2 = CrossAttention(feature_dim, num_heads, dropout)
        self.cross_attn3 = CrossAttention(feature_dim, num_heads, dropout)
    def forward(self, imu1, imu2, imu3):
        e1 = self.cross_attn1(imu1, [imu2, imu3])
        e2 = self.cross_attn2(imu2, [imu1, imu3])
        e3 = self.cross_attn3(imu3, [imu1, imu2])
        return e1, e2, e3

class IMUCrossAttentionModel_PTF(nn.Module):
    def __init__(self, imu_dim, n_classes):
        super().__init__()
        self.imu_dim = imu_dim

        # --- 3 nhánh CNN backbone cho dữ liệu IMU ---
        self.imu_branch1 = nn.Sequential(
            ResidualSEBlock(12, 128, 3),
            ResidualSEBlock(128, 256, 5)
        )
        self.imu_branch2 = nn.Sequential(
            ResidualSEBlock(11, 128, 3),
            ResidualSEBlock(128, 256, 5)
        )
        self.imu_branch3 = nn.Sequential(
            ResidualSEBlock(12, 128, 3),
            ResidualSEBlock(128, 256, 5)
        )

        # --- Pyramid Temporal Feature cho mỗi nhánh ---
        self.ptf1 = PyramidTemporalFeature(256, 128)
        self.ptf2 = PyramidTemporalFeature(256, 128)
        self.ptf3 = PyramidTemporalFeature(256, 128)

        # --- Cross attention fusion ---
        self.cross_attention_fusion = IMUCrossAttentionFusion(feature_dim=128)

        # --- BiLSTM + attention + FC ---
        self.bilstm = nn.LSTM(128 * 3, 512, bidirectional=True, batch_first=True)
        self.attention = AttentionLayer(1024)
        self.fc = nn.Linear(1024, n_classes)
    
    def forward(self, x):
        imu = x[:, :, :self.imu_dim]

        # tách dữ liệu IMU cho 3 nhánh (tuỳ theo cấu trúc cảm biến)
        imu1 = self.imu_branch1(imu[:, :, :12].transpose(1, 2))     # [B, C, T]
        imu2 = self.imu_branch2(imu[:, :, 12:23].transpose(1, 2))
        imu3 = self.imu_branch3(imu[:, :, 23:].transpose(1, 2))
        
        # --- Trích đặc trưng đa tần số thời gian ---
        imu1_ptf = self.ptf1(imu1).transpose(1, 2)
        imu2_ptf = self.ptf2(imu2).transpose(1, 2)
        imu3_ptf = self.ptf3(imu3).transpose(1, 2)

        # --- Cross attention fusion giữa 3 nhánh ---
        imu1, imu2, imu3 = self.cross_attention_fusion(imu1_ptf, imu2_ptf, imu3_ptf)
    
        # --- Ghép & đưa qua BiLSTM ---
        merged = torch.cat((imu1, imu2, imu3), dim=2)
        lstm_out, _ = self.bilstm(merged)

        # --- Attention + phân lớp ---
        attended = self.attention(lstm_out)
        return self.fc(attended)

class ModelEMA(nn.Module):
    def __init__(self, model, decay=0.99, device=None):
        super().__init__()
        self.module = deepcopy(model)
        self.module.eval()
        self.decay = decay
        self.device = device
        if self.device:
            self.module.to(device=device)
    def _update(self, model, update_fn):
        with torch.no_grad():
            for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()):
                if self.device:
                    model_v = model_v.to(device=self.device)
                ema_v.copy_(update_fn(ema_v, model_v))
    def update(self, model): self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m)
    def set(self, model): self._update(model, update_fn=lambda e, m: m)

# =============================================================================
# 6) TRAIN ONE FOLD
# =============================================================================
def train_single_fold(df_train, df_val, imu_cols, le, fold_num):
    print(f"\n========== Begin Fold {fold_num} | Train IMU Cross-Attention ==========")

    # pad_len theo percentile của fold
    seq_lengths = [len(group) for _, group in df_train.groupby("sequence_id")]
    pad_len = int(np.percentile(seq_lengths, Config.PAD_PERCENTILE))

    # build sequences
    X_tr_list = [seq[imu_cols].to_numpy() for _, seq in df_train.groupby("sequence_id")]
    y_tr_int  = [seq["gesture_int"].iloc[0] for _, seq in df_train.groupby("sequence_id")]
    X_tr = pad_sequences(X_tr_list, maxlen=pad_len)
    y_tr = to_categorical(y_tr_int, num_classes=len(le.classes_))

    X_va_list = [seq[imu_cols].to_numpy() for _, seq in df_val.groupby("sequence_id")]
    y_va_int  = [seq["gesture_int"].iloc[0] for _, seq in df_val.groupby("sequence_id")]
    X_va = pad_sequences(X_va_list, maxlen=pad_len)
    y_va = to_categorical(y_va_int, num_classes=len(le.classes_))

    model = IMUCrossAttentionModel_PTF(len(imu_cols), len(le.classes_)).to(device)
    print(f"Model has {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters.")

    ema_model = ModelEMA(model, decay=Config.EMA_DECAY, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LR_INIT, weight_decay=Config.WD)
    criterion = FocalLoss()
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=Config.T_0)

    transforms = Compose([TimeShift(p=0.35, max_shift_pct=0.25),
                          TimeStretch(p=0.35),
                          SignalPermutation(p=0.5)])

    train_dataset = MixupDataset(X_tr, y_tr, alpha=Config.MIXUP_ALPHA, mode="train", transforms=transforms)
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, drop_last=True)
    val_dataset = MixupDataset(X_va, y_va, alpha=0, mode="valid", transforms=Compose([]))
    val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)

    best_val_loss = float('inf')
    patience_counter = 0
    model_path = Config.EXPORT_DIR / f"model_best_fold_{fold_num}.pt"

    for epoch in range(Config.EPOCHS):
        model.train()
        train_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Fold {fold_num} | Epoch {epoch+1}/{Config.EPOCHS}")
        for batch_X, batch_y in pbar:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            ema_model.update(model)
            pbar.set_postfix(loss=float(loss.item()))
        train_loss /= len(train_loader)
        scheduler.step()

        # validation loss
        val_loss = 0.0
        ema_model.module.eval()
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                out = ema_model.module(batch_X.to(device))
                val_loss += criterion(out, batch_y.to(device)).item()
        val_loss /= len(val_loader)

        print(f"Fold {fold_num} | Epoch [{epoch+1}/{Config.EPOCHS}] Train {train_loss:.4f} | Val {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(ema_model.module.state_dict(), model_path)
            print(f"-> Improved Val Loss {best_val_loss:.4f}. Saved to {model_path}")
        else:
            patience_counter += 1
            if patience_counter >= Config.PATIENCE:
                print(f"Early stopping after {epoch+1} epochs.")
                break

    ema_model.module.eval()
    va_probs = []
    with torch.no_grad():
        for batch_X, _ in DataLoader(val_dataset, batch_size=Config.BATCH_SIZE):
            logits = ema_model.module(batch_X.to(device))
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            va_probs.append(probs)
    va_probs = np.vstack(va_probs)                      # (N_val, C)
    va_preds = va_probs.argmax(1)
    oof_f1 = f1_score(y_va_int, va_preds, average="macro")

    with open(Config.EXPORT_DIR / f"oof_score_fold_{fold_num}.json", "w") as f:
        json.dump({"fold": fold_num, "macro_f1": float(oof_f1), "val_loss": float(best_val_loss)}, f)

    print(f"Fold {fold_num} done. Best Val Loss: {best_val_loss:.4f} | OOF macro-F1: {oof_f1:.4f}")
    return pad_len

# =============================================================================
# 7) MAIN WORKFLOW 
# =============================================================================
def main():
    print("Load data...")
    df = pd.read_csv(Config.RAW_DIR / "train.csv")
    dem = pd.read_csv(Config.RAW_DIR / "train_demographics.csv")

    if Config.IDS_DEL_PATH.exists():
        ids2del = pd.read_csv(Config.IDS_DEL_PATH)
        df = df[~df["sequence_id"].isin(ids2del["sequence_id"])]
        print("Đã loại bỏ các sequence nhiễu từ ids2del.")
 
    print("✅ Removed sequences successfully.")

    # labels
    le = LabelEncoder()
    df["gesture_int"] = le.fit_transform(df["gesture"])
    print("Quaternions & NaNs...")
    df[Config.ROT_COLS] = handle_quaternion_missing_values(df[Config.ROT_COLS].to_numpy())
    df = pd.concat([seq.ffill().bfill().fillna(0) for _, seq in df.groupby("sequence_id")], axis=0)

    df = preprocess_left_handed_pd(df, dem)

    print("Adding features...")
    df = add_features(df)

    meta_cols = {"gesture", "gesture_int", "sequence_type", "behavior", "orientation",
                 "row_id", "subject", "phase", "sequence_id", "sequence_counter",
                 "handedness", "height", "weight", "age"}
    imu_cols = [c for c in df.columns if c not in meta_cols and not (c.startswith("thm_") or c.startswith("tof_"))]
    print(f"Sử dụng {len(imu_cols)} IMU features.")

    sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    
    for fold, (tr_idx, va_idx) in enumerate(sgkf.split(df, y=df["gesture_int"], groups=df["subject"]),start=1):
        print(f"\n--- Processing Fold {fold} ---")
        df_train = df.iloc[tr_idx].copy()
        df_val   = df.iloc[va_idx].copy()

        pad_len = train_single_fold(df_train, df_val, imu_cols, le, fold_num=fold)
        prep = {
            "transformer": None,
            "categories": list(le.classes_),
            "features": imu_cols,
            "max_length": pad_len,
            "fold": fold
        }
        joblib.dump(prep, Config.EXPORT_DIR / f"prep_fold_{fold}.joblib")
        print(f"Saved prep_fold_{fold}.joblib")

    print("\n Hoàn tất 5-fold CV.")
   
if __name__ == "__main__":
    main()


Using device: cuda
Load data...
Đã loại bỏ các sequence nhiễu từ ids2del.
✅ Removed sequences successfully.
Quaternions & NaNs...
Adding features...
Sử dụng 35 IMU features.

--- Processing Fold 1 ---

Model has 7.3M parameters.


Fold 1 | Epoch 1/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [1/220] Train 0.4609 | Val 0.4408
-> Improved Val Loss 0.4408. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 2/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [2/220] Train 0.3378 | Val 0.2935
-> Improved Val Loss 0.2935. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 3/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [3/220] Train 0.3103 | Val 0.2437
-> Improved Val Loss 0.2437. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 4/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [4/220] Train 0.2923 | Val 0.2221
-> Improved Val Loss 0.2221. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 5/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [5/220] Train 0.2850 | Val 0.2117
-> Improved Val Loss 0.2117. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 6/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [6/220] Train 0.2759 | Val 0.2051
-> Improved Val Loss 0.2051. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 7/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [7/220] Train 0.2747 | Val 0.2010
-> Improved Val Loss 0.2010. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 8/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [8/220] Train 0.2619 | Val 0.1982
-> Improved Val Loss 0.1982. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 9/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [9/220] Train 0.2532 | Val 0.1933
-> Improved Val Loss 0.1933. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 10/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [10/220] Train 0.2588 | Val 0.1884
-> Improved Val Loss 0.1884. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 11/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [11/220] Train 0.2519 | Val 0.1857
-> Improved Val Loss 0.1857. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 12/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [12/220] Train 0.2437 | Val 0.1832
-> Improved Val Loss 0.1832. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 13/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [13/220] Train 0.2441 | Val 0.1791
-> Improved Val Loss 0.1791. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 14/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [14/220] Train 0.2367 | Val 0.1794


Fold 1 | Epoch 15/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [15/220] Train 0.2305 | Val 0.1764
-> Improved Val Loss 0.1764. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 16/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [16/220] Train 0.2321 | Val 0.1725
-> Improved Val Loss 0.1725. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 17/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [17/220] Train 0.2221 | Val 0.1697
-> Improved Val Loss 0.1697. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 18/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [18/220] Train 0.2184 | Val 0.1692
-> Improved Val Loss 0.1692. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 19/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [19/220] Train 0.2199 | Val 0.1693


Fold 1 | Epoch 20/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [20/220] Train 0.2126 | Val 0.1669
-> Improved Val Loss 0.1669. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 21/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [21/220] Train 0.2097 | Val 0.1667
-> Improved Val Loss 0.1667. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 22/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [22/220] Train 0.2145 | Val 0.1664
-> Improved Val Loss 0.1664. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 23/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [23/220] Train 0.2112 | Val 0.1662
-> Improved Val Loss 0.1662. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 24/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [24/220] Train 0.2086 | Val 0.1658
-> Improved Val Loss 0.1658. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 25/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [25/220] Train 0.2068 | Val 0.1654
-> Improved Val Loss 0.1654. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 26/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [26/220] Train 0.2405 | Val 0.1701


Fold 1 | Epoch 27/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [27/220] Train 0.2443 | Val 0.1732


Fold 1 | Epoch 28/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [28/220] Train 0.2371 | Val 0.1715


Fold 1 | Epoch 29/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [29/220] Train 0.2329 | Val 0.1687


Fold 1 | Epoch 30/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [30/220] Train 0.2313 | Val 0.1663


Fold 1 | Epoch 31/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [31/220] Train 0.2262 | Val 0.1665


Fold 1 | Epoch 32/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [32/220] Train 0.2292 | Val 0.1664


Fold 1 | Epoch 33/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [33/220] Train 0.2307 | Val 0.1626
-> Improved Val Loss 0.1626. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 34/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [34/220] Train 0.2246 | Val 0.1637


Fold 1 | Epoch 35/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [35/220] Train 0.2232 | Val 0.1620
-> Improved Val Loss 0.1620. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 36/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [36/220] Train 0.2138 | Val 0.1616
-> Improved Val Loss 0.1616. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 37/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [37/220] Train 0.2161 | Val 0.1622


Fold 1 | Epoch 38/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [38/220] Train 0.2132 | Val 0.1612
-> Improved Val Loss 0.1612. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 39/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [39/220] Train 0.2106 | Val 0.1610
-> Improved Val Loss 0.1610. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 40/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [40/220] Train 0.2042 | Val 0.1612


Fold 1 | Epoch 41/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [41/220] Train 0.2007 | Val 0.1588
-> Improved Val Loss 0.1588. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 42/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [42/220] Train 0.1965 | Val 0.1579
-> Improved Val Loss 0.1579. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 43/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [43/220] Train 0.1955 | Val 0.1582


Fold 1 | Epoch 44/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [44/220] Train 0.1906 | Val 0.1583


Fold 1 | Epoch 45/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [45/220] Train 0.1883 | Val 0.1591


Fold 1 | Epoch 46/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [46/220] Train 0.1853 | Val 0.1585


Fold 1 | Epoch 47/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [47/220] Train 0.1902 | Val 0.1570
-> Improved Val Loss 0.1570. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 48/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [48/220] Train 0.1846 | Val 0.1576


Fold 1 | Epoch 49/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [49/220] Train 0.1883 | Val 0.1578


Fold 1 | Epoch 50/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [50/220] Train 0.1851 | Val 0.1580


Fold 1 | Epoch 51/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [51/220] Train 0.2193 | Val 0.1593


Fold 1 | Epoch 52/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [52/220] Train 0.2230 | Val 0.1577


Fold 1 | Epoch 53/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [53/220] Train 0.2195 | Val 0.1610


Fold 1 | Epoch 54/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [54/220] Train 0.2180 | Val 0.1605


Fold 1 | Epoch 55/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [55/220] Train 0.2173 | Val 0.1617


Fold 1 | Epoch 56/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [56/220] Train 0.2136 | Val 0.1634


Fold 1 | Epoch 57/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [57/220] Train 0.2096 | Val 0.1603


Fold 1 | Epoch 58/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [58/220] Train 0.2093 | Val 0.1592


Fold 1 | Epoch 59/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [59/220] Train 0.2095 | Val 0.1585


Fold 1 | Epoch 60/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [60/220] Train 0.2018 | Val 0.1599


Fold 1 | Epoch 61/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [61/220] Train 0.2039 | Val 0.1595


Fold 1 | Epoch 62/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [62/220] Train 0.1971 | Val 0.1602


Fold 1 | Epoch 63/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [63/220] Train 0.1962 | Val 0.1573


Fold 1 | Epoch 64/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [64/220] Train 0.1940 | Val 0.1574


Fold 1 | Epoch 65/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [65/220] Train 0.1915 | Val 0.1551
-> Improved Val Loss 0.1551. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 66/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [66/220] Train 0.1855 | Val 0.1565


Fold 1 | Epoch 67/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [67/220] Train 0.1870 | Val 0.1568


Fold 1 | Epoch 68/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [68/220] Train 0.1813 | Val 0.1553


Fold 1 | Epoch 69/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [69/220] Train 0.1792 | Val 0.1555


Fold 1 | Epoch 70/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [70/220] Train 0.1787 | Val 0.1562


Fold 1 | Epoch 71/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [71/220] Train 0.1744 | Val 0.1552


Fold 1 | Epoch 72/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [72/220] Train 0.1750 | Val 0.1549
-> Improved Val Loss 0.1549. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 73/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [73/220] Train 0.1744 | Val 0.1547
-> Improved Val Loss 0.1547. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 74/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [74/220] Train 0.1729 | Val 0.1548


Fold 1 | Epoch 75/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [75/220] Train 0.1731 | Val 0.1551


Fold 1 | Epoch 76/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [76/220] Train 0.2053 | Val 0.1584


Fold 1 | Epoch 77/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [77/220] Train 0.2021 | Val 0.1581


Fold 1 | Epoch 78/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [78/220] Train 0.2063 | Val 0.1595


Fold 1 | Epoch 79/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [79/220] Train 0.2037 | Val 0.1586


Fold 1 | Epoch 80/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [80/220] Train 0.2021 | Val 0.1573


Fold 1 | Epoch 81/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [81/220] Train 0.2009 | Val 0.1585


Fold 1 | Epoch 82/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [82/220] Train 0.1993 | Val 0.1586


Fold 1 | Epoch 83/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [83/220] Train 0.1924 | Val 0.1603


Fold 1 | Epoch 84/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [84/220] Train 0.1913 | Val 0.1566


Fold 1 | Epoch 85/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [85/220] Train 0.1960 | Val 0.1559


Fold 1 | Epoch 86/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [86/220] Train 0.1866 | Val 0.1563


Fold 1 | Epoch 87/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [87/220] Train 0.1908 | Val 0.1570


Fold 1 | Epoch 88/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [88/220] Train 0.1829 | Val 0.1550


Fold 1 | Epoch 89/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [89/220] Train 0.1808 | Val 0.1578


Fold 1 | Epoch 90/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [90/220] Train 0.1797 | Val 0.1569


Fold 1 | Epoch 91/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [91/220] Train 0.1793 | Val 0.1589


Fold 1 | Epoch 92/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [92/220] Train 0.1758 | Val 0.1599


Fold 1 | Epoch 93/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [93/220] Train 0.1733 | Val 0.1593


Fold 1 | Epoch 94/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [94/220] Train 0.1705 | Val 0.1612


Fold 1 | Epoch 95/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [95/220] Train 0.1690 | Val 0.1595


Fold 1 | Epoch 96/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [96/220] Train 0.1665 | Val 0.1583


Fold 1 | Epoch 97/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [97/220] Train 0.1640 | Val 0.1583


Fold 1 | Epoch 98/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [98/220] Train 0.1637 | Val 0.1589


Fold 1 | Epoch 99/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [99/220] Train 0.1661 | Val 0.1591


Fold 1 | Epoch 100/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [100/220] Train 0.1599 | Val 0.1590


Fold 1 | Epoch 101/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [101/220] Train 0.1967 | Val 0.1546
-> Improved Val Loss 0.1546. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 102/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [102/220] Train 0.1973 | Val 0.1550


Fold 1 | Epoch 103/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [103/220] Train 0.1973 | Val 0.1582


Fold 1 | Epoch 104/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [104/220] Train 0.1921 | Val 0.1553


Fold 1 | Epoch 105/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [105/220] Train 0.1935 | Val 0.1510
-> Improved Val Loss 0.1510. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 106/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [106/220] Train 0.1912 | Val 0.1523


Fold 1 | Epoch 107/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [107/220] Train 0.1875 | Val 0.1520


Fold 1 | Epoch 108/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [108/220] Train 0.1889 | Val 0.1513


Fold 1 | Epoch 109/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [109/220] Train 0.1851 | Val 0.1545


Fold 1 | Epoch 110/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [110/220] Train 0.1803 | Val 0.1521


Fold 1 | Epoch 111/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [111/220] Train 0.1842 | Val 0.1524


Fold 1 | Epoch 112/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [112/220] Train 0.1772 | Val 0.1511


Fold 1 | Epoch 113/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [113/220] Train 0.1750 | Val 0.1532


Fold 1 | Epoch 114/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [114/220] Train 0.1716 | Val 0.1520


Fold 1 | Epoch 115/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [115/220] Train 0.1738 | Val 0.1526


Fold 1 | Epoch 116/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [116/220] Train 0.1723 | Val 0.1543


Fold 1 | Epoch 117/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [117/220] Train 0.1679 | Val 0.1554


Fold 1 | Epoch 118/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [118/220] Train 0.1672 | Val 0.1563


Fold 1 | Epoch 119/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [119/220] Train 0.1625 | Val 0.1584


Fold 1 | Epoch 120/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [120/220] Train 0.1633 | Val 0.1586


Fold 1 | Epoch 121/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [121/220] Train 0.1624 | Val 0.1592


Fold 1 | Epoch 122/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [122/220] Train 0.1543 | Val 0.1595


Fold 1 | Epoch 123/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [123/220] Train 0.1572 | Val 0.1593


Fold 1 | Epoch 124/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [124/220] Train 0.1523 | Val 0.1590


Fold 1 | Epoch 125/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [125/220] Train 0.1573 | Val 0.1603


Fold 1 | Epoch 126/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [126/220] Train 0.1842 | Val 0.1564


Fold 1 | Epoch 127/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [127/220] Train 0.1920 | Val 0.1575


Fold 1 | Epoch 128/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [128/220] Train 0.1929 | Val 0.1576


Fold 1 | Epoch 129/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [129/220] Train 0.1868 | Val 0.1561


Fold 1 | Epoch 130/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [130/220] Train 0.1847 | Val 0.1565


Fold 1 | Epoch 131/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [131/220] Train 0.1851 | Val 0.1543


Fold 1 | Epoch 132/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [132/220] Train 0.1816 | Val 0.1557


Fold 1 | Epoch 133/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [133/220] Train 0.1838 | Val 0.1525


Fold 1 | Epoch 134/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [134/220] Train 0.1818 | Val 0.1506
-> Improved Val Loss 0.1506. Saved to /kaggle/working/model_best_fold_1.pt


Fold 1 | Epoch 135/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [135/220] Train 0.1757 | Val 0.1523


Fold 1 | Epoch 136/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [136/220] Train 0.1762 | Val 0.1547


Fold 1 | Epoch 137/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [137/220] Train 0.1763 | Val 0.1527


Fold 1 | Epoch 138/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [138/220] Train 0.1704 | Val 0.1528


Fold 1 | Epoch 139/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [139/220] Train 0.1713 | Val 0.1527


Fold 1 | Epoch 140/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [140/220] Train 0.1660 | Val 0.1533


Fold 1 | Epoch 141/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [141/220] Train 0.1649 | Val 0.1545


Fold 1 | Epoch 142/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [142/220] Train 0.1620 | Val 0.1540


Fold 1 | Epoch 143/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [143/220] Train 0.1577 | Val 0.1552


Fold 1 | Epoch 144/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [144/220] Train 0.1568 | Val 0.1563


Fold 1 | Epoch 145/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [145/220] Train 0.1579 | Val 0.1576


Fold 1 | Epoch 146/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [146/220] Train 0.1581 | Val 0.1581


Fold 1 | Epoch 147/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [147/220] Train 0.1540 | Val 0.1587


Fold 1 | Epoch 148/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [148/220] Train 0.1546 | Val 0.1596


Fold 1 | Epoch 149/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [149/220] Train 0.1521 | Val 0.1588


Fold 1 | Epoch 150/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [150/220] Train 0.1567 | Val 0.1589


Fold 1 | Epoch 151/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [151/220] Train 0.1785 | Val 0.1548


Fold 1 | Epoch 152/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [152/220] Train 0.1832 | Val 0.1572


Fold 1 | Epoch 153/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [153/220] Train 0.1836 | Val 0.1541


Fold 1 | Epoch 154/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [154/220] Train 0.1875 | Val 0.1531


Fold 1 | Epoch 155/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [155/220] Train 0.1787 | Val 0.1570


Fold 1 | Epoch 156/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [156/220] Train 0.1798 | Val 0.1558


Fold 1 | Epoch 157/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [157/220] Train 0.1806 | Val 0.1553


Fold 1 | Epoch 158/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [158/220] Train 0.1777 | Val 0.1534


Fold 1 | Epoch 159/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [159/220] Train 0.1787 | Val 0.1521


Fold 1 | Epoch 160/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [160/220] Train 0.1720 | Val 0.1544


Fold 1 | Epoch 161/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [161/220] Train 0.1687 | Val 0.1542


Fold 1 | Epoch 162/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [162/220] Train 0.1666 | Val 0.1525


Fold 1 | Epoch 163/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [163/220] Train 0.1630 | Val 0.1520


Fold 1 | Epoch 164/220:   0%|          | 0/102 [00:00<?, ?it/s]

Fold 1 | Epoch [164/220] Train 0.1633 | Val 0.1557
Early stopping after 164 epochs.
Fold 1 done. Best Val Loss: 0.1506 | OOF macro-F1: 0.7276
Saved prep_fold_1.joblib

--- Processing Fold 2 ---

Model has 7.3M parameters.


Fold 2 | Epoch 1/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [1/220] Train 0.4716 | Val 0.4297
-> Improved Val Loss 0.4297. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 2/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [2/220] Train 0.3541 | Val 0.2618
-> Improved Val Loss 0.2618. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 3/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [3/220] Train 0.3214 | Val 0.2027
-> Improved Val Loss 0.2027. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 4/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [4/220] Train 0.3032 | Val 0.1740
-> Improved Val Loss 0.1740. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 5/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [5/220] Train 0.2953 | Val 0.1612
-> Improved Val Loss 0.1612. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 6/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [6/220] Train 0.2884 | Val 0.1503
-> Improved Val Loss 0.1503. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 7/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [7/220] Train 0.2848 | Val 0.1427
-> Improved Val Loss 0.1427. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 8/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [8/220] Train 0.2732 | Val 0.1359
-> Improved Val Loss 0.1359. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 9/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [9/220] Train 0.2720 | Val 0.1315
-> Improved Val Loss 0.1315. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 10/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [10/220] Train 0.2684 | Val 0.1284
-> Improved Val Loss 0.1284. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 11/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [11/220] Train 0.2631 | Val 0.1265
-> Improved Val Loss 0.1265. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 12/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [12/220] Train 0.2563 | Val 0.1222
-> Improved Val Loss 0.1222. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 13/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [13/220] Train 0.2495 | Val 0.1188
-> Improved Val Loss 0.1188. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 14/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [14/220] Train 0.2451 | Val 0.1168
-> Improved Val Loss 0.1168. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 15/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [15/220] Train 0.2418 | Val 0.1158
-> Improved Val Loss 0.1158. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 16/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [16/220] Train 0.2367 | Val 0.1122
-> Improved Val Loss 0.1122. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 17/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [17/220] Train 0.2330 | Val 0.1117
-> Improved Val Loss 0.1117. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 18/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [18/220] Train 0.2307 | Val 0.1111
-> Improved Val Loss 0.1111. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 19/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [19/220] Train 0.2286 | Val 0.1099
-> Improved Val Loss 0.1099. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 20/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [20/220] Train 0.2280 | Val 0.1087
-> Improved Val Loss 0.1087. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 21/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [21/220] Train 0.2210 | Val 0.1076
-> Improved Val Loss 0.1076. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 22/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [22/220] Train 0.2186 | Val 0.1066
-> Improved Val Loss 0.1066. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 23/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [23/220] Train 0.2157 | Val 0.1066
-> Improved Val Loss 0.1066. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 24/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [24/220] Train 0.2161 | Val 0.1061
-> Improved Val Loss 0.1061. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 25/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [25/220] Train 0.2118 | Val 0.1060
-> Improved Val Loss 0.1060. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 26/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [26/220] Train 0.2497 | Val 0.1110


Fold 2 | Epoch 27/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [27/220] Train 0.2544 | Val 0.1115


Fold 2 | Epoch 28/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [28/220] Train 0.2470 | Val 0.1110


Fold 2 | Epoch 29/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [29/220] Train 0.2463 | Val 0.1103


Fold 2 | Epoch 30/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [30/220] Train 0.2407 | Val 0.1099


Fold 2 | Epoch 31/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [31/220] Train 0.2406 | Val 0.1094


Fold 2 | Epoch 32/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [32/220] Train 0.2399 | Val 0.1075


Fold 2 | Epoch 33/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [33/220] Train 0.2349 | Val 0.1059
-> Improved Val Loss 0.1059. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 34/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [34/220] Train 0.2343 | Val 0.1064


Fold 2 | Epoch 35/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [35/220] Train 0.2313 | Val 0.1070


Fold 2 | Epoch 36/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [36/220] Train 0.2242 | Val 0.1054
-> Improved Val Loss 0.1054. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 37/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [37/220] Train 0.2249 | Val 0.1050
-> Improved Val Loss 0.1050. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 38/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [38/220] Train 0.2205 | Val 0.1039
-> Improved Val Loss 0.1039. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 39/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [39/220] Train 0.2183 | Val 0.1037
-> Improved Val Loss 0.1037. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 40/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [40/220] Train 0.2153 | Val 0.1016
-> Improved Val Loss 0.1016. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 41/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [41/220] Train 0.2109 | Val 0.1020


Fold 2 | Epoch 42/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [42/220] Train 0.2096 | Val 0.1008
-> Improved Val Loss 0.1008. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 43/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [43/220] Train 0.2046 | Val 0.1005
-> Improved Val Loss 0.1005. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 44/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [44/220] Train 0.2013 | Val 0.0999
-> Improved Val Loss 0.0999. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 45/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [45/220] Train 0.1991 | Val 0.1006


Fold 2 | Epoch 46/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [46/220] Train 0.1993 | Val 0.1010


Fold 2 | Epoch 47/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [47/220] Train 0.1947 | Val 0.1008


Fold 2 | Epoch 48/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [48/220] Train 0.1921 | Val 0.1001


Fold 2 | Epoch 49/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [49/220] Train 0.1939 | Val 0.0996
-> Improved Val Loss 0.0996. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 50/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [50/220] Train 0.1906 | Val 0.1000


Fold 2 | Epoch 51/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [51/220] Train 0.2344 | Val 0.1017


Fold 2 | Epoch 52/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [52/220] Train 0.2335 | Val 0.1036


Fold 2 | Epoch 53/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [53/220] Train 0.2285 | Val 0.1042


Fold 2 | Epoch 54/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [54/220] Train 0.2224 | Val 0.1033


Fold 2 | Epoch 55/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [55/220] Train 0.2208 | Val 0.1024


Fold 2 | Epoch 56/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [56/220] Train 0.2221 | Val 0.1023


Fold 2 | Epoch 57/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [57/220] Train 0.2171 | Val 0.1032


Fold 2 | Epoch 58/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [58/220] Train 0.2158 | Val 0.1013


Fold 2 | Epoch 59/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [59/220] Train 0.2182 | Val 0.1027


Fold 2 | Epoch 60/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [60/220] Train 0.2101 | Val 0.1015


Fold 2 | Epoch 61/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [61/220] Train 0.2089 | Val 0.1000


Fold 2 | Epoch 62/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [62/220] Train 0.2075 | Val 0.0995
-> Improved Val Loss 0.0995. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 63/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [63/220] Train 0.2069 | Val 0.1001


Fold 2 | Epoch 64/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [64/220] Train 0.1995 | Val 0.1009


Fold 2 | Epoch 65/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [65/220] Train 0.1957 | Val 0.0995


Fold 2 | Epoch 66/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [66/220] Train 0.1928 | Val 0.0989
-> Improved Val Loss 0.0989. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 67/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [67/220] Train 0.1933 | Val 0.0991


Fold 2 | Epoch 68/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [68/220] Train 0.1910 | Val 0.1009


Fold 2 | Epoch 69/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [69/220] Train 0.1872 | Val 0.0995


Fold 2 | Epoch 70/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [70/220] Train 0.1861 | Val 0.0996


Fold 2 | Epoch 71/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [71/220] Train 0.1829 | Val 0.0994


Fold 2 | Epoch 72/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [72/220] Train 0.1779 | Val 0.0992


Fold 2 | Epoch 73/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [73/220] Train 0.1808 | Val 0.0995


Fold 2 | Epoch 74/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [74/220] Train 0.1813 | Val 0.0998


Fold 2 | Epoch 75/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [75/220] Train 0.1777 | Val 0.1000


Fold 2 | Epoch 76/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [76/220] Train 0.2163 | Val 0.1001


Fold 2 | Epoch 77/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [77/220] Train 0.2170 | Val 0.1006


Fold 2 | Epoch 78/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [78/220] Train 0.2189 | Val 0.1007


Fold 2 | Epoch 79/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [79/220] Train 0.2168 | Val 0.0999


Fold 2 | Epoch 80/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [80/220] Train 0.2136 | Val 0.0995


Fold 2 | Epoch 81/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [81/220] Train 0.2111 | Val 0.0992


Fold 2 | Epoch 82/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [82/220] Train 0.2049 | Val 0.0997


Fold 2 | Epoch 83/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [83/220] Train 0.2088 | Val 0.0977
-> Improved Val Loss 0.0977. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 84/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [84/220] Train 0.2016 | Val 0.0975
-> Improved Val Loss 0.0975. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 85/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [85/220] Train 0.2012 | Val 0.0977


Fold 2 | Epoch 86/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [86/220] Train 0.2014 | Val 0.0972
-> Improved Val Loss 0.0972. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 87/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [87/220] Train 0.1929 | Val 0.0983


Fold 2 | Epoch 88/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [88/220] Train 0.1950 | Val 0.0970
-> Improved Val Loss 0.0970. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 89/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [89/220] Train 0.1890 | Val 0.0993


Fold 2 | Epoch 90/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [90/220] Train 0.1867 | Val 0.0995


Fold 2 | Epoch 91/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [91/220] Train 0.1847 | Val 0.0986


Fold 2 | Epoch 92/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [92/220] Train 0.1826 | Val 0.0996


Fold 2 | Epoch 93/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [93/220] Train 0.1779 | Val 0.1004


Fold 2 | Epoch 94/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [94/220] Train 0.1783 | Val 0.1013


Fold 2 | Epoch 95/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [95/220] Train 0.1756 | Val 0.1004


Fold 2 | Epoch 96/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [96/220] Train 0.1704 | Val 0.1009


Fold 2 | Epoch 97/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [97/220] Train 0.1716 | Val 0.1015


Fold 2 | Epoch 98/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [98/220] Train 0.1700 | Val 0.1012


Fold 2 | Epoch 99/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [99/220] Train 0.1691 | Val 0.1013


Fold 2 | Epoch 100/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [100/220] Train 0.1740 | Val 0.1011


Fold 2 | Epoch 101/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [101/220] Train 0.2047 | Val 0.0997


Fold 2 | Epoch 102/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [102/220] Train 0.2021 | Val 0.1006


Fold 2 | Epoch 103/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [103/220] Train 0.2010 | Val 0.0968
-> Improved Val Loss 0.0968. Saved to /kaggle/working/model_best_fold_2.pt


Fold 2 | Epoch 104/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [104/220] Train 0.2056 | Val 0.0968


Fold 2 | Epoch 105/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [105/220] Train 0.2034 | Val 0.0991


Fold 2 | Epoch 106/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [106/220] Train 0.2003 | Val 0.0981


Fold 2 | Epoch 107/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [107/220] Train 0.1975 | Val 0.0989


Fold 2 | Epoch 108/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [108/220] Train 0.1945 | Val 0.0987


Fold 2 | Epoch 109/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [109/220] Train 0.1905 | Val 0.1027


Fold 2 | Epoch 110/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [110/220] Train 0.1932 | Val 0.1019


Fold 2 | Epoch 111/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [111/220] Train 0.1898 | Val 0.1013


Fold 2 | Epoch 112/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [112/220] Train 0.1863 | Val 0.1008


Fold 2 | Epoch 113/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [113/220] Train 0.1832 | Val 0.1010


Fold 2 | Epoch 114/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [114/220] Train 0.1809 | Val 0.1002


Fold 2 | Epoch 115/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [115/220] Train 0.1790 | Val 0.1005


Fold 2 | Epoch 116/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [116/220] Train 0.1747 | Val 0.1039


Fold 2 | Epoch 117/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [117/220] Train 0.1747 | Val 0.1034


Fold 2 | Epoch 118/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [118/220] Train 0.1682 | Val 0.1035


Fold 2 | Epoch 119/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [119/220] Train 0.1672 | Val 0.1051


Fold 2 | Epoch 120/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [120/220] Train 0.1685 | Val 0.1046


Fold 2 | Epoch 121/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [121/220] Train 0.1630 | Val 0.1042


Fold 2 | Epoch 122/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [122/220] Train 0.1650 | Val 0.1049


Fold 2 | Epoch 123/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [123/220] Train 0.1600 | Val 0.1052


Fold 2 | Epoch 124/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [124/220] Train 0.1632 | Val 0.1050


Fold 2 | Epoch 125/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [125/220] Train 0.1629 | Val 0.1054


Fold 2 | Epoch 126/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [126/220] Train 0.1966 | Val 0.1018


Fold 2 | Epoch 127/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [127/220] Train 0.2020 | Val 0.1014


Fold 2 | Epoch 128/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [128/220] Train 0.1935 | Val 0.0993


Fold 2 | Epoch 129/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [129/220] Train 0.1963 | Val 0.0984


Fold 2 | Epoch 130/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [130/220] Train 0.1912 | Val 0.0989


Fold 2 | Epoch 131/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [131/220] Train 0.1939 | Val 0.1018


Fold 2 | Epoch 132/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [132/220] Train 0.1863 | Val 0.1019


Fold 2 | Epoch 133/220:   0%|          | 0/100 [00:00<?, ?it/s]

Fold 2 | Epoch [133/220] Train 0.1897 | Val 0.1000
Early stopping after 133 epochs.
Fold 2 done. Best Val Loss: 0.0968 | OOF macro-F1: 0.7924
Saved prep_fold_2.joblib

--- Processing Fold 3 ---

Model has 7.3M parameters.


Fold 3 | Epoch 1/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [1/220] Train 0.4710 | Val 0.4415
-> Improved Val Loss 0.4415. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 2/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [2/220] Train 0.3600 | Val 0.2904
-> Improved Val Loss 0.2904. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 3/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [3/220] Train 0.3163 | Val 0.2216
-> Improved Val Loss 0.2216. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 4/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [4/220] Train 0.3015 | Val 0.1930
-> Improved Val Loss 0.1930. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 5/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [5/220] Train 0.2970 | Val 0.1776
-> Improved Val Loss 0.1776. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 6/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [6/220] Train 0.2798 | Val 0.1670
-> Improved Val Loss 0.1670. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 7/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [7/220] Train 0.2735 | Val 0.1607
-> Improved Val Loss 0.1607. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 8/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [8/220] Train 0.2683 | Val 0.1572
-> Improved Val Loss 0.1572. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 9/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [9/220] Train 0.2687 | Val 0.1551
-> Improved Val Loss 0.1551. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 10/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [10/220] Train 0.2574 | Val 0.1517
-> Improved Val Loss 0.1517. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 11/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [11/220] Train 0.2500 | Val 0.1474
-> Improved Val Loss 0.1474. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 12/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [12/220] Train 0.2511 | Val 0.1422
-> Improved Val Loss 0.1422. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 13/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [13/220] Train 0.2426 | Val 0.1407
-> Improved Val Loss 0.1407. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 14/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [14/220] Train 0.2394 | Val 0.1399
-> Improved Val Loss 0.1399. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 15/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [15/220] Train 0.2366 | Val 0.1368
-> Improved Val Loss 0.1368. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 16/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [16/220] Train 0.2305 | Val 0.1359
-> Improved Val Loss 0.1359. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 17/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [17/220] Train 0.2273 | Val 0.1350
-> Improved Val Loss 0.1350. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 18/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [18/220] Train 0.2243 | Val 0.1333
-> Improved Val Loss 0.1333. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 19/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [19/220] Train 0.2185 | Val 0.1326
-> Improved Val Loss 0.1326. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 20/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [20/220] Train 0.2185 | Val 0.1328


Fold 3 | Epoch 21/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [21/220] Train 0.2144 | Val 0.1325
-> Improved Val Loss 0.1325. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 22/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [22/220] Train 0.2156 | Val 0.1316
-> Improved Val Loss 0.1316. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 23/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [23/220] Train 0.2101 | Val 0.1307
-> Improved Val Loss 0.1307. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 24/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [24/220] Train 0.2125 | Val 0.1307


Fold 3 | Epoch 25/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [25/220] Train 0.2094 | Val 0.1307
-> Improved Val Loss 0.1307. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 26/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [26/220] Train 0.2516 | Val 0.1328


Fold 3 | Epoch 27/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [27/220] Train 0.2511 | Val 0.1365


Fold 3 | Epoch 28/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [28/220] Train 0.2470 | Val 0.1360


Fold 3 | Epoch 29/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [29/220] Train 0.2382 | Val 0.1360


Fold 3 | Epoch 30/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [30/220] Train 0.2349 | Val 0.1323


Fold 3 | Epoch 31/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [31/220] Train 0.2321 | Val 0.1341


Fold 3 | Epoch 32/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [32/220] Train 0.2331 | Val 0.1339


Fold 3 | Epoch 33/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [33/220] Train 0.2272 | Val 0.1353


Fold 3 | Epoch 34/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [34/220] Train 0.2265 | Val 0.1330


Fold 3 | Epoch 35/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [35/220] Train 0.2253 | Val 0.1314


Fold 3 | Epoch 36/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [36/220] Train 0.2212 | Val 0.1283
-> Improved Val Loss 0.1283. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 37/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [37/220] Train 0.2213 | Val 0.1277
-> Improved Val Loss 0.1277. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 38/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [38/220] Train 0.2161 | Val 0.1284


Fold 3 | Epoch 39/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [39/220] Train 0.2130 | Val 0.1281


Fold 3 | Epoch 40/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [40/220] Train 0.2115 | Val 0.1285


Fold 3 | Epoch 41/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [41/220] Train 0.2058 | Val 0.1279


Fold 3 | Epoch 42/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [42/220] Train 0.2041 | Val 0.1259
-> Improved Val Loss 0.1259. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 43/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [43/220] Train 0.1988 | Val 0.1260


Fold 3 | Epoch 44/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [44/220] Train 0.1972 | Val 0.1273


Fold 3 | Epoch 45/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [45/220] Train 0.1954 | Val 0.1270


Fold 3 | Epoch 46/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [46/220] Train 0.1921 | Val 0.1260


Fold 3 | Epoch 47/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [47/220] Train 0.1874 | Val 0.1255
-> Improved Val Loss 0.1255. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 48/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [48/220] Train 0.1885 | Val 0.1259


Fold 3 | Epoch 49/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [49/220] Train 0.1889 | Val 0.1262


Fold 3 | Epoch 50/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [50/220] Train 0.1890 | Val 0.1265


Fold 3 | Epoch 51/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [51/220] Train 0.2216 | Val 0.1249
-> Improved Val Loss 0.1249. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 52/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [52/220] Train 0.2241 | Val 0.1279


Fold 3 | Epoch 53/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [53/220] Train 0.2275 | Val 0.1294


Fold 3 | Epoch 54/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [54/220] Train 0.2189 | Val 0.1289


Fold 3 | Epoch 55/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [55/220] Train 0.2200 | Val 0.1294


Fold 3 | Epoch 56/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [56/220] Train 0.2149 | Val 0.1280


Fold 3 | Epoch 57/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [57/220] Train 0.2137 | Val 0.1264


Fold 3 | Epoch 58/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [58/220] Train 0.2114 | Val 0.1283


Fold 3 | Epoch 59/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [59/220] Train 0.2093 | Val 0.1278


Fold 3 | Epoch 60/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [60/220] Train 0.2078 | Val 0.1229
-> Improved Val Loss 0.1229. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 61/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [61/220] Train 0.2022 | Val 0.1211
-> Improved Val Loss 0.1211. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 62/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [62/220] Train 0.2048 | Val 0.1222


Fold 3 | Epoch 63/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [63/220] Train 0.1982 | Val 0.1229


Fold 3 | Epoch 64/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [64/220] Train 0.1961 | Val 0.1241


Fold 3 | Epoch 65/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [65/220] Train 0.1938 | Val 0.1220


Fold 3 | Epoch 66/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [66/220] Train 0.1944 | Val 0.1227


Fold 3 | Epoch 67/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [67/220] Train 0.1878 | Val 0.1230


Fold 3 | Epoch 68/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [68/220] Train 0.1861 | Val 0.1215


Fold 3 | Epoch 69/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [69/220] Train 0.1843 | Val 0.1216


Fold 3 | Epoch 70/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [70/220] Train 0.1823 | Val 0.1224


Fold 3 | Epoch 71/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [71/220] Train 0.1764 | Val 0.1230


Fold 3 | Epoch 72/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [72/220] Train 0.1784 | Val 0.1238


Fold 3 | Epoch 73/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [73/220] Train 0.1727 | Val 0.1242


Fold 3 | Epoch 74/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [74/220] Train 0.1728 | Val 0.1240


Fold 3 | Epoch 75/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [75/220] Train 0.1752 | Val 0.1239


Fold 3 | Epoch 76/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [76/220] Train 0.2074 | Val 0.1229


Fold 3 | Epoch 77/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [77/220] Train 0.2115 | Val 0.1258


Fold 3 | Epoch 78/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [78/220] Train 0.2127 | Val 0.1235


Fold 3 | Epoch 79/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [79/220] Train 0.2060 | Val 0.1228


Fold 3 | Epoch 80/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [80/220] Train 0.2046 | Val 0.1218


Fold 3 | Epoch 81/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [81/220] Train 0.2001 | Val 0.1233


Fold 3 | Epoch 82/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [82/220] Train 0.2034 | Val 0.1213


Fold 3 | Epoch 83/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [83/220] Train 0.2010 | Val 0.1201
-> Improved Val Loss 0.1201. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 84/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [84/220] Train 0.1954 | Val 0.1215


Fold 3 | Epoch 85/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [85/220] Train 0.1918 | Val 0.1231


Fold 3 | Epoch 86/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [86/220] Train 0.1962 | Val 0.1231


Fold 3 | Epoch 87/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [87/220] Train 0.1902 | Val 0.1224


Fold 3 | Epoch 88/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [88/220] Train 0.1868 | Val 0.1223


Fold 3 | Epoch 89/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [89/220] Train 0.1847 | Val 0.1246


Fold 3 | Epoch 90/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [90/220] Train 0.1827 | Val 0.1228


Fold 3 | Epoch 91/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [91/220] Train 0.1811 | Val 0.1229


Fold 3 | Epoch 92/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [92/220] Train 0.1780 | Val 0.1235


Fold 3 | Epoch 93/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [93/220] Train 0.1756 | Val 0.1243


Fold 3 | Epoch 94/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [94/220] Train 0.1746 | Val 0.1245


Fold 3 | Epoch 95/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [95/220] Train 0.1697 | Val 0.1250


Fold 3 | Epoch 96/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [96/220] Train 0.1694 | Val 0.1246


Fold 3 | Epoch 97/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [97/220] Train 0.1647 | Val 0.1243


Fold 3 | Epoch 98/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [98/220] Train 0.1675 | Val 0.1247


Fold 3 | Epoch 99/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [99/220] Train 0.1680 | Val 0.1248


Fold 3 | Epoch 100/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [100/220] Train 0.1651 | Val 0.1254


Fold 3 | Epoch 101/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [101/220] Train 0.2044 | Val 0.1223


Fold 3 | Epoch 102/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [102/220] Train 0.2036 | Val 0.1229


Fold 3 | Epoch 103/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [103/220] Train 0.2021 | Val 0.1218


Fold 3 | Epoch 104/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [104/220] Train 0.1978 | Val 0.1212


Fold 3 | Epoch 105/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [105/220] Train 0.1951 | Val 0.1206


Fold 3 | Epoch 106/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [106/220] Train 0.1937 | Val 0.1221


Fold 3 | Epoch 107/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [107/220] Train 0.1920 | Val 0.1195
-> Improved Val Loss 0.1195. Saved to /kaggle/working/model_best_fold_3.pt


Fold 3 | Epoch 108/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [108/220] Train 0.1889 | Val 0.1215


Fold 3 | Epoch 109/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [109/220] Train 0.1893 | Val 0.1206


Fold 3 | Epoch 110/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [110/220] Train 0.1881 | Val 0.1236


Fold 3 | Epoch 111/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [111/220] Train 0.1816 | Val 0.1228


Fold 3 | Epoch 112/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [112/220] Train 0.1791 | Val 0.1224


Fold 3 | Epoch 113/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [113/220] Train 0.1741 | Val 0.1223


Fold 3 | Epoch 114/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [114/220] Train 0.1791 | Val 0.1228


Fold 3 | Epoch 115/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [115/220] Train 0.1722 | Val 0.1230


Fold 3 | Epoch 116/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [116/220] Train 0.1736 | Val 0.1237


Fold 3 | Epoch 117/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [117/220] Train 0.1698 | Val 0.1239


Fold 3 | Epoch 118/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [118/220] Train 0.1673 | Val 0.1240


Fold 3 | Epoch 119/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [119/220] Train 0.1663 | Val 0.1231


Fold 3 | Epoch 120/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [120/220] Train 0.1638 | Val 0.1228


Fold 3 | Epoch 121/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [121/220] Train 0.1625 | Val 0.1243


Fold 3 | Epoch 122/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [122/220] Train 0.1616 | Val 0.1257


Fold 3 | Epoch 123/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [123/220] Train 0.1587 | Val 0.1261


Fold 3 | Epoch 124/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [124/220] Train 0.1596 | Val 0.1261


Fold 3 | Epoch 125/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [125/220] Train 0.1594 | Val 0.1264


Fold 3 | Epoch 126/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [126/220] Train 0.1916 | Val 0.1226


Fold 3 | Epoch 127/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [127/220] Train 0.1949 | Val 0.1246


Fold 3 | Epoch 128/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [128/220] Train 0.1912 | Val 0.1245


Fold 3 | Epoch 129/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [129/220] Train 0.1960 | Val 0.1249


Fold 3 | Epoch 130/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [130/220] Train 0.1899 | Val 0.1249


Fold 3 | Epoch 131/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [131/220] Train 0.1865 | Val 0.1240


Fold 3 | Epoch 132/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [132/220] Train 0.1866 | Val 0.1248


Fold 3 | Epoch 133/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [133/220] Train 0.1837 | Val 0.1252


Fold 3 | Epoch 134/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [134/220] Train 0.1834 | Val 0.1248


Fold 3 | Epoch 135/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [135/220] Train 0.1730 | Val 0.1239


Fold 3 | Epoch 136/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [136/220] Train 0.1765 | Val 0.1249


Fold 3 | Epoch 137/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 3 | Epoch [137/220] Train 0.1749 | Val 0.1249
Early stopping after 137 epochs.
Fold 3 done. Best Val Loss: 0.1195 | OOF macro-F1: 0.7629
Saved prep_fold_3.joblib

--- Processing Fold 4 ---

Model has 7.3M parameters.


Fold 4 | Epoch 1/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [1/220] Train 0.4616 | Val 0.4370
-> Improved Val Loss 0.4370. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 2/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [2/220] Train 0.3425 | Val 0.2764
-> Improved Val Loss 0.2764. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 3/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [3/220] Train 0.3093 | Val 0.2206
-> Improved Val Loss 0.2206. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 4/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [4/220] Train 0.2963 | Val 0.1975
-> Improved Val Loss 0.1975. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 5/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [5/220] Train 0.2856 | Val 0.1818
-> Improved Val Loss 0.1818. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 6/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [6/220] Train 0.2769 | Val 0.1761
-> Improved Val Loss 0.1761. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 7/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [7/220] Train 0.2737 | Val 0.1697
-> Improved Val Loss 0.1697. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 8/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [8/220] Train 0.2671 | Val 0.1667
-> Improved Val Loss 0.1667. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 9/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [9/220] Train 0.2666 | Val 0.1619
-> Improved Val Loss 0.1619. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 10/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [10/220] Train 0.2603 | Val 0.1592
-> Improved Val Loss 0.1592. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 11/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [11/220] Train 0.2546 | Val 0.1564
-> Improved Val Loss 0.1564. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 12/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [12/220] Train 0.2477 | Val 0.1533
-> Improved Val Loss 0.1533. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 13/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [13/220] Train 0.2452 | Val 0.1495
-> Improved Val Loss 0.1495. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 14/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [14/220] Train 0.2396 | Val 0.1489
-> Improved Val Loss 0.1489. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 15/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [15/220] Train 0.2362 | Val 0.1474
-> Improved Val Loss 0.1474. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 16/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [16/220] Train 0.2265 | Val 0.1468
-> Improved Val Loss 0.1468. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 17/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [17/220] Train 0.2261 | Val 0.1464
-> Improved Val Loss 0.1464. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 18/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [18/220] Train 0.2252 | Val 0.1455
-> Improved Val Loss 0.1455. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 19/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [19/220] Train 0.2199 | Val 0.1438
-> Improved Val Loss 0.1438. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 20/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [20/220] Train 0.2173 | Val 0.1424
-> Improved Val Loss 0.1424. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 21/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [21/220] Train 0.2106 | Val 0.1418
-> Improved Val Loss 0.1418. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 22/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [22/220] Train 0.2138 | Val 0.1413
-> Improved Val Loss 0.1413. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 23/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [23/220] Train 0.2103 | Val 0.1405
-> Improved Val Loss 0.1405. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 24/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [24/220] Train 0.2068 | Val 0.1400
-> Improved Val Loss 0.1400. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 25/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [25/220] Train 0.2091 | Val 0.1401


Fold 4 | Epoch 26/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [26/220] Train 0.2474 | Val 0.1455


Fold 4 | Epoch 27/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [27/220] Train 0.2438 | Val 0.1445


Fold 4 | Epoch 28/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [28/220] Train 0.2429 | Val 0.1460


Fold 4 | Epoch 29/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [29/220] Train 0.2421 | Val 0.1448


Fold 4 | Epoch 30/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [30/220] Train 0.2364 | Val 0.1432


Fold 4 | Epoch 31/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [31/220] Train 0.2294 | Val 0.1423


Fold 4 | Epoch 32/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [32/220] Train 0.2295 | Val 0.1429


Fold 4 | Epoch 33/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [33/220] Train 0.2277 | Val 0.1415


Fold 4 | Epoch 34/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [34/220] Train 0.2245 | Val 0.1407


Fold 4 | Epoch 35/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [35/220] Train 0.2221 | Val 0.1424


Fold 4 | Epoch 36/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [36/220] Train 0.2183 | Val 0.1400


Fold 4 | Epoch 37/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [37/220] Train 0.2171 | Val 0.1373
-> Improved Val Loss 0.1373. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 38/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [38/220] Train 0.2123 | Val 0.1376


Fold 4 | Epoch 39/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [39/220] Train 0.2089 | Val 0.1374


Fold 4 | Epoch 40/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [40/220] Train 0.2088 | Val 0.1370
-> Improved Val Loss 0.1370. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 41/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [41/220] Train 0.2036 | Val 0.1366
-> Improved Val Loss 0.1366. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 42/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [42/220] Train 0.2005 | Val 0.1365
-> Improved Val Loss 0.1365. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 43/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [43/220] Train 0.1996 | Val 0.1371


Fold 4 | Epoch 44/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [44/220] Train 0.1930 | Val 0.1372


Fold 4 | Epoch 45/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [45/220] Train 0.1914 | Val 0.1384


Fold 4 | Epoch 46/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [46/220] Train 0.1852 | Val 0.1382


Fold 4 | Epoch 47/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [47/220] Train 0.1867 | Val 0.1393


Fold 4 | Epoch 48/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [48/220] Train 0.1863 | Val 0.1387


Fold 4 | Epoch 49/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [49/220] Train 0.1831 | Val 0.1384


Fold 4 | Epoch 50/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [50/220] Train 0.1862 | Val 0.1383


Fold 4 | Epoch 51/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [51/220] Train 0.2193 | Val 0.1397


Fold 4 | Epoch 52/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [52/220] Train 0.2225 | Val 0.1398


Fold 4 | Epoch 53/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [53/220] Train 0.2227 | Val 0.1375


Fold 4 | Epoch 54/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [54/220] Train 0.2182 | Val 0.1383


Fold 4 | Epoch 55/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [55/220] Train 0.2178 | Val 0.1395


Fold 4 | Epoch 56/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [56/220] Train 0.2156 | Val 0.1390


Fold 4 | Epoch 57/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [57/220] Train 0.2116 | Val 0.1385


Fold 4 | Epoch 58/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [58/220] Train 0.2101 | Val 0.1390


Fold 4 | Epoch 59/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [59/220] Train 0.2082 | Val 0.1375


Fold 4 | Epoch 60/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [60/220] Train 0.2025 | Val 0.1354
-> Improved Val Loss 0.1354. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 61/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [61/220] Train 0.2011 | Val 0.1382


Fold 4 | Epoch 62/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [62/220] Train 0.1992 | Val 0.1380


Fold 4 | Epoch 63/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [63/220] Train 0.2000 | Val 0.1373


Fold 4 | Epoch 64/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [64/220] Train 0.1908 | Val 0.1376


Fold 4 | Epoch 65/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [65/220] Train 0.1919 | Val 0.1387


Fold 4 | Epoch 66/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [66/220] Train 0.1837 | Val 0.1389


Fold 4 | Epoch 67/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [67/220] Train 0.1833 | Val 0.1373


Fold 4 | Epoch 68/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [68/220] Train 0.1798 | Val 0.1380


Fold 4 | Epoch 69/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [69/220] Train 0.1805 | Val 0.1381


Fold 4 | Epoch 70/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [70/220] Train 0.1756 | Val 0.1387


Fold 4 | Epoch 71/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [71/220] Train 0.1735 | Val 0.1400


Fold 4 | Epoch 72/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [72/220] Train 0.1743 | Val 0.1411


Fold 4 | Epoch 73/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [73/220] Train 0.1714 | Val 0.1417


Fold 4 | Epoch 74/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [74/220] Train 0.1745 | Val 0.1415


Fold 4 | Epoch 75/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [75/220] Train 0.1723 | Val 0.1417


Fold 4 | Epoch 76/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [76/220] Train 0.2077 | Val 0.1412


Fold 4 | Epoch 77/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [77/220] Train 0.2106 | Val 0.1379


Fold 4 | Epoch 78/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [78/220] Train 0.2086 | Val 0.1364


Fold 4 | Epoch 79/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [79/220] Train 0.2032 | Val 0.1355


Fold 4 | Epoch 80/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [80/220] Train 0.2050 | Val 0.1373


Fold 4 | Epoch 81/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [81/220] Train 0.2029 | Val 0.1349
-> Improved Val Loss 0.1349. Saved to /kaggle/working/model_best_fold_4.pt


Fold 4 | Epoch 82/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [82/220] Train 0.1990 | Val 0.1362


Fold 4 | Epoch 83/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [83/220] Train 0.1988 | Val 0.1376


Fold 4 | Epoch 84/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [84/220] Train 0.1971 | Val 0.1381


Fold 4 | Epoch 85/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [85/220] Train 0.1925 | Val 0.1382


Fold 4 | Epoch 86/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [86/220] Train 0.1889 | Val 0.1393


Fold 4 | Epoch 87/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [87/220] Train 0.1877 | Val 0.1386


Fold 4 | Epoch 88/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [88/220] Train 0.1859 | Val 0.1378


Fold 4 | Epoch 89/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [89/220] Train 0.1840 | Val 0.1382


Fold 4 | Epoch 90/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [90/220] Train 0.1797 | Val 0.1370


Fold 4 | Epoch 91/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [91/220] Train 0.1790 | Val 0.1383


Fold 4 | Epoch 92/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [92/220] Train 0.1791 | Val 0.1378


Fold 4 | Epoch 93/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [93/220] Train 0.1699 | Val 0.1396


Fold 4 | Epoch 94/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [94/220] Train 0.1699 | Val 0.1391


Fold 4 | Epoch 95/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [95/220] Train 0.1702 | Val 0.1394


Fold 4 | Epoch 96/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [96/220] Train 0.1667 | Val 0.1405


Fold 4 | Epoch 97/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [97/220] Train 0.1649 | Val 0.1411


Fold 4 | Epoch 98/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [98/220] Train 0.1644 | Val 0.1412


Fold 4 | Epoch 99/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [99/220] Train 0.1628 | Val 0.1409


Fold 4 | Epoch 100/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [100/220] Train 0.1619 | Val 0.1405


Fold 4 | Epoch 101/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [101/220] Train 0.1952 | Val 0.1397


Fold 4 | Epoch 102/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [102/220] Train 0.2000 | Val 0.1372


Fold 4 | Epoch 103/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [103/220] Train 0.1956 | Val 0.1386


Fold 4 | Epoch 104/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [104/220] Train 0.1966 | Val 0.1389


Fold 4 | Epoch 105/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [105/220] Train 0.1957 | Val 0.1372


Fold 4 | Epoch 106/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [106/220] Train 0.1913 | Val 0.1384


Fold 4 | Epoch 107/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [107/220] Train 0.1911 | Val 0.1371


Fold 4 | Epoch 108/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [108/220] Train 0.1878 | Val 0.1395


Fold 4 | Epoch 109/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [109/220] Train 0.1836 | Val 0.1380


Fold 4 | Epoch 110/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [110/220] Train 0.1847 | Val 0.1417


Fold 4 | Epoch 111/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 4 | Epoch [111/220] Train 0.1805 | Val 0.1422
Early stopping after 111 epochs.
Fold 4 done. Best Val Loss: 0.1349 | OOF macro-F1: 0.7391
Saved prep_fold_4.joblib

--- Processing Fold 5 ---

Model has 7.3M parameters.


Fold 5 | Epoch 1/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [1/220] Train 0.4619 | Val 0.4599
-> Improved Val Loss 0.4599. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 2/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [2/220] Train 0.3398 | Val 0.3017
-> Improved Val Loss 0.3017. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 3/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [3/220] Train 0.3151 | Val 0.2332
-> Improved Val Loss 0.2332. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 4/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [4/220] Train 0.2930 | Val 0.2008
-> Improved Val Loss 0.2008. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 5/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [5/220] Train 0.2868 | Val 0.1853
-> Improved Val Loss 0.1853. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 6/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [6/220] Train 0.2798 | Val 0.1784
-> Improved Val Loss 0.1784. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 7/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [7/220] Train 0.2692 | Val 0.1712
-> Improved Val Loss 0.1712. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 8/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [8/220] Train 0.2657 | Val 0.1640
-> Improved Val Loss 0.1640. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 9/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [9/220] Train 0.2612 | Val 0.1616
-> Improved Val Loss 0.1616. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 10/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [10/220] Train 0.2553 | Val 0.1604
-> Improved Val Loss 0.1604. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 11/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [11/220] Train 0.2516 | Val 0.1566
-> Improved Val Loss 0.1566. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 12/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [12/220] Train 0.2461 | Val 0.1524
-> Improved Val Loss 0.1524. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 13/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [13/220] Train 0.2437 | Val 0.1504
-> Improved Val Loss 0.1504. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 14/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [14/220] Train 0.2376 | Val 0.1489
-> Improved Val Loss 0.1489. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 15/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [15/220] Train 0.2357 | Val 0.1502


Fold 5 | Epoch 16/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [16/220] Train 0.2340 | Val 0.1486
-> Improved Val Loss 0.1486. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 17/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [17/220] Train 0.2256 | Val 0.1466
-> Improved Val Loss 0.1466. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 18/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [18/220] Train 0.2223 | Val 0.1436
-> Improved Val Loss 0.1436. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 19/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [19/220] Train 0.2200 | Val 0.1459


Fold 5 | Epoch 20/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [20/220] Train 0.2108 | Val 0.1455


Fold 5 | Epoch 21/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [21/220] Train 0.2158 | Val 0.1450


Fold 5 | Epoch 22/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [22/220] Train 0.2140 | Val 0.1450


Fold 5 | Epoch 23/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [23/220] Train 0.2109 | Val 0.1441


Fold 5 | Epoch 24/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [24/220] Train 0.2084 | Val 0.1437


Fold 5 | Epoch 25/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [25/220] Train 0.2098 | Val 0.1440


Fold 5 | Epoch 26/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [26/220] Train 0.2443 | Val 0.1452


Fold 5 | Epoch 27/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [27/220] Train 0.2460 | Val 0.1471


Fold 5 | Epoch 28/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [28/220] Train 0.2403 | Val 0.1469


Fold 5 | Epoch 29/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [29/220] Train 0.2399 | Val 0.1469


Fold 5 | Epoch 30/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [30/220] Train 0.2335 | Val 0.1470


Fold 5 | Epoch 31/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [31/220] Train 0.2356 | Val 0.1445


Fold 5 | Epoch 32/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [32/220] Train 0.2280 | Val 0.1446


Fold 5 | Epoch 33/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [33/220] Train 0.2282 | Val 0.1468


Fold 5 | Epoch 34/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [34/220] Train 0.2249 | Val 0.1455


Fold 5 | Epoch 35/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [35/220] Train 0.2236 | Val 0.1448


Fold 5 | Epoch 36/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [36/220] Train 0.2179 | Val 0.1424
-> Improved Val Loss 0.1424. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 37/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [37/220] Train 0.2173 | Val 0.1414
-> Improved Val Loss 0.1414. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 38/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [38/220] Train 0.2103 | Val 0.1419


Fold 5 | Epoch 39/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [39/220] Train 0.2105 | Val 0.1438


Fold 5 | Epoch 40/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [40/220] Train 0.2083 | Val 0.1415


Fold 5 | Epoch 41/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [41/220] Train 0.2053 | Val 0.1411
-> Improved Val Loss 0.1411. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 42/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [42/220] Train 0.2031 | Val 0.1419


Fold 5 | Epoch 43/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [43/220] Train 0.1995 | Val 0.1424


Fold 5 | Epoch 44/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [44/220] Train 0.1943 | Val 0.1423


Fold 5 | Epoch 45/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [45/220] Train 0.1945 | Val 0.1410
-> Improved Val Loss 0.1410. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 46/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [46/220] Train 0.1885 | Val 0.1412


Fold 5 | Epoch 47/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [47/220] Train 0.1890 | Val 0.1409
-> Improved Val Loss 0.1409. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 48/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [48/220] Train 0.1861 | Val 0.1401
-> Improved Val Loss 0.1401. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 49/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [49/220] Train 0.1863 | Val 0.1399
-> Improved Val Loss 0.1399. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 50/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [50/220] Train 0.1869 | Val 0.1398
-> Improved Val Loss 0.1398. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 51/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [51/220] Train 0.2191 | Val 0.1382
-> Improved Val Loss 0.1382. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 52/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [52/220] Train 0.2214 | Val 0.1418


Fold 5 | Epoch 53/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [53/220] Train 0.2212 | Val 0.1438


Fold 5 | Epoch 54/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [54/220] Train 0.2211 | Val 0.1456


Fold 5 | Epoch 55/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [55/220] Train 0.2198 | Val 0.1447


Fold 5 | Epoch 56/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [56/220] Train 0.2159 | Val 0.1430


Fold 5 | Epoch 57/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [57/220] Train 0.2143 | Val 0.1409


Fold 5 | Epoch 58/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [58/220] Train 0.2099 | Val 0.1388


Fold 5 | Epoch 59/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [59/220] Train 0.2103 | Val 0.1381
-> Improved Val Loss 0.1381. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 60/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [60/220] Train 0.2061 | Val 0.1360
-> Improved Val Loss 0.1360. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 61/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [61/220] Train 0.2043 | Val 0.1372


Fold 5 | Epoch 62/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [62/220] Train 0.2000 | Val 0.1349
-> Improved Val Loss 0.1349. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 63/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [63/220] Train 0.1965 | Val 0.1375


Fold 5 | Epoch 64/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [64/220] Train 0.1929 | Val 0.1388


Fold 5 | Epoch 65/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [65/220] Train 0.1961 | Val 0.1386


Fold 5 | Epoch 66/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [66/220] Train 0.1845 | Val 0.1374


Fold 5 | Epoch 67/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [67/220] Train 0.1853 | Val 0.1385


Fold 5 | Epoch 68/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [68/220] Train 0.1828 | Val 0.1392


Fold 5 | Epoch 69/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [69/220] Train 0.1799 | Val 0.1403


Fold 5 | Epoch 70/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [70/220] Train 0.1766 | Val 0.1392


Fold 5 | Epoch 71/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [71/220] Train 0.1747 | Val 0.1384


Fold 5 | Epoch 72/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [72/220] Train 0.1748 | Val 0.1377


Fold 5 | Epoch 73/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [73/220] Train 0.1758 | Val 0.1379


Fold 5 | Epoch 74/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [74/220] Train 0.1707 | Val 0.1385


Fold 5 | Epoch 75/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [75/220] Train 0.1754 | Val 0.1385


Fold 5 | Epoch 76/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [76/220] Train 0.2053 | Val 0.1381


Fold 5 | Epoch 77/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [77/220] Train 0.2114 | Val 0.1342
-> Improved Val Loss 0.1342. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 78/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [78/220] Train 0.2086 | Val 0.1411


Fold 5 | Epoch 79/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [79/220] Train 0.2043 | Val 0.1382


Fold 5 | Epoch 80/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [80/220] Train 0.2052 | Val 0.1365


Fold 5 | Epoch 81/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [81/220] Train 0.2045 | Val 0.1367


Fold 5 | Epoch 82/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [82/220] Train 0.1950 | Val 0.1393


Fold 5 | Epoch 83/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [83/220] Train 0.2002 | Val 0.1374


Fold 5 | Epoch 84/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [84/220] Train 0.1975 | Val 0.1367


Fold 5 | Epoch 85/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [85/220] Train 0.1940 | Val 0.1382


Fold 5 | Epoch 86/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [86/220] Train 0.1959 | Val 0.1352


Fold 5 | Epoch 87/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [87/220] Train 0.1877 | Val 0.1344


Fold 5 | Epoch 88/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [88/220] Train 0.1862 | Val 0.1353


Fold 5 | Epoch 89/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [89/220] Train 0.1826 | Val 0.1363


Fold 5 | Epoch 90/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [90/220] Train 0.1817 | Val 0.1368


Fold 5 | Epoch 91/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [91/220] Train 0.1748 | Val 0.1379


Fold 5 | Epoch 92/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [92/220] Train 0.1753 | Val 0.1386


Fold 5 | Epoch 93/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [93/220] Train 0.1776 | Val 0.1382


Fold 5 | Epoch 94/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [94/220] Train 0.1708 | Val 0.1365


Fold 5 | Epoch 95/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [95/220] Train 0.1682 | Val 0.1368


Fold 5 | Epoch 96/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [96/220] Train 0.1655 | Val 0.1377


Fold 5 | Epoch 97/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [97/220] Train 0.1641 | Val 0.1376


Fold 5 | Epoch 98/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [98/220] Train 0.1663 | Val 0.1384


Fold 5 | Epoch 99/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [99/220] Train 0.1649 | Val 0.1388


Fold 5 | Epoch 100/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [100/220] Train 0.1648 | Val 0.1384


Fold 5 | Epoch 101/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [101/220] Train 0.1956 | Val 0.1381


Fold 5 | Epoch 102/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [102/220] Train 0.1991 | Val 0.1372


Fold 5 | Epoch 103/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [103/220] Train 0.1980 | Val 0.1357


Fold 5 | Epoch 104/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [104/220] Train 0.1940 | Val 0.1341
-> Improved Val Loss 0.1341. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 105/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [105/220] Train 0.1955 | Val 0.1338
-> Improved Val Loss 0.1338. Saved to /kaggle/working/model_best_fold_5.pt


Fold 5 | Epoch 106/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [106/220] Train 0.1938 | Val 0.1347


Fold 5 | Epoch 107/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [107/220] Train 0.1915 | Val 0.1338


Fold 5 | Epoch 108/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [108/220] Train 0.1885 | Val 0.1357


Fold 5 | Epoch 109/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [109/220] Train 0.1871 | Val 0.1367


Fold 5 | Epoch 110/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [110/220] Train 0.1844 | Val 0.1356


Fold 5 | Epoch 111/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [111/220] Train 0.1842 | Val 0.1368


Fold 5 | Epoch 112/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [112/220] Train 0.1814 | Val 0.1354


Fold 5 | Epoch 113/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [113/220] Train 0.1800 | Val 0.1342


Fold 5 | Epoch 114/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [114/220] Train 0.1747 | Val 0.1351


Fold 5 | Epoch 115/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [115/220] Train 0.1719 | Val 0.1348


Fold 5 | Epoch 116/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [116/220] Train 0.1668 | Val 0.1365


Fold 5 | Epoch 117/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [117/220] Train 0.1698 | Val 0.1353


Fold 5 | Epoch 118/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [118/220] Train 0.1669 | Val 0.1370


Fold 5 | Epoch 119/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [119/220] Train 0.1655 | Val 0.1369


Fold 5 | Epoch 120/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [120/220] Train 0.1603 | Val 0.1374


Fold 5 | Epoch 121/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [121/220] Train 0.1612 | Val 0.1383


Fold 5 | Epoch 122/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [122/220] Train 0.1605 | Val 0.1387


Fold 5 | Epoch 123/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [123/220] Train 0.1553 | Val 0.1383


Fold 5 | Epoch 124/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [124/220] Train 0.1550 | Val 0.1379


Fold 5 | Epoch 125/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [125/220] Train 0.1569 | Val 0.1379


Fold 5 | Epoch 126/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [126/220] Train 0.1890 | Val 0.1344


Fold 5 | Epoch 127/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [127/220] Train 0.1889 | Val 0.1366


Fold 5 | Epoch 128/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [128/220] Train 0.1907 | Val 0.1347


Fold 5 | Epoch 129/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [129/220] Train 0.1896 | Val 0.1362


Fold 5 | Epoch 130/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [130/220] Train 0.1868 | Val 0.1356


Fold 5 | Epoch 131/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [131/220] Train 0.1852 | Val 0.1359


Fold 5 | Epoch 132/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [132/220] Train 0.1824 | Val 0.1351


Fold 5 | Epoch 133/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [133/220] Train 0.1835 | Val 0.1364


Fold 5 | Epoch 134/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [134/220] Train 0.1797 | Val 0.1361


Fold 5 | Epoch 135/220:   0%|          | 0/101 [00:00<?, ?it/s]

Fold 5 | Epoch [135/220] Train 0.1795 | Val 0.1374
Early stopping after 135 epochs.
Fold 5 done. Best Val Loss: 0.1338 | OOF macro-F1: 0.7422
Saved prep_fold_5.joblib

 Hoàn tất 5-fold CV.
