In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV inputs and result
# folders used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip -q install optuna tqdm ipywidgets pytorch-tabnet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# =========================
# Self-Attention S1 Baseline (with Early Stopping)
# =========================
import os, json, warnings, random, pickle
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, accuracy_score, precision_recall_curve

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ---------- Config ----------
# Name of the label column read from every CSV (cast to int by the pipelines).
TARGET_COLUMN     = 'binary_helpfulness'
# Fraction of each dataset held out as the test split.
TEST_SPLIT_RATIO  = 0.2
# Seed shared by the RNG seeding helper and every sklearn split.
RANDOM_STATE      = 42
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Device = {DEVICE}")

# Platform name -> CSV location on the mounted Google Drive.
FILE_PATHS = {
    "Amazon":  '/content/drive/MyDrive/1014/data/new_amazon.csv',
    "Coursera":'/content/drive/MyDrive/1014/data/new_coursera.csv',
    "Audible": '/content/drive/MyDrive/1014/data/new_audible.csv',
    "Hotel":   '/content/drive/MyDrive/1014/data/new_hotel.csv'
}
# Platform name -> feature columns requested for that platform.
# The pipelines keep only the names that actually exist in the loaded CSV.
S1_FEATURES = {
    "Amazon":  ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Price','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Is_Photo','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Coursera":['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Num_of_Reviews','Num_of_Enrolled','Num_of_top_instructor_courses','Num_of_top_instructor_learners','Text_Length','Valence','Arousal','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Audible": ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Hotel":   ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth','Is_Photo','Hotel_Grade','Employee_Friendliness_Score','Facility_Score','Cleanliness_Score','Comfort_Score','Value_For_Money_Score','Location_Score']
}

def set_seed(seed=RANDOM_STATE):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied, in order, to Python's `random`, NumPy's
            global generator, PyTorch's CPU generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at import time so the cells below start from a known RNG state.
set_seed()

def _make_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce every column to a numeric dtype and impute gaps with the column median.

    Values that cannot be parsed as numbers become NaN first and are then
    replaced by the median of their own column. A column without a single
    parseable value has a NaN median and therefore stays NaN.

    Args:
        df: Frame whose columns should all be treated as numeric features.

    Returns:
        A new frame with the same columns and index; `df` is not modified.
    """
    coerced = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))
    column_medians = coerced.median()
    return coerced.fillna(column_medians)

def find_best_threshold(y_true, prob):
    """Pick the decision threshold that maximises F1 along the precision-recall curve.

    Args:
        y_true: Binary ground-truth labels.
        prob: Predicted scores/probabilities for the positive class.

    Returns:
        Tuple `(threshold, f1)` as Python floats: the selected threshold and
        the F1 value computed at that point of the curve.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, prob)
    # A trailing 1.0 is appended so that `thresholds` can be indexed in step
    # with the precision/recall arrays.
    thresholds = np.concatenate([thresholds, np.array([1.0])])
    # Clip the denominator so a point with precision == recall == 0 yields 0, not NaN.
    denominator = np.clip(precision + recall, 1e-9, None)
    f1_curve = (2 * precision * recall) / denominator
    best = int(np.nanargmax(f1_curve))
    return float(thresholds[best]), float(f1_curve[best])

# ---------- Model ----------
class S1EmbeddingLayerWithFeatureEncoding(nn.Module):
    """Turn each scalar feature of a row into its own embedding vector ("token").

    Every feature column has a dedicated projection
    (Linear(1, D) -> LayerNorm -> ReLU -> Dropout) plus a learned per-feature
    "type" embedding that is added to the projected value.

    Args:
        n_features: Number of feature columns (one projection / type vector each).
        embedding_dim: Size D of every token.
        dropout: Dropout probability used inside each projection.
    """

    def __init__(self, n_features, embedding_dim, dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.embedding_dim = embedding_dim

        def build_projection():
            # scalar -> D-dimensional token for a single feature column
            return nn.Sequential(
                nn.Linear(1, embedding_dim),
                nn.LayerNorm(embedding_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            )

        self.feature_projections = nn.ModuleList([build_projection() for _ in range(n_features)])
        self.feature_type_embedding = nn.Embedding(n_features, embedding_dim)

    def forward(self, x):
        """Embed a 2-D batch `x` of shape (B, F) into tokens of shape (B, F, D)."""
        _, n_cols = x.shape
        # Column j keeps a trailing singleton axis so Linear(1, D) can consume it.
        tokens = torch.stack(
            [self.feature_projections[j](x[:, j:j + 1]) for j in range(n_cols)],
            dim=1,
        )                                                     # (B, F, D)
        feature_ids = torch.arange(n_cols, device=x.device)
        type_vectors = self.feature_type_embedding(feature_ids)  # (F, D)
        # Broadcast the per-feature type vectors over the batch axis.
        return tokens + type_vectors.unsqueeze(0)             # (B, F, D)

class S1_SelfAttention_Model(nn.Module):
    """Transformer encoder over per-feature tokens that emits one logit per row.

    Pipeline: per-feature embeddings -> prepend a learnable CLS token -> add a
    learnable positional table -> pre-norm TransformerEncoder -> dropout ->
    MLP head on the CLS output (optionally concatenated with the mean of the
    feature-token outputs).

    Args:
        n_s1_features: Number of input feature columns.
        embedding_dim: Token width (d_model of the encoder).
        n_layers: Number of encoder layers.
        n_heads: Attention heads per layer.
        dim_ff: Hidden width of the encoder feed-forward sub-layer.
        attn_dropout: Passed as `dropout` to `nn.TransformerEncoderLayer`.
        resid_dropout: Dropout used in the feature projections, after the
            encoder and inside the head.
        use_mean: If true, the head sees `[CLS, mean(feature tokens)]`;
            otherwise only the CLS output.
        head_hidden: Hidden width of the classification head.
    """

    def __init__(self, n_s1_features, embedding_dim=128, n_layers=2, n_heads=4,
                 dim_ff=256, attn_dropout=0.1, resid_dropout=0.1,
                 use_mean=True, head_hidden=256):
        super().__init__()
        self.use_mean = use_mean

        # One token per scalar feature.
        self.s1_embedding = S1EmbeddingLayerWithFeatureEncoding(n_s1_features, embedding_dim, resid_dropout)

        # Learnable CLS token and positional table (F + 1 slots), both with a
        # small truncated-normal initialisation.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embedding_dim))
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        self.pos_embedding = nn.Parameter(torch.zeros(1, n_s1_features + 1, embedding_dim))
        nn.init.trunc_normal_(self.pos_embedding, std=0.02)

        encoder_block = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dim_feedforward=dim_ff,
            dropout=attn_dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_block, num_layers=n_layers, enable_nested_tensor=False)
        self.dropout = nn.Dropout(resid_dropout)

        # The head input doubles when the mean-pooled tokens are concatenated.
        head_in = embedding_dim * 2 if use_mean else embedding_dim
        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, head_hidden),
            nn.GELU(),
            nn.Dropout(resid_dropout),
            nn.Linear(head_hidden, 1),
        )

    def forward(self, x):
        """Map features `x` of shape (B, F) to raw logits of shape (B, 1)."""
        feature_tokens = self.s1_embedding(x)                     # (B, F, D)
        cls = self.cls_token.expand(x.size(0), -1, -1)            # (B, 1, D)
        sequence = torch.cat([cls, feature_tokens], dim=1)        # (B, F+1, D)
        sequence = sequence + self.pos_embedding[:, :sequence.size(1), :]
        encoded = self.dropout(self.encoder(sequence))            # (B, F+1, D)

        cls_out = encoded[:, 0]
        if not self.use_mean:
            return self.head(cls_out)
        # Position 0 is the CLS slot; average only the feature tokens.
        pooled = encoded[:, 1:].mean(1)
        return self.head(torch.cat([cls_out, pooled], dim=-1))

    @torch.no_grad()
    def predict_proba(self, x):
        """Return sigmoid probabilities of shape (B,) without tracking gradients.

        This method does not change the train/eval mode of the module; call
        `eval()` beforehand when dropout should be disabled.
        """
        logits = self.forward(x)
        return torch.sigmoid(logits).squeeze(-1)

# ---------- Train utils ----------
class NumpyDataset(Dataset):
    """Minimal map-style dataset that pairs features `X` with targets `y` by position."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        # The dataset length is defined by the feature container.
        return len(self.X)

    def __getitem__(self, i):
        features, target = self.X[i], self.y[i]
        return features, target

def train_one_run(X_tr, y_tr, X_va, y_va, params):
    """Fit one S1_SelfAttention_Model with early stopping on validation PR-AUC.

    Args:
        X_tr, y_tr: Training features / binary labels (array-likes convertible
            to float32 tensors).
        X_va, y_va: Validation features / labels used only for early stopping.
        params: Dict of hyper-parameters. Recognised keys (defaults in
            parentheses): batch_size (1024), lr (2e-4), weight_decay (1e-4),
            max_epochs (50), patience (7), embedding_dim (128), n_layers (2),
            n_heads (4), dim_ff (256), attn_dropout (0.1), resid_dropout (0.1),
            use_mean (True), head_hidden (256).

    Returns:
        `(model, best_pr)`: the model with the weights of its best validation
        epoch restored (if any epoch improved) and that epoch's average
        precision. `best_pr` stays at -1.0 when no epoch ever improved.
    """
    option = params.get
    batch_size = option("batch_size", 1024)
    learning_rate = option("lr", 2e-4)
    weight_decay = option("weight_decay", 1e-4)
    max_epochs = option("max_epochs", 50)
    patience = option("patience", 7)

    model = S1_SelfAttention_Model(
        n_s1_features=X_tr.shape[1],
        embedding_dim=option("embedding_dim", 128),
        n_layers=option("n_layers", 2),
        n_heads=option("n_heads", 4),
        dim_ff=option("dim_ff", 256),
        attn_dropout=option("attn_dropout", 0.1),
        resid_dropout=option("resid_dropout", 0.1),
        use_mean=option("use_mean", True),
        head_hidden=option("head_hidden", 256),
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    def make_loader(features, targets, size, shuffle):
        # Both features and targets are materialised as float32 tensors.
        dataset = NumpyDataset(torch.tensor(features, dtype=torch.float32),
                               torch.tensor(targets, dtype=torch.float32))
        return DataLoader(dataset, batch_size=size, shuffle=shuffle, drop_last=False)

    train_loader = make_loader(X_tr, y_tr, batch_size, True)
    # Validation only runs forward passes, so a larger fixed batch is used.
    val_loader = make_loader(X_va, y_va, 4096, False)

    best_pr, best_state, stale_epochs = -1.0, None, 0
    for _epoch in range(1, max_epochs + 1):
        # ---- one optimisation pass over the training data
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            loss = criterion(model(xb).squeeze(-1), yb)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # ---- validation PR-AUC (average precision)
        model.eval()
        val_probs, val_targets = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                val_probs.append(model.predict_proba(xb.to(DEVICE)).cpu().numpy())
                val_targets.append(yb.numpy())
        pr = average_precision_score(np.concatenate(val_targets), np.concatenate(val_probs))

        if pr > best_pr + 1e-6:
            # New best epoch: snapshot the weights on the CPU and reset the counter.
            best_pr, stale_epochs = pr, 0
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_pr

# ---------- Pipeline ----------
def run_baseline(platform, csv_path, features, save_root="/content/drive/MyDrive/1014/result_selfattn_baseline"):
    """Train, evaluate and persist the fixed-hyper-parameter baseline for one platform.

    Args:
        platform: Display name; also the sub-folder of `save_root` that
            receives the artefacts.
        csv_path: CSV file containing `TARGET_COLUMN` and the feature columns.
        features: Requested feature column names. Names that are absent from
            the CSV are skipped and reported on stdout.
        save_root: Root output directory.

    Raises:
        ValueError: If none of the requested features exists in the CSV.

    Side effects:
        Prints the test metrics and writes `s1_pred_proba.csv`, `results.json`,
        `selfattn_model.pt` and `scaler.pkl` into `{save_root}/{platform}`.
    """
    print(f"\n=== {platform} ===")
    df = pd.read_csv(csv_path)
    labels = df[TARGET_COLUMN].astype(int).values

    cols = [c for c in features if c in df.columns]
    missing = [c for c in features if c not in df.columns]
    if missing:
        # print() rather than warnings.warn(): this notebook silences all warnings globally.
        print(f"⚠️ {platform}: {len(missing)} requested feature(s) not found in CSV and skipped: {missing}")
    if not cols:
        raise ValueError(f"{platform}: none of the requested features exist in {csv_path}")

    # NOTE(review): the median imputation below runs on the full frame, i.e.
    # before the train/test split, so test rows contribute to the medians.
    X_all = _make_numeric_df(df[cols]).to_numpy()

    idx = np.arange(len(df))
    tr_idx, te_idx = train_test_split(idx, test_size=TEST_SPLIT_RATIO,
                                      random_state=RANDOM_STATE, stratify=labels)
    X_tr_raw, X_te_raw = X_all[tr_idx], X_all[te_idx]
    y_tr, y_te = labels[tr_idx], labels[te_idx]

    # Scaler is fit on the training portion only (never on test rows).
    # NOTE(review): that portion still includes the 10% early-stopping
    # validation rows carved out below.
    scaler = StandardScaler().fit(X_tr_raw)
    X_tr = scaler.transform(X_tr_raw)
    X_te = scaler.transform(X_te_raw)

    # Hold out an internal 10% of the training portion as the validation set.
    sub, val = train_test_split(np.arange(len(y_tr)), test_size=0.1,
                                random_state=RANDOM_STATE, stratify=y_tr)
    model, _ = train_one_run(X_tr[sub], y_tr[sub], X_tr[val], y_tr[val], {
        "embedding_dim": 128, "n_layers": 2, "n_heads": 4,
        "dim_ff": 256, "attn_dropout": 0.1, "resid_dropout": 0.1,
        "head_hidden": 256, "use_mean": True,
        "lr": 2e-4, "weight_decay": 1e-4,
        "batch_size": 1024, "max_epochs": 50, "patience": 7
    })

    # ---- test-set evaluation
    with torch.no_grad():
        te_prob = model.predict_proba(torch.tensor(X_te, dtype=torch.float32).to(DEVICE)).cpu().numpy()
        tr_prob = model.predict_proba(torch.tensor(X_tr, dtype=torch.float32).to(DEVICE)).cpu().numpy()
    # The F1-optimal threshold is chosen on the training portion (fit rows plus
    # validation rows) and then applied unchanged to the test probabilities.
    best_th, _ = find_best_threshold(y_tr, tr_prob)
    te_pred = (te_prob >= best_th).astype(int)

    metrics = {
        "Accuracy": float(accuracy_score(y_te, te_pred)),
        "PR_AUC":   float(average_precision_score(y_te, te_prob)),
        "ROC_AUC":  float(roc_auc_score(y_te, te_prob)),
        "F1_score": float(f1_score(y_te, te_pred)),
        "Best_Threshold": float(best_th)
    }
    print("Metrics:", metrics)

    # ---- persist predictions, metrics, weights and the fitted scaler
    save_dir = f"{save_root}/{platform}"
    os.makedirs(save_dir, exist_ok=True)
    pd.DataFrame({"index": te_idx, "s1_pred_proba": te_prob, "y_true": y_te,
                  "y_pred_at_best_th": te_pred}).to_csv(f"{save_dir}/s1_pred_proba.csv", index=False)
    with open(f"{save_dir}/results.json", "w") as f:
        json.dump(metrics, f, indent=2)
    torch.save(model.state_dict(), f"{save_dir}/selfattn_model.pt")
    with open(f"{save_dir}/scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
    print(f"Saved to {save_dir}")

# ---- run all
# Train and evaluate the baseline once per platform, pairing each CSV path
# with the feature list registered under the same platform key.
for platform, path in FILE_PATHS.items():
    run_baseline(platform, path, S1_FEATURES[platform])

✅ Device = cuda

=== Amazon ===
Metrics: {'Accuracy': 0.8874124318914711, 'PR_AUC': 0.4423823616605832, 'ROC_AUC': 0.8293833600002343, 'F1_score': 0.44837918823208933, 'Best_Threshold': 0.26705095171928406}
Saved to /content/drive/MyDrive/1014/result_selfattn_baseline/Amazon

=== Coursera ===
Metrics: {'Accuracy': 0.9505725348051735, 'PR_AUC': 0.45945118341144475, 'ROC_AUC': 0.9079666586586554, 'F1_score': 0.45305378304466726, 'Best_Threshold': 0.253726601600647}
Saved to /content/drive/MyDrive/1014/result_selfattn_baseline/Coursera

=== Audible ===
Metrics: {'Accuracy': 0.9097752446499624, 'PR_AUC': 0.34466288991550603, 'ROC_AUC': 0.8014308485629043, 'F1_score': 0.3775964391691395, 'Best_Threshold': 0.17664691805839539}
Saved to /content/drive/MyDrive/1014/result_selfattn_baseline/Audible

=== Hotel ===
Metrics: {'Accuracy': 0.8033070778168817, 'PR_AUC': 0.24491487655657249, 'ROC_AUC': 0.7114614503559873, 'F1_score': 0.28536634869088695, 'Best_Threshold': 0.134362131357193}
Saved to /

# Optuna 튜닝 + 5-fold CV(early stopping + fold별 Scaler)

In [None]:
# =========================
# Self-Attention + Optuna Tuning (5-fold CV, no leakage)
# =========================
import optuna
from sklearn.model_selection import StratifiedKFold

N_TRIALS = 30  # number of Optuna trials run for each platform; adjust as needed

def cv_objective_factory(X_train, y_train):
    """Build an Optuna objective scoring a hyper-parameter set by 5-fold CV.

    Args:
        X_train: Unscaled training feature matrix (indexable by row arrays).
        y_train: Matching binary labels.

    Returns:
        A callable `objective(trial) -> float` that returns the mean of the
        best validation PR-AUC reached in each of the five stratified folds.
    """

    def objective(trial):
        # Searched values come from the trial; max_epochs / patience are fixed.
        params = dict(
            embedding_dim=trial.suggest_categorical("embedding_dim", [64, 96, 128, 160]),
            n_layers=trial.suggest_int("n_layers", 1, 4),
            n_heads=trial.suggest_categorical("n_heads", [2, 4, 8]),
            dim_ff=trial.suggest_categorical("dim_ff", [128, 256, 384, 512]),
            attn_dropout=trial.suggest_float("attn_dropout", 0.0, 0.3),
            resid_dropout=trial.suggest_float("resid_dropout", 0.0, 0.3),
            head_hidden=trial.suggest_categorical("head_hidden", [128, 256, 384]),
            use_mean=trial.suggest_categorical("use_mean", [True, False]),
            lr=trial.suggest_float("lr", 1e-4, 5e-3, log=True),
            weight_decay=trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
            batch_size=trial.suggest_categorical("batch_size", [512, 1024, 2048]),
            max_epochs=60,
            patience=8,
        )

        splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
        fold_scores = []
        for fit_rows, held_rows in splitter.split(X_train, y_train):
            # The scaler is re-fit inside every fold on that fold's training
            # rows only, so held-out rows never influence the scaling statistics.
            fold_scaler = StandardScaler().fit(X_train[fit_rows])
            model, fold_pr = train_one_run(
                fold_scaler.transform(X_train[fit_rows]), y_train[fit_rows],
                fold_scaler.transform(X_train[held_rows]), y_train[held_rows],
                params,
            )
            fold_scores.append(fold_pr)
            # Drop the fold's model and release cached GPU memory before the next fold.
            del model
            torch.cuda.empty_cache()
        return float(np.mean(fold_scores))

    return objective

def run_tuned(platform, csv_path, features, save_root="/content/drive/MyDrive/1014/result_selfattn_tuned"):
    """Tune hyper-parameters with Optuna, retrain, evaluate and persist for one platform.

    Steps: load CSV -> stratified train/test split -> Optuna search scored by
    the 5-fold CV objective on the training portion -> final fit on 90% of the
    training portion with 10% held out for early stopping -> test metrics.

    Args:
        platform: Display name; also the sub-folder of `save_root` that
            receives the artefacts.
        csv_path: CSV file containing `TARGET_COLUMN` and the feature columns.
        features: Requested feature column names. Names that are absent from
            the CSV are skipped and reported on stdout.
        save_root: Root output directory.

    Raises:
        ValueError: If none of the requested features exists in the CSV.

    Side effects:
        Prints progress/metrics and writes `s1_pred_proba.csv`, `results.json`
        (including the tuned `best_params`), `selfattn_model.pt` and
        `scaler.pkl` into `{save_root}/{platform}`.
    """
    print(f"\n=== [Tuned] {platform} ===")
    df = pd.read_csv(csv_path)
    labels = df[TARGET_COLUMN].astype(int).values

    cols = [c for c in features if c in df.columns]
    missing = [c for c in features if c not in df.columns]
    if missing:
        # print() rather than warnings.warn(): this notebook silences all warnings globally.
        print(f"⚠️ {platform}: {len(missing)} requested feature(s) not found in CSV and skipped: {missing}")
    if not cols:
        raise ValueError(f"{platform}: none of the requested features exist in {csv_path}")

    # NOTE(review): the median imputation below runs on the full frame, i.e.
    # before the train/test split, so test rows contribute to the medians.
    X_all = _make_numeric_df(df[cols]).to_numpy()

    idx = np.arange(len(df))
    tr_idx, te_idx = train_test_split(idx, test_size=TEST_SPLIT_RATIO,
                                      random_state=RANDOM_STATE, stratify=labels)
    X_tr_raw, X_te_raw = X_all[tr_idx], X_all[te_idx]
    y_tr, y_te = labels[tr_idx], labels[te_idx]

    # ----- Optuna CV
    # Seed the sampler so the sequence of proposed configurations does not
    # additionally depend on an unseeded random generator.
    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
    study.optimize(cv_objective_factory(X_tr_raw, y_tr), n_trials=N_TRIALS)
    best_params = study.best_params
    print("Best Params:", best_params)

    # study.best_params only holds values proposed through trial.suggest_*.
    # The training budget used while tuning (max_epochs=60, patience=8) is a
    # fixed entry of the CV objective and is therefore absent from it; without
    # re-adding it the final fit would silently fall back to train_one_run's
    # defaults (50 / 7) and train under a different regime than the one tuned.
    final_params = {**best_params, "max_epochs": 60, "patience": 8}

    # ----- Final train (train 90% / val 10% split for early stopping)
    sub, val = train_test_split(np.arange(len(y_tr)), test_size=0.1,
                                random_state=RANDOM_STATE, stratify=y_tr)
    # The final scaler is fit on the 90% fit rows only.
    sc_final = StandardScaler().fit(X_tr_raw[sub])
    X_sub = sc_final.transform(X_tr_raw[sub])
    X_val = sc_final.transform(X_tr_raw[val])
    X_te  = sc_final.transform(X_te_raw)
    y_sub, y_val = y_tr[sub], y_tr[val]

    model, _ = train_one_run(X_sub, y_sub, X_val, y_val, final_params)

    # ----- Evaluate
    with torch.no_grad():
        te_prob = model.predict_proba(torch.tensor(X_te, dtype=torch.float32).to(DEVICE)).cpu().numpy()
        tr_prob = model.predict_proba(torch.tensor(sc_final.transform(X_tr_raw), dtype=torch.float32).to(DEVICE)).cpu().numpy()
    # The F1-optimal threshold is chosen on the whole training portion (fit
    # rows plus validation rows) and applied unchanged to the test probabilities.
    best_th, _ = find_best_threshold(y_tr, tr_prob)
    te_pred = (te_prob >= best_th).astype(int)

    metrics = {
        "Accuracy": float(accuracy_score(y_te, te_pred)),
        "PR_AUC":   float(average_precision_score(y_te, te_prob)),
        "ROC_AUC":  float(roc_auc_score(y_te, te_prob)),
        "F1_score": float(f1_score(y_te, te_pred)),
        "Best_Threshold": float(best_th),
        "best_params": best_params
    }
    print("Metrics:", metrics)

    # ----- persist predictions, metrics, weights and the fitted scaler
    save_dir = f"{save_root}/{platform}"
    os.makedirs(save_dir, exist_ok=True)
    pd.DataFrame({"index": te_idx, "s1_pred_proba": te_prob, "y_true": y_te,
                  "y_pred_at_best_th": te_pred}).to_csv(f"{save_dir}/s1_pred_proba.csv", index=False)
    with open(f"{save_dir}/results.json", "w") as f:
        json.dump(metrics, f, indent=2)
    torch.save(model.state_dict(), f"{save_dir}/selfattn_model.pt")
    with open(f"{save_dir}/scaler.pkl", "wb") as f:
        pickle.dump(sc_final, f)
    print(f"Saved to {save_dir}")

# ---- run all (tuned)
# Run the Optuna-tuned pipeline once per platform; every call performs its own
# study of N_TRIALS trials before the final fit.
for platform, path in FILE_PATHS.items():
    run_tuned(platform, path, S1_FEATURES[platform])


=== [Tuned] Amazon ===


[I 2025-10-13 09:30:58,646] A new study created in memory with name: no-name-82ab4fc1-cd77-47e4-8da9-0dc09bb283c4
[I 2025-10-13 09:38:26,086] Trial 0 finished with value: 0.43799284265368427 and parameters: {'embedding_dim': 96, 'n_layers': 3, 'n_heads': 4, 'dim_ff': 384, 'attn_dropout': 0.0999859604605935, 'resid_dropout': 0.09673533855880047, 'head_hidden': 384, 'use_mean': False, 'lr': 0.0002334761858808828, 'weight_decay': 4.284270583620203e-06, 'batch_size': 1024}. Best is trial 0 with value: 0.43799284265368427.
[I 2025-10-13 09:45:13,235] Trial 1 finished with value: 0.4139311783045592 and parameters: {'embedding_dim': 64, 'n_layers': 2, 'n_heads': 8, 'dim_ff': 384, 'attn_dropout': 0.1224900618383829, 'resid_dropout': 0.19855560219190987, 'head_hidden': 128, 'use_mean': False, 'lr': 0.00044384895862144974, 'weight_decay': 0.0005197075608589421, 'batch_size': 1024}. Best is trial 0 with value: 0.43799284265368427.
[I 2025-10-13 09:54:06,316] Trial 2 finished with value: 0.3765203

Best Params: {'embedding_dim': 128, 'n_layers': 2, 'n_heads': 8, 'dim_ff': 512, 'attn_dropout': 0.060221876632914985, 'resid_dropout': 0.04577159662728221, 'head_hidden': 256, 'use_mean': True, 'lr': 0.00019165909280524868, 'weight_decay': 0.0006400421802097018, 'batch_size': 512}
Metrics: {'Accuracy': 0.8896363838541087, 'PR_AUC': 0.4900413814890722, 'ROC_AUC': 0.8434756212145808, 'F1_score': 0.470242860955431, 'Best_Threshold': 0.2471323162317276, 'best_params': {'embedding_dim': 128, 'n_layers': 2, 'n_heads': 8, 'dim_ff': 512, 'attn_dropout': 0.060221876632914985, 'resid_dropout': 0.04577159662728221, 'head_hidden': 256, 'use_mean': True, 'lr': 0.00019165909280524868, 'weight_decay': 0.0006400421802097018, 'batch_size': 512}}
Saved to /content/drive/MyDrive/1014/result_selfattn_tuned/Amazon

=== [Tuned] Coursera ===


[I 2025-10-13 14:00:16,567] A new study created in memory with name: no-name-b91ddfd2-f404-4822-93ed-f567abc6c658
[I 2025-10-13 14:06:24,028] Trial 0 finished with value: 0.40106296419399545 and parameters: {'embedding_dim': 96, 'n_layers': 1, 'n_heads': 8, 'dim_ff': 512, 'attn_dropout': 0.26817908103657906, 'resid_dropout': 0.25124710013279544, 'head_hidden': 384, 'use_mean': False, 'lr': 0.004069212622457329, 'weight_decay': 2.6923589475885083e-05, 'batch_size': 2048}. Best is trial 0 with value: 0.40106296419399545.
[I 2025-10-13 14:14:38,674] Trial 1 finished with value: 0.46157275242409346 and parameters: {'embedding_dim': 160, 'n_layers': 2, 'n_heads': 8, 'dim_ff': 384, 'attn_dropout': 0.2866816032054776, 'resid_dropout': 0.13539364433928333, 'head_hidden': 128, 'use_mean': False, 'lr': 0.0005758228474121655, 'weight_decay': 0.00026629964650972646, 'batch_size': 1024}. Best is trial 1 with value: 0.46157275242409346.
[I 2025-10-13 14:25:10,261] Trial 2 finished with value: 0.4077

Best Params: {'embedding_dim': 64, 'n_layers': 4, 'n_heads': 4, 'dim_ff': 256, 'attn_dropout': 0.0640665012031447, 'resid_dropout': 0.0012123683849555298, 'head_hidden': 128, 'use_mean': True, 'lr': 0.0019133905024305065, 'weight_decay': 9.462386314687875e-05, 'batch_size': 2048}
Metrics: {'Accuracy': 0.953414614053876, 'PR_AUC': 0.4782924317382559, 'ROC_AUC': 0.914723283187014, 'F1_score': 0.4682651622002821, 'Best_Threshold': 0.24127382040023804, 'best_params': {'embedding_dim': 64, 'n_layers': 4, 'n_heads': 4, 'dim_ff': 256, 'attn_dropout': 0.0640665012031447, 'resid_dropout': 0.0012123683849555298, 'head_hidden': 128, 'use_mean': True, 'lr': 0.0019133905024305065, 'weight_decay': 9.462386314687875e-05, 'batch_size': 2048}}
Saved to /content/drive/MyDrive/1014/result_selfattn_tuned/Coursera

=== [Tuned] Audible ===


[I 2025-10-13 17:55:11,301] A new study created in memory with name: no-name-5c1994c6-0555-4654-874a-3fce7a9c6489
[I 2025-10-13 18:05:52,769] Trial 0 finished with value: 0.3392689350947518 and parameters: {'embedding_dim': 96, 'n_layers': 4, 'n_heads': 8, 'dim_ff': 256, 'attn_dropout': 0.026958728837422063, 'resid_dropout': 0.1708818346118249, 'head_hidden': 256, 'use_mean': True, 'lr': 0.0012921111962755226, 'weight_decay': 2.00608120294013e-05, 'batch_size': 512}. Best is trial 0 with value: 0.3392689350947518.
[I 2025-10-13 18:13:15,198] Trial 1 finished with value: 0.32114060068213035 and parameters: {'embedding_dim': 128, 'n_layers': 2, 'n_heads': 2, 'dim_ff': 384, 'attn_dropout': 0.10205303146115947, 'resid_dropout': 0.053774434017270785, 'head_hidden': 384, 'use_mean': False, 'lr': 0.0007134134848229149, 'weight_decay': 2.651805877226465e-05, 'batch_size': 512}. Best is trial 0 with value: 0.3392689350947518.
[W 2025-10-13 18:15:13,614] Trial 2 failed with parameters: {'embeddi

KeyboardInterrupt: 