<h1 style='text-align: center;'> Part B - Tweets Fine-tuning, model training and compression with comparison </h1>
<h3 style='text-align: center;'> Group T, IDs: 316398387 ,318481447</h3>

Building on the preprocessing done in the previous part, we approach the task with two models: an **encoder-only** model and a **decoder-only** model.
<h6 style='text-align: left;'>Imports (similar to the previous part):</h6>




In [58]:
%%capture 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, AutoTokenizer, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import seaborn as sns
from decouple import Config, RepositoryEnv 
import emoji
import optuna
from langdetect import detect, DetectorFactory
from ftfy import fix_text
import re
import nltk
from nltk.corpus import stopwords
import wandb
from wordcloud import WordCloud
from collections import Counter
import re
import warnings
import optuna
# Silence all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Do not truncate long text cells when previewing DataFrames.
pd.set_option('display.max_colwidth', None)
# Log in to Weights & Biases with the key stored under 'wandb_api_key' in ./.env.
# If W&B is not needed, comment this line out, or change it to wandb.login() and paste the API key.
wandb.login(key=Config(RepositoryEnv("./.env")).get('wandb_api_key'));

In [50]:
# Load the tweets produced by the preprocessing part.
df = pd.read_csv("preprocessed_tweets.csv"); df.head(); # The preprocessed tweets CSV should be in the same root folder after cloning

In [51]:
# Sentiment classes; the position of each class in this list is its integer id.
CLASS_TEXTS = ['extremely negative','negative','neutral','positive','extremely positive']  # order matters
LABEL2ID = {s:i for i,s in enumerate(CLASS_TEXTS)}
ID2LABEL = {i:s for s,i in LABEL2ID.items()}

# 70% train; the held-out 30% is split 60/40 into validation and test.
# Both splits are stratified on the 'Sentiment' column and use a fixed seed.
train_df, tmp = train_test_split(df, test_size=0.3, stratify=df['Sentiment'], random_state=42)
val_df, test_df = train_test_split(tmp, test_size=0.4, stratify=tmp['Sentiment'], random_state=42)
train_df.shape, val_df.shape, test_df.shape

((28609, 2), (7356, 2), (4905, 2))

### Decoder-only: TinyLlama model

We start with the decoder-only model, whose training samples must be written as an instruction-style prompt. 
We train the decoder generatively on text of the form: 
"Tweet: {text}\nSentiment: {label_text}", and we mask the prompt part so that the loss is computed only on the label tokens.
<br>
The **Raw Pytorch fine-tuning**

In [55]:
# Decoder checkpoint used for the generative sentiment task.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # swap to Mistral/Llama-3 if you have VRAM
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Pad on the right; the training collators in this notebook append padding after the tokens.
tokenizer.padding_side = "right"
# Reuse EOS as the pad token when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def build_prompt(text: str) -> str:
    """Return the instruction prompt for one tweet, ending right before the label."""
    return "Tweet: {}\nSentiment:".format(text)

def build_target(label_text: str) -> str:
    """Return the completion the decoder must produce: the label followed by EOS."""
    eos = tokenizer.eos_token
    return label_text + eos

class GenTweetSet(torch.utils.data.Dataset):
    """Causal-LM dataset of (prompt + target) token sequences with the prompt masked.

    Each item is a dict of plain Python lists: ``input_ids``, ``attention_mask``
    and ``labels``. Prompt positions carry ``-100`` in ``labels``; target
    positions carry the target token ids.
    """

    def __init__(self, df, tokenizer, max_len=160):
        self.tok = tokenizer
        self.max_len = max_len
        # Prompts and targets are kept as text; tokenization happens lazily per item.
        self.prompts = list(map(build_prompt, df["clean_text"]))
        self.targets = list(map(build_target, df["Sentiment"]))

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, i):
        src_ids = self.tok(self.prompts[i], add_special_tokens=False)["input_ids"]
        tgt_ids = self.tok(self.targets[i], add_special_tokens=False)["input_ids"]

        if len(src_ids) + len(tgt_ids) > self.max_len:
            # Over budget: trim the end of the prompt first so the target is kept,
            # and only trim the target if it alone exceeds max_len.
            n_src = max(0, self.max_len - len(tgt_ids))
            src_ids = src_ids[:n_src]
            tgt_ids = tgt_ids[:self.max_len - n_src]

        token_ids = src_ids + tgt_ids
        return {
            "input_ids": token_ids,
            "attention_mask": [1] * len(token_ids),
            "labels": [-100] * len(src_ids) + tgt_ids,
        }

def pad_collate(batch, pad_id=None):
    """Right-pad a list of tokenized examples to a common length and tensorize them.

    Args:
        batch: list of dicts with list-valued ``input_ids``, ``attention_mask``
            and ``labels`` of equal length per example.
        pad_id: token id used to pad ``input_ids``. Defaults to the pad token of
            the module-level ``tokenizer``.

    Returns:
        Dict with the same three keys, each a 2-D ``torch.Tensor``. Padded
        positions get ``pad_id`` / ``0`` / ``-100`` respectively, so they are
        ignored by attention and by the loss.
    """
    if pad_id is None:
        # Must be the module-level `tokenizer`; no object named `tok` exists at module scope.
        pad_id = tokenizer.pad_token_id
    maxlen = max(len(x["input_ids"]) for x in batch)
    out = {"input_ids": [], "attention_mask": [], "labels": []}
    for x in batch:
        pad = maxlen - len(x["input_ids"])
        out["input_ids"].append(x["input_ids"] + [pad_id] * pad)
        out["attention_mask"].append(x["attention_mask"] + [0] * pad)
        out["labels"].append(x["labels"] + [-100] * pad)
    return {k: torch.tensor(v) for k, v in out.items()}

def build_causal_lora(lora_r=16, lora_alpha=32, lora_dropout=0.05):
    """Load MODEL_NAME in 4-bit NF4 and wrap it with LoRA adapters for causal LM.

    Args:
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: dropout applied inside the LoRA layers.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=nf4_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # Keep the model config in sync with the pad token chosen for the tokenizer.
    base.config.pad_token_id = tokenizer.pad_token_id
    base = prepare_model_for_kbit_training(base)

    adapter_cfg = LoraConfig(
        task_type="CAUSAL_LM",
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias="none",
        # Attention and MLP projection layers receive adapters.
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    )
    return get_peft_model(base, adapter_cfg)

def extract_label(gen_text: str, labels=None):
    """Map free-form generated text to one of the known sentiment labels.

    Matching is case-insensitive and tries the longest phrases first, so that
    "extremely positive" is not swallowed by its substring "positive".
    "very positive" / "very negative" are treated as the extreme classes.

    Args:
        gen_text: text decoded from the model's generated tokens.
        labels: candidate label strings; defaults to the module-level
            ``CLASS_TEXTS``.

    Returns:
        The matching label spelled exactly as it appears in ``labels``. When
        nothing matches, the label equal to "neutral" (case-insensitive) is
        returned, or the literal "Neutral" if ``labels`` has no such entry.
    """
    labels = CLASS_TEXTS if labels is None else labels
    s = gen_text.lower()

    # lowercase phrase -> label as spelled by the caller
    canonical = {lab.lower(): lab for lab in labels}
    phrases = dict(canonical)
    for phrase, key in (("very positive", "extremely positive"),
                        ("very negative", "extremely negative")):
        if key in canonical:
            phrases.setdefault(phrase, canonical[key])

    # Longest phrase wins: every extreme label contains a plain label as a substring.
    for phrase in sorted(phrases, key=len, reverse=True):
        if phrase in s:
            return phrases[phrase]
    return canonical.get("neutral", "Neutral")

@torch.no_grad()
def eval_macro_f1(model, df_eval, max_len=160, max_new_tokens=4, bs=64):
    """Generate a label for every tweet in ``df_eval`` and return macro-F1.

    Args:
        model: causal LM exposing ``generate`` and ``device``.
        df_eval: DataFrame with ``clean_text`` and ``Sentiment`` columns.
        max_len: maximum number of prompt tokens passed to the tokenizer.
        max_new_tokens: number of tokens to generate per prompt.
        bs: number of prompts generated per batch.

    Returns:
        Macro-averaged F1 between gold and predicted class ids.

    Raises:
        KeyError: if a gold label is not a key of ``LABEL2ID`` (compared
            case-insensitively).
    """
    prompts = [build_prompt(t) for t in df_eval["clean_text"]]
    gold    = df_eval["Sentiment"].tolist()
    preds = []

    was_training = getattr(model, "training", False)
    prev_side = tokenizer.padding_side
    model.eval()  # disable dropout while generating
    # Decoder-only models continue from the last position of each row, so batched
    # generation needs the padding on the left.
    tokenizer.padding_side = "left"
    try:
        for i in range(0, len(prompts), bs):
            batch = prompts[i:i+bs]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to(model.device)
            out = model.generate(
                **enc, max_new_tokens=max_new_tokens, do_sample=False,
                eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id
            )
            # With left padding every prompt ends at the same column, so the
            # generated tokens start right after the padded input width.
            prompt_width = enc["input_ids"].shape[1]
            for ids in out:
                text = tokenizer.decode(ids[prompt_width:], skip_special_tokens=True)
                preds.append(extract_label(text))
    finally:
        # Restore global state so training code keeps its right-padding setup.
        tokenizer.padding_side = prev_side
        if was_training:
            model.train()

    # Case-insensitive lookup so the mapping works regardless of label capitalisation.
    label_ids = {str(k).lower(): v for k, v in LABEL2ID.items()}
    fallback_id = label_ids["neutral"]
    y_true = [label_ids[str(x).lower()] for x in gold]
    y_pred = [label_ids.get(str(x).lower(), fallback_id) for x in preds]
    return f1_score(y_true, y_pred, average="macro")
  

def run_epoch(model, loader, optimizer=None, scheduler=None, train=True):
    """Run one pass over ``loader`` and return the mean per-example loss.

    Args:
        model: callable as ``model(**batch)`` returning an object with ``.loss``;
            must expose ``device``, ``train()``, ``eval()`` and ``parameters()``.
        loader: iterable of dict batches of tensors, with a sized ``dataset``.
        optimizer: required when ``train`` is True.
        scheduler: optional LR scheduler stepped after every optimizer step.
        train: when True, back-propagate and update weights; otherwise only
            compute the loss with gradients disabled.

    Returns:
        Sum of (batch loss * batch size) divided by ``len(loader.dataset)``.
    """
    if train:
        model.train()
    else:
        model.eval()

    weighted_loss_sum = 0.0
    for batch in loader:
        # Move tensors to the model's device, updating the batch dict in place.
        for name in batch:
            batch[name] = batch[name].to(model.device)
        n_rows = batch["input_ids"].size(0)

        with torch.set_grad_enabled(train):
            loss = model(**batch).loss
            if train:
                optimizer.zero_grad()
                loss.backward()
                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler:
                    scheduler.step()

        weighted_loss_sum += loss.item() * n_rows

    return weighted_loss_sum / len(loader.dataset)

def objective_raw(trial: optuna.Trial):
    """Optuna objective: fine-tune the LoRA decoder with the raw PyTorch loop.

    Samples hyper-parameters, trains for up to ``num_epochs`` epochs, logs each
    epoch to W&B, and saves the adapter and tokenizer whenever validation
    macro-F1 improves.

    Args:
        trial: the Optuna trial supplying hyper-parameters and pruning decisions.

    Returns:
        The best validation macro-F1 reached during the trial.

    Raises:
        optuna.TrialPruned: when the pruner stops the trial early.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    # NOTE: patience (7-10) exceeds the largest num_epochs (4), so the
    # early-stopping branch below cannot trigger with this search space.
    patience      = trial.suggest_int("patience", 7, 10)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128])
    trial.suggest_int("num_layers", 1, 3)  # placeholder: sampled but not used
    max_len       = trial.suggest_categorical("max_length", [128, 160, 192])
    num_epochs    = trial.suggest_int("num_epochs", 2, 4)
    warmup_ratio  = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    lora_r        = trial.suggest_categorical("lora_r", [8, 16, 32])
    lora_alpha    = trial.suggest_categorical("lora_alpha", [16, 32, 64])
    lora_dropout  = trial.suggest_float("lora_dropout", 0.0, 0.2)

    wandb_run = wandb.init(project="tweets-decoder-gentask-raw", config=trial.params)

    best_f1, no_improve = -1.0, 0
    try:
        model = build_causal_lora(lora_r, lora_alpha, lora_dropout)

        tr_ds = GenTweetSet(train_df, tokenizer, max_len=max_len)
        va_ds = GenTweetSet(val_df, tokenizer, max_len=max_len)
        tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
        va_ld = DataLoader(va_ds, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

        optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        # The scheduler is stepped once per batch, so the schedule length is in batches.
        steps = num_epochs * len(tr_ld)
        warmup = int(warmup_ratio * steps)
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup, steps)

        for ep in range(1, num_epochs+1):
            tr_loss = run_epoch(model, tr_ld, optimizer, scheduler, train=True)
            va_loss = run_epoch(model, va_ld, train=False)
            va_f1 = eval_macro_f1(model, val_df, max_len=max_len)
            wandb.log({"epoch": ep, "train_loss": tr_loss, "val_loss": va_loss, "val_macro_f1": va_f1})
            trial.report(va_f1, ep)
            if trial.should_prune():
                raise optuna.TrialPruned()
            if va_f1 > best_f1:
                best_f1, no_improve = va_f1, 0
                model.save_pretrained(f"best_raw_gen_trial_{trial.number}")
                tokenizer.save_pretrained(f"best_raw_gen_trial_{trial.number}")
            else:
                no_improve += 1
                if no_improve >= patience:
                    break
    finally:
        # Close the W&B run even when the trial is pruned or fails, so the next
        # trial's wandb.init() starts from a clean state.
        wandb.summary["best_val_macro_f1"] = best_f1
        wandb_run.finish()
    return best_f1

# Run a 2-trial Optuna search that maximizes the value returned by objective_raw,
# then print the best score and the hyper-parameters that produced it.
study = optuna.create_study(direction="maximize")
study.optimize(objective_raw, n_trials=2)
print("Best raw macro-F1:", study.best_value, study.best_trial.params)


**The HF Trainer** version: the same Optuna search and W&B experiment tracking as the raw training loop above:

In [54]:
from datasets import Dataset, DatasetDict

def hf_build_decoder_ds(df, max_len=160):
    """Build a `datasets.Dataset` of masked causal-LM examples from a DataFrame.

    Each row becomes prompt tokens followed by target tokens. ``labels`` holds
    -100 on prompt positions and the target ids on target positions. Only
    ``input_ids``, ``attention_mask`` and ``labels`` remain in the result.

    Args:
        df: DataFrame with ``clean_text`` and ``Sentiment`` columns.
        max_len: maximum number of tokens per example.
    """
    text_cols = ["clean_text", "Sentiment"]

    def _enc(ex):
        prompt_ids = tokenizer(build_prompt(ex["clean_text"]), add_special_tokens=False)["input_ids"]
        target_ids = tokenizer(build_target(ex["Sentiment"]), add_special_tokens=False)["input_ids"]
        if len(prompt_ids) + len(target_ids) > max_len:
            # Trim the end of the prompt first so the target tokens are kept.
            n_prompt = max(0, max_len - len(target_ids))
            prompt_ids = prompt_ids[:n_prompt]
            target_ids = target_ids[:max_len - n_prompt]
        token_ids = prompt_ids + target_ids
        return {
            "input_ids": token_ids,
            "attention_mask": [1] * len(token_ids),
            "labels": [-100] * len(prompt_ids) + target_ids,
        }

    ds = Dataset.from_pandas(df[text_cols])
    # Drop any column other than the two text columns during the map (e.g. an index column).
    extra_cols = [c for c in ds.column_names if c not in text_cols]
    ds = ds.map(_enc, remove_columns=extra_cols)
    return ds.remove_columns(text_cols)

def hf_decoder_collator(features):
    """Right-pad decoder examples to the longest one in the batch and tensorize.

    ``input_ids`` are padded with the tokenizer's pad id, ``attention_mask``
    with 0 and ``labels`` with -100.
    """
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }
    width = max(len(f["input_ids"]) for f in features)
    # The amount of padding is always derived from the length of input_ids.
    gaps = [width - len(f["input_ids"]) for f in features]
    return {
        key: torch.tensor([f[key] + [fill] * gap for f, gap in zip(features, gaps)])
        for key, fill in fill_values.items()
    }

# ---------- Trainer factory ----------

def build_decoder_trainer(params):
    """Create a HF ``Trainer`` for the 4-bit LoRA decoder from a params dict.

    Args:
        params: dict with keys ``max_length``, ``lora_r``, ``lora_alpha``,
            ``lora_dropout``, ``batch_size``, ``learning_rate``,
            ``weight_decay``, ``num_epochs`` and ``warmup_ratio``.

    Returns:
        A ``Trainer`` over the train/validation splits; not yet trained.
    """
    seq_len = params["max_length"]
    train_set = hf_build_decoder_ds(train_df, max_len=seq_len)
    val_set = hf_build_decoder_ds(val_df, max_len=seq_len)

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=nf4_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    base.config.pad_token_id = tokenizer.pad_token_id
    base = prepare_model_for_kbit_training(base)
    adapter_cfg = LoraConfig(
        task_type="CAUSAL_LM",
        r=params["lora_r"],
        lora_alpha=params["lora_alpha"],
        lora_dropout=params["lora_dropout"],
        bias="none",
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    )
    lora_model = get_peft_model(base, adapter_cfg)

    per_device_bs = params["batch_size"]
    training_args = TrainingArguments(
        output_dir="./hf_decoder_out",
        per_device_train_batch_size=per_device_bs,
        per_device_eval_batch_size=per_device_bs,
        learning_rate=params["learning_rate"],
        weight_decay=params["weight_decay"],
        num_train_epochs=params["num_epochs"],
        warmup_ratio=params["warmup_ratio"],
        logging_strategy="steps",
        logging_steps=50,
        # Evaluate and checkpoint on the same 200-step cadence.
        evaluation_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        gradient_checkpointing=True,
        fp16=torch.cuda.is_available(),
        report_to=["wandb"],
    )

    return Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=val_set,
        data_collator=hf_decoder_collator,
        tokenizer=tokenizer,
    )

# ---------- Optuna objective for HF Trainer ----------

def objective_hf_decoder(trial: optuna.Trial):
    """Optuna objective: fine-tune the LoRA decoder with the HF ``Trainer``.

    Samples the same search space as the raw-loop objective, trains, scores the
    trained model by generation on the validation set, and saves the model and
    tokenizer to ``best_trainer_gen_trial_<n>``.

    Args:
        trial: the Optuna trial supplying hyper-parameters.

    Returns:
        Validation macro-F1 computed by ``eval_macro_f1``.
    """
    params = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
        "weight_decay":  trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True),
        "patience":      trial.suggest_int("patience", 7, 10),   # used only for raw; kept for parity
        "batch_size":    trial.suggest_categorical("batch_size", [32, 64, 128]),
        "num_layers":    trial.suggest_int("num_layers", 1, 3),  # placeholder (not used directly)
        "max_length":    trial.suggest_categorical("max_length", [128, 160, 192]),
        "num_epochs":    trial.suggest_int("num_epochs", 2, 4),
        "warmup_ratio":  trial.suggest_float("warmup_ratio", 0.05, 0.2),
        "lora_r":        trial.suggest_categorical("lora_r", [8, 16, 32]),
        "lora_alpha":    trial.suggest_categorical("lora_alpha", [16, 32, 64]),
        "lora_dropout":  trial.suggest_float("lora_dropout", 0.0, 0.2),
    }

    run = wandb.init(project="tweets-decoder-gentask-trainer", config=params)
    try:
        trainer = build_decoder_trainer(params)
        trainer.train()

        # The Trainer only tracks eval loss; macro-F1 is computed via generation.
        val_f1 = eval_macro_f1(trainer.model, val_df, max_len=params["max_length"])
        wandb.summary["val_macro_f1_gen"] = val_f1

        # Persist this trial's model and tokenizer.
        out_dir = f"best_trainer_gen_trial_{trial.number}"
        trainer.save_model(out_dir)
        tokenizer.save_pretrained(out_dir)
        return val_f1
    finally:
        # Always close the W&B run, including when training or evaluation fails.
        run.finish()

# Separate 2-trial Optuna study for the HF Trainer variant of the decoder;
# maximizes the macro-F1 returned by objective_hf_decoder.
study_hf = optuna.create_study(direction="maximize")
study_hf.optimize(objective_hf_decoder, n_trials=2)
print("Best HF (decoder) macro-F1:", study_hf.best_value, study_hf.best_trial.params)


### Encoder-Only: DeBERTa-v3 based classification: 

In this part we apply the same process as in the decoder part, but now with an encoder model, so we can see which approach is more robust and performs better. 

In [60]:
# ----- Encoder (DeBERTa-v3) setup -----
ENC_MODEL = "microsoft/deberta-v3-base"
enc_tokenizer = AutoTokenizer.from_pretrained(ENC_MODEL, use_fast=True)

# Same class list and order as the decoder section: the lowercase strings in CLASS_TEXTS.
LABELS = CLASS_TEXTS
# Rebuild the id <-> label maps from LABELS; these rebind the module-level names defined earlier.
ID2LABEL = {i:lab for i,lab in enumerate(LABELS)}
LABEL2ID = {lab:i for i,lab in enumerate(LABELS)}

class EncTweetSet(torch.utils.data.Dataset):
    """Classification dataset: tokenized tweet text plus an integer class id.

    Labels are resolved through ``LABEL2ID`` once at construction time; text is
    tokenized lazily per item, truncated to ``max_len`` tokens and left unpadded.
    """

    def __init__(self, df, tokenizer, max_len=160):
        self.tok = tokenizer
        self.max_len = max_len
        self.texts = df["clean_text"].tolist()
        self.labels = [LABEL2ID[name] for name in df["Sentiment"].tolist()]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        encoded = self.tok(self.texts[i], truncation=True, max_length=self.max_len)
        # Keep only the two fields the collator pads; other tokenizer outputs are dropped.
        item = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        item["labels"] = self.labels[i]
        return item

def enc_pad_collate(batch):
    """Right-pad encoder examples to the batch maximum and tensorize them.

    ``input_ids`` are padded with the encoder tokenizer's pad id and
    ``attention_mask`` with 0; ``labels`` holds one class id per example and is
    not padded.
    """
    pad_id = enc_tokenizer.pad_token_id
    width = max(len(x["input_ids"]) for x in batch)

    id_rows, mask_rows, label_col = [], [], []
    for example in batch:
        gap = width - len(example["input_ids"])
        id_rows.append(example["input_ids"] + [pad_id] * gap)
        mask_rows.append(example["attention_mask"] + [0] * gap)
        label_col.append(example["labels"])

    return {
        "input_ids": torch.tensor(id_rows),
        "attention_mask": torch.tensor(mask_rows),
        "labels": torch.tensor(label_col),
    }

def build_deberta_classif(num_labels=5, lora_cfg=None):
    """Load the DeBERTa sequence classifier, optionally as an 8-bit LoRA model.

    Args:
        num_labels: number of output classes.
        lora_cfg: a ``LoraConfig``; when given, the base model is loaded in
            8-bit and wrapped with LoRA adapters. When None, the model is
            loaded without quantization for full fine-tuning.

    Returns:
        The classifier, PEFT-wrapped when ``lora_cfg`` is provided.
    """
    load_kwargs = dict(
        num_labels=num_labels, id2label=ID2LABEL, label2id=LABEL2ID, device_map="auto"
    )
    if lora_cfg is None:
        # Full fine-tuning needs trainable weights: int8-quantized layers cannot
        # be updated, and the HF Trainer refuses purely quantized models that
        # have no adapters attached.
        return AutoModelForSequenceClassification.from_pretrained(ENC_MODEL, **load_kwargs)

    quant = BitsAndBytesConfig(load_in_8bit=True)  # frozen 8-bit base; only the adapters train
    model = AutoModelForSequenceClassification.from_pretrained(
        ENC_MODEL, quantization_config=quant, **load_kwargs
    )
    model = prepare_model_for_kbit_training(model)
    return get_peft_model(model, lora_cfg)

@torch.no_grad()
def enc_eval_macro_f1(model, loader):
    """Return macro-F1 of the classifier's argmax predictions over ``loader``.

    The model is switched to eval mode, and each batch is moved to the model's
    device in place before the forward pass.
    """
    model.eval()
    y_pred, y_true = [], []
    for batch in loader:
        for name in batch:
            batch[name] = batch[name].to(model.device)
        logits = model(**batch).logits
        y_pred += logits.argmax(dim=-1).detach().cpu().tolist()
        y_true += batch["labels"].detach().cpu().tolist()
    return f1_score(y_true, y_pred, average="macro")

# ---------- RAW PyTorch + Optuna for DeBERTa ----------

def objective_enc_raw(trial: optuna.Trial):
    """Optuna objective: fine-tune the DeBERTa classifier with a raw PyTorch loop.

    Samples hyper-parameters (LoRA settings only when ``use_lora`` is chosen),
    trains for up to ``num_epochs`` epochs, logs each epoch to W&B, and saves the
    model and tokenizer whenever validation macro-F1 improves.

    Args:
        trial: the Optuna trial supplying hyper-parameters and pruning decisions.

    Returns:
        The best validation macro-F1 reached during the trial.

    Raises:
        optuna.TrialPruned: when the pruner stops the trial early.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    # NOTE: patience (5-10) exceeds the largest num_epochs (4), so the
    # early-stopping branch below cannot trigger with this search space.
    patience      = trial.suggest_int("patience", 5, 10)
    batch_size    = trial.suggest_categorical("batch_size", [16, 32, 64])
    max_len       = trial.suggest_categorical("max_length", [128, 160, 192])
    num_epochs    = trial.suggest_int("num_epochs", 2, 4)
    warmup_ratio  = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    use_lora      = trial.suggest_categorical("use_lora", [False, True])
    lora_r        = trial.suggest_categorical("lora_r", [8, 16]) if use_lora else 0
    lora_alpha    = trial.suggest_categorical("lora_alpha", [16, 32]) if use_lora else 0
    lora_dropout  = trial.suggest_float("lora_dropout", 0.0, 0.15) if use_lora else 0.0

    wandb_run = wandb.init(project="tweets-encoder-deberta-raw", config=trial.params)

    best_f1, no_improve = -1.0, 0
    try:
        lora_cfg = None
        if use_lora:
            lora_cfg = LoraConfig(
                r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
                target_modules=["query_proj","key_proj","value_proj","output_proj"],  # DeBERTa naming can vary; fallback to all linear layers if needed
                bias="none", task_type="SEQ_CLS"
            )

        model = build_deberta_classif(num_labels=len(LABELS), lora_cfg=lora_cfg)

        tr_ds = EncTweetSet(train_df, enc_tokenizer, max_len=max_len)
        va_ds = EncTweetSet(val_df,   enc_tokenizer, max_len=max_len)
        tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True,  collate_fn=enc_pad_collate)
        va_ld = DataLoader(va_ds, batch_size=batch_size, shuffle=False, collate_fn=enc_pad_collate)

        optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        # The scheduler is stepped once per batch, so the schedule length is in batches.
        steps = num_epochs * len(tr_ld)
        warmup = int(warmup_ratio * steps)
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup, steps)

        for ep in range(1, num_epochs+1):
            model.train()
            total = 0.0
            for batch in tr_ld:
                for k in batch:
                    batch[k] = batch[k].to(model.device)
                out = model(**batch)
                loss = out.loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                # Weight by batch size so tr_loss is a per-example mean.
                total += loss.item()*batch["input_ids"].size(0)
            tr_loss = total/len(tr_ld.dataset)
            va_f1 = enc_eval_macro_f1(model, va_ld)
            wandb.log({"epoch": ep, "train_loss": tr_loss, "val_macro_f1": va_f1})
            trial.report(va_f1, ep)
            if trial.should_prune():
                raise optuna.TrialPruned()
            if va_f1 > best_f1:
                best_f1, no_improve = va_f1, 0
                model.save_pretrained(f"best_deberta_raw_trial_{trial.number}")
                enc_tokenizer.save_pretrained(f"best_deberta_raw_trial_{trial.number}")
            else:
                no_improve += 1
                if no_improve >= patience:
                    break
    finally:
        # Close the W&B run even when the trial is pruned or fails, so the next
        # trial's wandb.init() starts from a clean state.
        wandb.summary["best_val_macro_f1"] = best_f1
        wandb_run.finish()
    return best_f1

# Run a 2-trial Optuna search for the raw-PyTorch DeBERTa loop, maximizing the
# macro-F1 returned by objective_enc_raw, then print the best result.
study_enc_raw = optuna.create_study(direction="maximize")
study_enc_raw.optimize(objective_enc_raw, n_trials=2)
print("Best RAW DeBERTa macro-F1:", study_enc_raw.best_value, study_enc_raw.best_trial.params)

# ---------- HF Trainer + Optuna for DeBERTa ----------

from transformers import DataCollatorWithPadding

def build_enc_trainer(params):
    """Create a HF ``Trainer`` for the DeBERTa classifier from a params dict.

    Args:
        params: dict with keys ``max_length``, ``use_lora``, ``lora_r``,
            ``lora_alpha``, ``lora_dropout``, ``batch_size``,
            ``learning_rate``, ``weight_decay``, ``num_epochs`` and
            ``warmup_ratio``.

    Returns:
        A ``Trainer`` over the train/validation splits that reports
        ``accuracy`` and ``f1_macro`` and keeps the checkpoint with the best
        ``eval_f1_macro``; not yet trained.
    """
    max_len = params["max_length"]

    def tok_fn(ex):
        out = enc_tokenizer(ex["clean_text"], truncation=True, max_length=max_len)
        out["labels"] = LABEL2ID[ex["Sentiment"]]
        return out

    def to_dataset(frame):
        ds = Dataset.from_pandas(frame[["clean_text","Sentiment"]])
        # Drop the raw text/label/index columns so only model inputs reach the collator.
        return ds.map(tok_fn, remove_columns=ds.column_names)

    dtr = to_dataset(train_df)
    dva = to_dataset(val_df)

    lora_cfg = None
    if params["use_lora"]:
        lora_cfg = LoraConfig(
            r=params["lora_r"], lora_alpha=params["lora_alpha"], lora_dropout=params["lora_dropout"],
            target_modules=["query_proj","key_proj","value_proj","output_proj"], bias="none", task_type="SEQ_CLS"
        )

    model = build_deberta_classif(num_labels=len(LABELS), lora_cfg=lora_cfg)

    args = TrainingArguments(
        output_dir="./hf_deberta_out",
        per_device_train_batch_size=params["batch_size"],
        per_device_eval_batch_size=params["batch_size"],
        learning_rate=params["learning_rate"],
        weight_decay=params["weight_decay"],
        num_train_epochs=params["num_epochs"],
        warmup_ratio=params["warmup_ratio"],
        logging_strategy="steps", logging_steps=50,
        evaluation_strategy="steps", eval_steps=200,
        save_strategy="steps", save_steps=200,
        # Must name a key produced by compute_metrics below ("f1_macro", logged
        # as "eval_f1_macro"); a name that is never reported breaks
        # best-checkpoint selection.
        load_best_model_at_end=True, metric_for_best_model="f1_macro", greater_is_better=True,
        report_to=["wandb"], fp16=torch.cuda.is_available()
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(axis=-1)
        return {
            "accuracy": float((preds == labels).mean()),
            "f1_macro": f1_score(labels, preds, average="macro")
        }

    trainer = Trainer(
        model=model, args=args,
        train_dataset=dtr, eval_dataset=dva,
        data_collator=DataCollatorWithPadding(enc_tokenizer),
        tokenizer=enc_tokenizer, compute_metrics=compute_metrics
    )
    return trainer

def objective_enc_hf(trial: optuna.Trial):
    """Optuna objective: fine-tune the DeBERTa classifier with the HF ``Trainer``.

    Samples hyper-parameters (LoRA settings only when ``use_lora`` is chosen),
    trains, evaluates on the validation set, and saves the model and tokenizer
    to ``best_deberta_trainer_trial_<n>``.

    Args:
        trial: the Optuna trial supplying hyper-parameters.

    Returns:
        Validation macro-F1 from ``trainer.evaluate()``, or 0.0 if the metric
        is missing from the returned metrics.
    """
    params = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay":  trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True),
        "batch_size":    trial.suggest_categorical("batch_size", [16, 32, 64]),
        "max_length":    trial.suggest_categorical("max_length", [128, 160, 192]),
        "num_epochs":    trial.suggest_int("num_epochs", 2, 4),
        "warmup_ratio":  trial.suggest_float("warmup_ratio", 0.05, 0.2),
    }
    # Sample the switch first and branch on its value directly, instead of
    # reading it back from trial.params while the dict is still being built.
    use_lora = trial.suggest_categorical("use_lora", [False, True])
    params["use_lora"] = use_lora
    params["lora_r"] = trial.suggest_categorical("lora_r", [8, 16]) if use_lora else 0
    params["lora_alpha"] = trial.suggest_categorical("lora_alpha", [16, 32]) if use_lora else 0
    params["lora_dropout"] = trial.suggest_float("lora_dropout", 0.0, 0.15) if use_lora else 0.0

    run = wandb.init(project="tweets-encoder-deberta-trainer", config=params)
    try:
        trainer = build_enc_trainer(params)
        trainer.train()

        # Score the model held by the trainer after training on the validation set.
        metrics = trainer.evaluate()
        val_f1 = metrics.get("eval_f1_macro", metrics.get("eval_f1", 0.0))
        wandb.summary["val_macro_f1"] = val_f1

        out_dir = f"best_deberta_trainer_trial_{trial.number}"
        trainer.save_model(out_dir)
        enc_tokenizer.save_pretrained(out_dir)
        return val_f1
    finally:
        # Always close the W&B run, including when training or evaluation fails.
        run.finish()

# Run a 2-trial Optuna search for the HF Trainer variant of the DeBERTa classifier,
# maximizing the macro-F1 returned by objective_enc_hf, then print the best result.
study_enc_hf = optuna.create_study(direction="maximize")
study_enc_hf.optimize(objective_enc_hf, n_trials=2)
print("Best HF DeBERTa macro-F1:", study_enc_hf.best_value, study_enc_hf.best_trial.params)
