In [None]:
# %pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers[torch]==4.30.2
# %pip install accelerate -U
# %pip install optuna

In [None]:
# Model selector read by later cells: 0 = distilbert, 1 = bert
model_choice_int = 1
# Checkpoint names passed to `from_pretrained` by later cells.
bert_model_path = 'indolem/indobert-base-uncased'
distilbert_model_path = 'cahya/distilbert-base-indonesian'

In [None]:
import pandas as pd

# Maps a dataset name to a DataFrame holding only the label and text columns.
datasets = {}

# Forward slashes work on every OS; a backslash-separated literal only resolves
# on Windows and relies on invalid escape sequences such as `\P` and `\c`.
df = pd.read_csv('Dataset/Pre-Processed Dataset/combined_dataset.csv')
datasets['combined_dataset'] = df[['encoded_label', 'clean_text']]

datasets['combined_dataset'].head(10)

## Train-Test Split

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

# Custom dataset
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping of field name -> indexable sequence of per-sample
    values, and `labels` is an indexable sequence aligned with those samples.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are coerced to int before being wrapped in a tensor.
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Load the tokenizer for the selected checkpoint:
# DistilBERT when model_choice_int is 0, BERT for any other value.
tokenizer = AutoTokenizer.from_pretrained(
    distilbert_model_path if model_choice_int == 0 else bert_model_path
)


# Dictionary to hold train and test custom datasets,
# keyed by dataset name -> {"train": CustomDataset, "test": CustomDataset}
train_test_datasets = {}

# NOTE: `name` and `df` are module-level loop variables. After the loop they keep
# the values of the last entry in `datasets`, and later cells read `name`.
for name, df in datasets.items():
    # Get text and labels
    X = df['clean_text'].tolist()
    y = df['encoded_label'].tolist()

    # Train-test split: 20% held out, fixed random_state so the split is repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenize each split in its own call.
    # NOTE(review): padding=True presumably pads to the longest sequence of each
    # call, so the two splits may end up with different lengths — confirm.
    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    test_encodings = tokenizer(X_test, truncation=True, padding=True)

    # Create datasets
    train_dataset = CustomDataset(train_encodings, y_train)
    test_dataset = CustomDataset(test_encodings, y_test)

    # Store in Dict
    train_test_datasets[name] = {
        "train": train_dataset,
        "test": test_dataset
    }

    print(f"Dataset: {name}, Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

In [None]:
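# NOTE: disabled alternative to the split cell above — a train/val/test split (70/15/15).
# The final retraining cell looks for a "val" split, which only this variant creates.
# from transformers import AutoTokenizer, AutoModelForSequenceClassification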
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# import torch
# from torch.utils.data import Dataset

# # Custom dataset
# class CustomDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
#         item['labels'] = torch.tensor(int(self.labels[idx]))
#         return item

#     def __len__(self):
#         return len(self.labels)

# # Load tokenizer
# if model_choice_int == 0:
#     tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path)
    
# else:
#     tokenizer = AutoTokenizer.from_pretrained(bert_model_path)


# # Dictionary to hold train and test custom datasets
# train_test_datasets = {}

# for name, df in datasets.items():
#     # Get text and labels
#     X = df['clean_text'].tolist()
#     y = df['encoded_label'].tolist()

#     # Train-test-val split
#     X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
#     X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#     # Tokenize
#     train_encodings = tokenizer(X_train, truncation=True, padding=True)
#     test_encodings = tokenizer(X_test, truncation=True, padding=True)
#     val_encodings = tokenizer(X_val, truncation=True, padding=True)

#     # Create datasets
#     train_dataset = CustomDataset(train_encodings, y_train)
#     test_dataset = CustomDataset(test_encodings, y_test)
#     val_dataset = CustomDataset(val_encodings, y_val)

#     # Store in Dict
#     train_test_datasets[name] = {
#         "train": train_dataset,
#         "test": test_dataset,
#         "val": val_dataset
#     }

#     print(f"Dataset: {name}, Train size: {len(train_dataset)} | Test size: {len(test_dataset)} | Val size: {len(val_dataset)}")




## Train and Evaluation


In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# `name` is the dataset key left in scope by the train/test split loop.
output_dir = f'./results/{name}'
log_dir = f'./logs/{name}'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Settings shared by all three configurations below, kept in one place so they
# cannot drift apart.
shared_training_kwargs = dict(
    output_dir=output_dir,
    run_name=f'training_{name}',
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    # NOTE(review): eval_steps presumably only matters when a step-based
    # evaluation strategy is selected — confirm against the installed
    # transformers version.
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
    # machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

# Baseline hyperparameters (not tuned).
default_training_args = TrainingArguments(
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    **shared_training_kwargs,
)

# DistilBERT hyperparameters; the tuned values come from Optuna.
best_training_args_distil = TrainingArguments(
    per_device_eval_batch_size=16,  # can stay as-is for evaluation
    num_train_epochs=6,            # ⬅️ from Optuna
    per_device_train_batch_size=16,  # ⬅️ from Optuna
    warmup_steps=29,               # ⬅️ from Optuna
    weight_decay=0.09793901282245424,  # ⬅️ from Optuna
    learning_rate=3.694163912198525e-05,  # ⬅️ from Optuna
    **shared_training_kwargs,
)

# BERT hyperparameters; the tuned values come from Optuna.
best_training_args_bert = TrainingArguments(
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,  # From Optuna
    num_train_epochs=9,             # From Optuna
    warmup_steps=117,               # From Optuna
    weight_decay=0.12399629519542921,  # From Optuna
    learning_rate=2.9677882858655988e-05,  # From Optuna
    **shared_training_kwargs,
)


### Default


In [None]:
# Set the WANDB_MODE environment variable to "disabled" for this kernel session.
# NOTE(review): presumably switches Weights & Biases logging off, on top of
# report_to=[] in the TrainingArguments — confirm.
os.environ["WANDB_MODE"] = "disabled"

In [None]:
def evaluate_model(model, test_dataset, batch_size, output_dir, dataset_name):
    """Run `model` over `test_dataset` and save a classification report and confusion matrix.

    Args:
        model: Model whose forward pass accepts `input_ids` and `attention_mask`
            and returns an object exposing `.logits`.
        test_dataset: Dataset whose batches contain 'input_ids',
            'attention_mask' and 'labels'.
        batch_size: Batch size used by the evaluation DataLoader.
        output_dir: Folder that receives `classification_report.txt` and
            `confusion_matrix.png`; it is created when missing.
        dataset_name: Label used in the figure title and the final message.

    Returns:
        None. Results are written to `output_dir`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Both output files are written here, so make sure the folder exists.
    # An empty path means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dataloader = DataLoader(test_dataset, batch_size=batch_size)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move inputs to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Predictions come back to the CPU for scikit-learn.
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels are only used for the metrics, so they are not sent to the device.
            all_labels.extend(batch['labels'].cpu().numpy())

    # Metrics
    report = classification_report(all_labels, all_preds, digits=4)
    with open(os.path.join(output_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    cm = confusion_matrix(all_labels, all_preds)
    # Explicit figure/axes so exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {dataset_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "confusion_matrix.png"))
    plt.close(fig)

    print(f"✅ Evaluation complete for {dataset_name}")

In [None]:
print(f"\n🚀 Training on dataset: {name}")

# Initialize model and pick the matching training arguments:
#   0             -> DistilBERT with its Optuna-tuned arguments
#   1             -> BERT with its Optuna-tuned arguments
#   anything else -> BERT with the default arguments
# The datasets were tokenized with the BERT tokenizer for every non-zero choice,
# so the fallback must load BERT as well; pairing that tokenizer with a
# DistilBERT checkpoint would feed the model token ids from another vocabulary.
if model_choice_int == 0:
    model = DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
    )

    training_args = best_training_args_distil

else:
    model = BertForSequenceClassification.from_pretrained(
            bert_model_path, num_labels=2
    )

    training_args = (
        best_training_args_bert if model_choice_int == 1 else default_training_args
    )


print(training_args)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_datasets[name]['train'],
    eval_dataset=train_test_datasets[name]['test'],
)

trainer.train()

# Save model
trainer.save_model(output_dir)

In [None]:
# Evaluate the trained model on the test split (batch size 4); the report and
# confusion matrix are written into `output_dir`.
evaluate_model(model, train_test_datasets[name]['test'], 4, output_dir, name)

### Optuna

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import Trainer, DistilBertForSequenceClassification, BertForSequenceClassification
import gc
from torch.utils.data import DataLoader
from tqdm import tqdm
from optuna import create_study
from sklearn.metrics import f1_score

def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from `trial`.

    Returns a dict with the keys 'learning_rate', 'per_device_train_batch_size',
    'weight_decay', 'num_train_epochs' and 'warmup_steps'.
    """
    # Learning rate is drawn on a log scale between 1e-5 and 5e-5.
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    train_batch = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16])
    decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    epochs = trial.suggest_int("num_train_epochs", 2, 10)
    warmup = trial.suggest_int("warmup_steps", 0, 300)

    return dict(
        learning_rate=lr,
        per_device_train_batch_size=train_batch,
        weight_decay=decay,
        num_train_epochs=epochs,
        warmup_steps=warmup,
    )

def manual_evaluate(model, dataset, batch_size=8):
    """Score `model` on `dataset` and return the value computed by `f1_score`.

    The model is switched to eval mode and moved to CUDA when available,
    otherwise to the CPU. Only the 'input_ids' and 'attention_mask' entries of
    each batch are passed to the model; 'labels' is used as the ground truth.
    The CUDA cache is emptied and the garbage collector is run before returning.
    """
    model.eval()
    target = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(target)
    loader = DataLoader(dataset, batch_size=batch_size)

    feature_keys = ("input_ids", "attention_mask")
    predicted, expected = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            # Keep only the model inputs and place them on the model's device.
            features = {}
            for key, tensor in batch.items():
                if key in feature_keys:
                    features[key] = tensor.to(model.device)
            gold = batch["labels"].to(model.device)

            predictions = torch.argmax(model(**features).logits, dim=-1)

            predicted.extend(predictions.cpu().numpy())
            expected.extend(gold.cpu().numpy())

    score = f1_score(expected, predicted)

    # Release memory before the caller starts its next run.
    torch.cuda.empty_cache()
    gc.collect()

    return score
    

# Subclass Trainer to inject memory cleanup
class CleanTrainer(Trainer):
    """Trainer that empties the CUDA cache and runs the garbage collector after training."""

    def train(self, *args, **kwargs):
        """Delegate to `Trainer.train` and clean up memory afterwards.

        Arguments and the return value are passed through unchanged. The cleanup
        sits in a `finally` block so it also runs when training raises (for
        example on an out-of-memory error); the exception still propagates.
        """
        try:
            return super().train(*args, **kwargs)
        finally:
            torch.cuda.empty_cache()
            gc.collect()

# Choose the model factory once, based on the configured model choice:
# DistilBERT for 0, BERT for any other value.
if model_choice_int == 0:
    def model_init():
        """Load a fresh two-label DistilBERT classifier from `distilbert_model_path`."""
        return DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
        )

else:
    def model_init():
        """Load a fresh two-label BERT classifier from `bert_model_path`."""
        return BertForSequenceClassification.from_pretrained(
            bert_model_path, num_labels=2
        )



def compute_metrics(eval_pred):
    """Convert a Trainer evaluation result into a metrics dict.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores) and
            `label_ids` (ground-truth labels), as handed over by the Trainer.

    Returns:
        dict with a single 'f1' entry computed by `f1_score`.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"f1": f1_score(eval_pred.label_ids, predicted)}


# Initialize the Trainer
# `compute_metrics` has to be a callable that the Trainer invokes during
# evaluation. Calling an evaluation function here would hand over a float
# instead, and would need a `trainer` object before this one exists.
trainer = CleanTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_test_datasets[name]["train"],
    eval_dataset=train_test_datasets[name]["test"],
    compute_metrics=compute_metrics,
)

def objective(trial):
    """Optuna objective: fine-tune one model with sampled hyperparameters and return its F1.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to number
            the output and log folders.

    Returns:
        The score returned by `manual_evaluate` on the evaluation split
        (the "val" split when the dataset has one, otherwise "test").
    """
    hp = optuna_hp_space(trial)

    training_args = TrainingArguments(
        output_dir=f"./results/optuna_trial_{trial.number}",
        num_train_epochs=hp["num_train_epochs"],
        per_device_train_batch_size=hp["per_device_train_batch_size"],
        warmup_steps=hp["warmup_steps"],
        weight_decay=hp["weight_decay"],
        # The sampled learning rate must reach the trainer; otherwise every
        # trial silently runs with the library default and the search over it
        # has no effect.
        learning_rate=hp["learning_rate"],
        save_strategy="no",        # ← disables saving
        save_total_limit=0,         # ← just in case (optional safety)
        logging_dir=f"./logs/optuna_trial_{trial.number}",
        report_to=[],  # Disable wandb
        # Mixed precision needs a CUDA device; fall back to fp32 on CPU.
        fp16=torch.cuda.is_available(),
    )

    splits = train_test_datasets[name]
    # Tune on the validation split when there is one so the test split stays
    # untouched; with a plain train/test split this falls back to "test".
    eval_dataset = splits["val"] if "val" in splits else splits["test"]

    model = model_init()

    trainer = CleanTrainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=eval_dataset,
    )

    trainer.train()

    f1 = manual_evaluate(trainer.model, eval_dataset, batch_size=8)
    return f1

# Run the hyperparameter search: 30 trials, keeping the trial with the highest
# objective value.
study = create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)

# Header and trial are printed on separate lines.
print("✅ Best trial:", study.best_trial, sep="\n")



In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer  # Only if tokenizer not yet loaded


# 1. Retrieve best hyperparameters from Optuna
best_hp = study.best_trial.params

# 2. Re-initialize training arguments with best parameters
final_training_args = TrainingArguments(
    output_dir="./results/final_eval",
    num_train_epochs=best_hp["num_train_epochs"],
    per_device_train_batch_size=best_hp["per_device_train_batch_size"],
    warmup_steps=best_hp["warmup_steps"],
    weight_decay=best_hp["weight_decay"],
    learning_rate=best_hp["learning_rate"],
    save_strategy="no",
    report_to=[],
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs/final_eval"
)

# 3. Re-initialize model with same init function
final_model = model_init()

# 4. Retrain model on TRAIN + VAL before final test evaluation (optional but recommended)
#    A plain train/test split has no "val" entry; in that case only the train
#    split is used instead of failing with a KeyError.
from torch.utils.data import ConcatDataset
final_splits = train_test_datasets[name]
if "val" in final_splits:
    train_val_dataset = ConcatDataset([
        final_splits["train"],
        final_splits["val"]
    ])
else:
    train_val_dataset = final_splits["train"]

# Load the same tokenizer used to tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path if model_choice_int == 0 else bert_model_path)

# Create a data collator to dynamically pad batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


final_trainer = CleanTrainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_val_dataset,
    data_collator=data_collator  # <- Fixes padding issue
)

# 5. Train on full train+val
final_trainer.train()

# 6. Final evaluation on the test set
final_f1 = manual_evaluate(final_model, final_splits["test"], batch_size=8)

# manual_evaluate calls f1_score without an `average` argument, i.e. with its
# default (binary) averaging, so the score is labelled accordingly.
print(f"\n🎯 Final Test F1 Score (binary): {final_f1:.4f}")
