Load & Split data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset path used below lives under this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import re

# Active dataset file. The commented-out assignment below points at the alternative "short" CSV.
file_path = '/content/drive/MyDrive/Combined All Dataset/Teks panjang-pendek(192)/preprocessed_Gabungan Seluruh Dataset_long6046.csv'
#file_path = '/content/drive/MyDrive/Combined All Dataset/Teks panjang-pendek(192)/preprocessed_Gabungan Seluruh Dataset_short31584.csv'
df = pd.read_csv(file_path)

# Count the total number of rows
total_samples = len(df)
print(f"Total data: {total_samples}\n")

# ==== Split Dataset =====
# Shuffle the dataset first (fixed seed so the order is reproducible)
df = shuffle(df, random_state=42)

# Split: 80% train, 10% validation, 10% test
# (20% is held out first, then that hold-out is halved into validation and test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the size of each subset
print(f"Jumlah data pelatihan: {len(train_df)}")
print(f"Jumlah data validasi: {len(val_df)}")
print(f"Jumlah data pengujian: {len(test_df)}")

Tokenizer DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset
from typing import Optional, Union
import pandas as pd
import os
import random
import numpy as np

# Seed torch, NumPy and Python's `random` module with the same fixed value.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


# Load the pretrained fast tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


class ConfigTM :
    # Tokenizer/model constants shared across the notebook cells.
    # Sequence length that texts are padded/truncated to (default `max_len` of PersonalityDataset).
    MAX_LENGTH = 256
    # Width of the hidden layer in the classifier head (default `hidden_dim` of SimpleBertOcean).
    HIDDEN_DIM = 256


class PersonalityDataset(Dataset):
    """Text dataset that is tokenized once, eagerly, at construction time.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'`` (a float tensor holding the selected label columns).

    Args:
        data: Path to a CSV file, or an already-loaded DataFrame (it is copied).
        tokenizer: Tokenizer called on the full list of texts.
        max_len: Length every sequence is padded / truncated to.
        text_col: Name of the column holding the raw text.
        label_cols: Names of the label columns; when ``None`` the last five
            columns of the frame are used.

    Raises:
        FileNotFoundError: ``data`` is a string that is not an existing file.
        ValueError: ``data`` has an unsupported type, or the text column or
            one of the label columns is missing.
    """

    def __init__(
        self,
        data: Union[str, pd.DataFrame],
        tokenizer: DistilBertTokenizerFast,
        max_len: int = ConfigTM.MAX_LENGTH,
        text_col: str = "Text",
        label_cols: Optional[list] = None
    ):
        # Resolve the input into a private DataFrame.
        if isinstance(data, pd.DataFrame):
            frame = data.copy()
        elif isinstance(data, str):
            if not os.path.isfile(data):
                raise FileNotFoundError(f"File '{data}' tidak ditemukan.")
            frame = pd.read_csv(data)
        else:
            raise ValueError("Argumen `data` harus str (path) atau pd.DataFrame.")

        # Validate the schema before doing any expensive tokenization.
        available = frame.columns
        if text_col not in available:
            raise ValueError(f"Kolom '{text_col}' tidak ditemukan di data.")
        if label_cols is None:
            label_cols = available[-5:].tolist()
        for column in label_cols:
            if column not in available:
                raise ValueError(f"Kolom label '{column}' tidak ditemukan di data.")

        self.tokenizer = tokenizer
        self.max_len = max_len

        self.texts = frame[text_col].astype(str).tolist()
        self.labels = frame[label_cols].astype(float).values
        self.labels_tensor = torch.tensor(self.labels, dtype=torch.float)

        # Tokenize the whole corpus in a single call; every row is padded to max_len.
        encoded = tokenizer(
            self.texts,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']

    def __len__(self) -> int:
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Return the pre-computed tensors for the sample at position ``idx``."""
        item = {}
        item['input_ids'] = self.input_ids[idx]
        item['attention_mask'] = self.attention_mask[idx]
        item['labels'] = self.labels_tensor[idx]
        return item


# Build one dataset per split; all three share the same tokenizer and MAX_LENGTH.
train_dataset = PersonalityDataset(train_df, tokenizer, max_len=ConfigTM.MAX_LENGTH)
val_dataset   = PersonalityDataset(val_df,   tokenizer, max_len=ConfigTM.MAX_LENGTH)
test_dataset  = PersonalityDataset(test_df,  tokenizer, max_len=ConfigTM.MAX_LENGTH)


# Sanity check: print the tensor shapes and label vector of the first training sample.
print("\nContoh hasil preprocessing 1 sample:")
sample = train_dataset[0]
print(f"Input IDs    : {sample['input_ids'].shape}  # tensor of length {sample['input_ids'].shape[0]}")
print(f"Attention Mask: {sample['attention_mask'].shape}")
print(f"Labels (OCEAN): {sample['labels']}  # shape {sample['labels'].shape}")

Definisi Model

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel

class SimpleBertOcean(nn.Module):
    """DistilBERT encoder followed by a small feed-forward head.

    The head maps the hidden state of the first token to ``num_labels`` raw
    logits (no sigmoid is applied inside the model).

    Args:
        pretrained_model_name: Checkpoint name passed to
            ``DistilBertModel.from_pretrained``.
        hidden_dim: Width of the hidden layer of the head.
        dropout: Dropout probability used inside the head.
        num_labels: Number of output logits.
        freeze_bert: When true, every encoder parameter has
            ``requires_grad`` switched off so only the head is trainable.
    """

    def __init__(
        self,
        pretrained_model_name: str = 'distilbert-base-uncased',
        hidden_dim: int = ConfigTM.HIDDEN_DIM,
        dropout: float = 0.3,
        num_labels: int = 5,
        freeze_bert: bool = True
    ):
        super().__init__()

        self.bert = DistilBertModel.from_pretrained(pretrained_model_name)
        if freeze_bert:
            self.bert.requires_grad_(False)

        # Layer order fixes the state_dict keys (classifier.0 / classifier.3).
        head_layers = [
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the raw logits for a batch of token ids and attention masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)

# Instantiate the model with a frozen encoder and print its layer structure for inspection.
model = SimpleBertOcean(
    pretrained_model_name='distilbert-base-uncased',
    hidden_dim=ConfigTM.HIDDEN_DIM,
    dropout=0.3,
    num_labels=5,
    freeze_bert=True
)

print(model)

Hyperparameter dan pelatihan

In [None]:
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW, SGD
import matplotlib.pyplot as plt
from google.colab import drive
import numpy as np
import random
from transformers import DistilBertModel

# Re-seed torch, NumPy and `random` at the start of the training cell.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

class Config:
    # Training configuration shared by the training, checkpoint and evaluation cells.
    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    LEARNING_RATE = 5e-5
    BATCH_SIZE    = 32
    EPOCHS        = 3

    # 'sgd' selects SGD; any other value selects AdamW (see the optimizer selection below).
    OPTIMIZER = 'adamw'

    # 'bce' selects nn.BCELoss; any other value selects nn.BCEWithLogitsLoss.
    LOSS_FN = 'bce_logits'

    # DataLoader options.
    PIN_MEMORY  = True
    NUM_WORKERS = 2

    # Evaluated once, when the class body runs: the file name embeds the hyperparameters above,
    # the ConfigTM constants and the row count of the module-level `df`.
    CHECKPOINT_PATH = f'/content/drive/MyDrive/model_checkpoints/v1-tambahan metode threshold/distilbert_cekpt_Gabungan Seluruh Dataset_biner([{LEARNING_RATE}]-{BATCH_SIZE}-{EPOCHS}-{NUM_WORKERS})MX{ConfigTM.MAX_LENGTH} H{ConfigTM.HIDDEN_DIM} JMLH{len(df)}.pth'

# Training batches are reshuffled every epoch; validation batches keep a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=True,
    pin_memory=Config.PIN_MEMORY,
    num_workers=Config.NUM_WORKERS
)
val_loader = DataLoader(
    val_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=False,
    pin_memory=Config.PIN_MEMORY,
    num_workers=Config.NUM_WORKERS
)

# Fresh model instance (replaces the one printed earlier), moved to the selected device.
model = SimpleBertOcean(
    pretrained_model_name='distilbert-base-uncased',
    hidden_dim=ConfigTM.HIDDEN_DIM,
    dropout=0.3,
    num_labels=5,
    freeze_bert=True
).to(Config.DEVICE)


# Select the loss function.
# The model's forward() returns raw logits, while nn.BCELoss expects
# probabilities in [0, 1]. The 'bce' option therefore applies the sigmoid
# itself; feeding logits straight into BCELoss fails or yields a wrong loss.
if Config.LOSS_FN.lower() == 'bce':
    _bce_loss = nn.BCELoss()

    def loss_fn(logits, labels):
        """Binary cross-entropy computed on sigmoid(logits)."""
        return _bce_loss(torch.sigmoid(logits), labels)
else:
    loss_fn = nn.BCEWithLogitsLoss()


# Only parameters that still require gradients are optimized (the encoder is
# frozen). A list is used instead of a one-shot `filter` iterator so the
# collection can be inspected or reused without silently becoming empty.
trainable_params = [p for p in model.parameters() if p.requires_grad]
if Config.OPTIMIZER.lower() == 'sgd':
    optimizer = SGD(trainable_params, lr=Config.LEARNING_RATE)
else:
    optimizer = AdamW(trainable_params, lr=Config.LEARNING_RATE)



def train_epoch(loader, model, loss_fn, optimizer, device):
    """Train `model` for one pass over `loader` and return the mean batch loss.

    Each batch must be a dict with the keys 'input_ids', 'attention_mask'
    and 'labels'; all three tensors are moved to `device` before use.
    The returned value is the sum of per-batch losses divided by
    `len(loader)`.
    """
    model.train()
    loss_total = 0.0

    for batch in loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        predictions = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(predictions, targets)

        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()

    num_batches = len(loader)
    return loss_total / num_batches

def eval_epoch(loader, model, loss_fn, device):
    """Evaluate `model` on `loader` without gradients and return the mean batch loss.

    Each batch must be a dict with the keys 'input_ids', 'attention_mask'
    and 'labels'. The model is switched to eval mode and left in it.
    """
    model.eval()
    loss_total = 0.0

    with torch.no_grad():
        for batch in loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )
            predictions = model(input_ids=ids, attention_mask=mask)
            loss_total += loss_fn(predictions, targets).item()

    num_batches = len(loader)
    return loss_total / num_batches


# Per-epoch loss history; both lists are plotted below and reused by later cells.
train_losses = []
val_losses = []
start_all = time.time()

for epoch in range(1, Config.EPOCHS + 1):
    # One optimisation pass over the training set, then a gradient-free pass over validation.
    train_loss = train_epoch(train_loader, model, loss_fn, optimizer, Config.DEVICE)
    val_loss = eval_epoch(val_loader, model, loss_fn, Config.DEVICE)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch}/{Config.EPOCHS} | "
          f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

# Wall-clock duration of the whole loop, formatted as HH:MM:SS.
total_secs = time.time() - start_all
total_time_str = time.strftime("%H:%M:%S", time.gmtime(total_secs))
print(f"\nTotal waktu pelatihan: {total_time_str}\n")


# Loss curves: one point per epoch for each split.
epoch_axis = range(1, Config.EPOCHS+1)
plt.figure(figsize=(8,5))
plt.plot(epoch_axis, train_losses, label='Train Loss', marker='o')
plt.plot(epoch_axis, val_losses, label='Val Loss', marker='s')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss Curve (Train vs Val)')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Simpan pelatihan sebelum ke tahap selanjutnya

In [None]:
import torch
import os

# Make sure the checkpoint directory exists before writing into it.
os.makedirs(os.path.dirname(Config.CHECKPOINT_PATH), exist_ok=True)

# Everything stored alongside the weights: split sizes, optimizer state,
# the hyperparameters used for this run, the loss history and the training time.
checkpoint = {
    'dataset_info': {
        'total_data': len(df),
        'train_size': len(train_dataset),
        'val_size'  : len(val_dataset),
        'test_size' : len(test_dataset)
    },
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'hyperparameters': {
        'learning_rate': Config.LEARNING_RATE,
        'batch_size'   : Config.BATCH_SIZE,
        'epochs'       : Config.EPOCHS,
        'optimizer'    : Config.OPTIMIZER,
        'loss_fn'      : Config.LOSS_FN,
        'pin_memory'   : Config.PIN_MEMORY,
        'num_workers'  : Config.NUM_WORKERS,
        'max_length'   : ConfigTM.MAX_LENGTH,
        # The model-construction values below are repeated as literals;
        # keep them in sync with the SimpleBertOcean(...) call by hand.
        'freeze_bert'  : True,
        'pretrained_model_name': 'distilbert-base-uncased',
        'hidden_dim'   : ConfigTM.HIDDEN_DIM,
        'dropout'      : 0.3,
        'num_labels'   : 5
    },
    'train_losses'  : train_losses,
    'val_losses'    : val_losses,
    'training_time' : total_time_str
}

torch.save(checkpoint, Config.CHECKPOINT_PATH)
print(f"Checkpoint tersimpan di: {Config.CHECKPOINT_PATH}")

Metode Threshold Tuning

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, matthews_corrcoef
import torch
from torch.utils.data import DataLoader

def find_optimal_thresholds_combined(
    probs,
    labels,
    num_labels=5,
    step=0.05,
    alpha=0.5,
    label_names=None
):
    """
    Search, per dimension, for the threshold that maximizes a blend of F1 and MCC.

    Candidate thresholds are np.arange(0.05, 1.0, step). A candidate is skipped
    when it makes every prediction of the dimension fall into a single class.
    The blended score is alpha * F1 + (1 - alpha) * (MCC + 1) / 2, i.e. MCC is
    rescaled from [-1, 1] to [0, 1] before mixing.

    - probs: 2-D array of predicted probabilities, one column per dimension.
    - labels: 2-D array of ground-truth labels with the same layout.
    - num_labels: number of leading columns to tune.
    - step: spacing between candidate thresholds.
    - alpha: weight of F1 (0.0 = MCC only, 1.0 = F1 only).
    - label_names: names used in the printed report; defaults to "Dimensi_<i>".

    Returns a list with one threshold per dimension; a dimension for which
    every candidate was skipped keeps the default 0.5.
    """
    if label_names is None:
        label_names = [f"Dimensi_{i}" for i in range(num_labels)]

    optimal_thresholds = []
    print("Mencari threshold optimal per dimensi (mengoptimalkan kombinasi F1 & MCC)...")
    for dim in range(num_labels):
        # Running best as (combined score, F1, MCC, threshold).
        best = (-1, 0, -1, 0.5)

        for cutoff in np.arange(0.05, 1.0, step):
            binarized = (probs[:, dim] > cutoff).astype(int)

            # Only score cut-offs that leave both classes among the predictions.
            if np.unique(binarized).size >= 2:
                f1 = precision_recall_fscore_support(
                    labels[:, dim], binarized, average='binary', zero_division=0
                )[2]
                mcc = matthews_corrcoef(labels[:, dim], binarized)

                combined = alpha * f1 + (1 - alpha) * ((mcc + 1) / 2.0)

                # Strict comparison: on ties the earliest (lowest) cut-off wins.
                if combined > best[0]:
                    best = (combined, f1, mcc, cutoff)

        best_score, best_f1, best_mcc, best_thresh = best
        optimal_thresholds.append(best_thresh)
        print(
            f"  {label_names[dim]} | "
            f"Threshold={best_thresh:.4f} | "
            f"F1={best_f1:.4f} | "
            f"MCC={best_mcc:.4f} | "
            f"Score_combined={best_score:.4f}"
        )

    return optimal_thresholds


def get_preds_labels(loader, model, device):
    """
    Collect predicted probabilities and ground-truth labels from a DataLoader.

    The model is put in eval mode and run without gradients; its outputs are
    passed through a sigmoid. Returns a tuple (probabilities, labels) of NumPy
    arrays, each built by stacking the per-batch arrays vertically.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            label_chunks.append(batch['labels'].cpu().numpy())

            scores = model(input_ids=ids, attention_mask=mask)
            prob_chunks.append(torch.sigmoid(scores).cpu().numpy())

    return np.vstack(prob_chunks), np.vstack(label_chunks)

# Thresholds are tuned on the validation split only; the test split is not touched here.
val_probs, val_labels = get_preds_labels(val_loader, model, Config.DEVICE)

label_names = ['O', 'C', 'E', 'A', 'N']


# Fine-grained search (step 0.001) with F1 and MCC weighted equally.
val_thresholds = find_optimal_thresholds_combined(
    val_probs, val_labels,
    label_names=label_names,
    alpha=0.5,
    step=0.001
)

print("\nOptimal thresholds per dimensi:")
for ln, t in zip(label_names, val_thresholds):
    print(f" - {ln}: {t:.4f}")

Tes & Evaluasi

In [None]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    matthews_corrcoef
)
from torch.utils.data import DataLoader
import torch
import random

# Re-seed torch, NumPy and `random` at the start of the evaluation cell.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


# Test batches keep a fixed order (no shuffling).
test_loader = DataLoader(
    test_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=False,
    pin_memory=Config.PIN_MEMORY,
    num_workers=Config.NUM_WORKERS
)


def test_and_evaluate_per_dimension(test_loader, model, device, thresholds, label_names=None):
    """
    Evaluate the model on `test_loader` with per-dimension decision thresholds.

    Steps:
      1. Run the model without gradients and turn its outputs into
         probabilities with a sigmoid.
      2. Binarize column i with `probability > thresholds[i]`.
      3. Compute overall metrics on the flattened label/prediction arrays and
         accuracy / precision / recall / F1 / MCC for every dimension. A
         dimension whose predictions or labels contain a single class gets
         precision, recall, F1 and MCC of 0.0.
      4. Print both reports and pick a "best" dimension by voting: the winner
         of accuracy, of F1 and of MCC each casts one vote.

    Returns a dict with the overall metrics, the per-dimension metrics keyed
    by label name, the thresholds used and the voted best dimension.
    """
    # ---- 1. Inference ----
    model.eval()
    prob_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            label_chunks.append(batch['labels'].cpu().numpy())

            scores = model(input_ids=ids, attention_mask=mask)
            prob_chunks.append(torch.sigmoid(scores).cpu().numpy())

    preds = np.vstack(prob_chunks)
    labels = np.vstack(label_chunks)

    # ---- 2. Binarize each column with its own threshold ----
    preds_binary = np.zeros_like(preds, dtype=int)
    for col in range(preds.shape[1]):
        preds_binary[:, col] = (preds[:, col] > thresholds[col]).astype(int)

    num_labels = labels.shape[1]

    # ---- 3a. Overall metrics over all (sample, dimension) pairs ----
    flat_true = labels.flatten()
    flat_pred = preds_binary.flatten()

    overall_micro_acc = accuracy_score(flat_true, flat_pred)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        flat_true, flat_pred, average='binary', zero_division=0
    )
    mcc_overall = matthews_corrcoef(flat_true, flat_pred)

    # ---- 3b. Per-dimension metrics ----
    dim_acc = {}
    for col in range(num_labels):
        dim_acc[col] = accuracy_score(labels[:, col], preds_binary[:, col])

    precision_per_label, recall_per_label, f1_per_label = [], [], []
    mcc_per_label = {}

    for col in range(num_labels):
        col_true = labels[:, col]
        col_pred = preds_binary[:, col]

        single_class = len(np.unique(col_pred)) < 2 or len(np.unique(col_true)) < 2
        if single_class:
            p, r, f, mcc = 0.0, 0.0, 0.0, 0.0
        else:
            p, r, f, _ = precision_recall_fscore_support(
                col_true, col_pred, average='binary', zero_division=0
            )
            mcc = matthews_corrcoef(col_true, col_pred)

        precision_per_label.append(p)
        recall_per_label.append(r)
        f1_per_label.append(f)
        mcc_per_label[col] = mcc

    # ---- 4a. Reports ----
    print("---- Test Metrics overall (with Optimized Thresholds) ----")
    print(f"Akurasi   : {overall_micro_acc:.4f}")
    print(f"Presisi   : {precision_micro:.4f}")
    print(f"Recall    : {recall_micro:.4f}")
    print(f"F1-score  : {f1_micro:.4f}")
    print(f"MCC       : {mcc_overall:.4f}\n")

    if label_names is None:
        label_names = [f"Dimensi_{i}" for i in range(num_labels)]

    print("---- Akurasi, Precision, Recall, F1, MCC Tiap Dimensi ----")
    print(f"{'Label':<8} | {'Accuracy':>8} | {'Precision':>9} | {'Recall':>7} | {'F1-score':>8} | {'MCC':>7} | {'Threshold':>9}")
    print("-" * 85)
    for i, name in enumerate(label_names):
        print(
            f"{name:<8} | "
            f"{dim_acc[i]:>8.4f} | "
            f"{precision_per_label[i]:>9.4f} | "
            f"{recall_per_label[i]:>7.4f} | "
            f"{f1_per_label[i]:>8.4f} | "
            f"{mcc_per_label[i]:>7.4f} | "
            f"{thresholds[i]:>9.4f}"
        )

    # ---- 4b. Vote for the best dimension ----
    best_acc_idx = max(dim_acc, key=dim_acc.get)
    best_f1_idx = int(np.argmax(f1_per_label))
    best_mcc_idx = max(mcc_per_label, key=mcc_per_label.get)

    win_counts = dict.fromkeys(range(num_labels), 0)

    # A metric only casts its vote when its winning value is above the metric's floor.
    if dim_acc[best_acc_idx] > 0:
        win_counts[best_acc_idx] += 1
    if f1_per_label[best_f1_idx] > 0:
        win_counts[best_f1_idx] += 1
    if mcc_per_label[best_mcc_idx] > -1:
        win_counts[best_mcc_idx] += 1

    if max(win_counts.values()) == 0:
        # Nobody voted: fall back to the dimension with the highest F1.
        best_dim_idx = int(np.argmax(f1_per_label))
    else:
        best_dim_idx = max(win_counts, key=win_counts.get)

    best_dim_name = label_names[best_dim_idx]
    best_acc_val = dim_acc[best_dim_idx]
    best_f1_val = f1_per_label[best_dim_idx]
    best_mcc_val = mcc_per_label[best_dim_idx]
    best_thresh_val = thresholds[best_dim_idx]

    print(
        f"\nDimensi terbaik diprediksi (berdasarkan voting Accuracy, F1, MCC): "
        f"( {best_dim_name} ) "
        f"(Akurasi = {best_acc_val:.4f}) "
        f"(F1 = {best_f1_val:.4f}) "
        f"(MCC = {best_mcc_val:.4f}) "
        f"(Threshold = {best_thresh_val:.4f})"
    )

    # ---- Result bundle ----
    dims = range(num_labels)
    metrics = {
        'overall_micro_acc': overall_micro_acc,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'mcc_overall': mcc_overall,
        'accuracy_per_dim': {label_names[i]: dim_acc[i] for i in dims},
        'precision_per_dim': {label_names[i]: precision_per_label[i] for i in dims},
        'recall_per_dim': {label_names[i]: recall_per_label[i] for i in dims},
        'f1_per_dim': {label_names[i]: f1_per_label[i] for i in dims},
        'mcc_per_dim': {label_names[i]: mcc_per_label[i] for i in dims},
        'optimal_thresholds': {label_names[i]: thresholds[i] for i in dims},
        'best_dim_by_vote': {
            'name': best_dim_name,
            'accuracy': best_acc_val,
            'f1': best_f1_val,
            'mcc': best_mcc_val,
            'threshold': best_thresh_val,
            'votes': win_counts[best_dim_idx]
        }
    }
    return metrics


# Column order of the five label dimensions as used in the reports.
label_names = ['O', 'C', 'E', 'A', 'N']

# Evaluate on the test split using the thresholds tuned on the validation split.
test_metrics = test_and_evaluate_per_dimension(test_loader, model, Config.DEVICE, val_thresholds, label_names)

# Simpan Hasil

Simpan Metadata

In [None]:
import json
import pandas as pd
from google.colab import files
import os

# Run summary written next to the checkpoint: split sizes, hyperparameters,
# loss history, tuned thresholds, test metrics and training time.
metadata = {
    'dataset_info': {
        'total_data': len(df),
        'train_size': len(train_dataset),
        'val_size':   len(val_dataset),
        'test_size':  len(test_dataset)
    },
    'hyperparameters': {
        'learning_rate': Config.LEARNING_RATE,
        'batch_size'   : Config.BATCH_SIZE,
        'epochs'       : Config.EPOCHS,
        'optimizer'    : Config.OPTIMIZER,
        'loss_fn'      : Config.LOSS_FN,
        'pin_memory'   : Config.PIN_MEMORY,
        'num_workers'  : Config.NUM_WORKERS,
        'max_length'   : ConfigTM.MAX_LENGTH,
        # The model-construction values below are repeated as literals;
        # keep them in sync with the SimpleBertOcean(...) call by hand.
        'freeze_bert'  : True,
        'pretrained_model_name': 'distilbert-base-uncased',
        'hidden_dim'   : ConfigTM.HIDDEN_DIM,
        'dropout'      : 0.3,
        'num_labels'   : 5
    },
    'train_losses': train_losses,
    'val_losses':   val_losses,
    'val_thresholds': val_thresholds,
    'test_metrics': test_metrics,
    'training_time': total_time_str
}


# The JSON file is stored in the same directory as the checkpoint.
metadata_path = os.path.join(os.path.dirname(Config.CHECKPOINT_PATH), f'distilbert_metadata_threshold_tuned_Gabungan Seluruh Dataset_biner([{Config.LEARNING_RATE}]-{Config.BATCH_SIZE}-{Config.EPOCHS}-{Config.NUM_WORKERS})MX{ConfigTM.MAX_LENGTH} H{ConfigTM.HIDDEN_DIM} JMLH{len(df)}.json')

with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to {metadata_path}")

# The browser download is best-effort: the file is already saved on Drive,
# so a failure here is reported (with its cause) instead of aborting the cell.
try:
    files.download(metadata_path)
except Exception as e:
    print(f"Could not automatically download the file. Please download it manually from: {metadata_path}")
    print(f"Download error: {e!r}")