In [None]:
# Install library yang diperlukan
# !pip install transformers
# !pip install torch
!pip install datasets


# Import libraries
import os
import pickle

# Set ahead of the transformers import below to switch off Weights & Biases logging.
os.environ['WANDB_DISABLED'] = "true"

import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import unique_labels
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


# Choose the dataset location for the current environment instead of
# commenting lines in and out by hand.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab (e.g. a laptop): read from the local path.
    file_path = '/TRAM/Data/single_label.json'  # Adjust to the local path
else:
    # Running on Colab: mount Google Drive and read the dataset from it.
    drive.mount('/content/drive')
    file_path = '/content/drive/MyDrive/TRAM/Data/single_label.json'  # Adjust to the Google Drive path

Mounted at /content/drive


In [None]:
# 1. Load data
df = pd.read_json(file_path)
print(len(df))
# df = df.head(500)  # Optional: cap the number of rows for a quick experiment

# 2. Basic cleaning: keep only the 'text' and 'label' columns and drop rows
#    with missing values. .copy() makes the result an independent frame so the
#    column assignment below does not write into a slice.
df = df[['text', 'label']].dropna().copy()

# 3. Drop labels that are too rare.
#    A stratified split needs at least two samples per class.
MIN_SAMPLES_PER_LABEL = 2
label_counts = df['label'].value_counts()
valid_labels = label_counts[label_counts >= MIN_SAMPLES_PER_LABEL].index
df = df[df['label'].isin(valid_labels)].copy()

# 4. Encode labels.
#    Fit the encoder only AFTER the rare labels are removed, so that
#    label_encoder.classes_ lists exactly the classes present in the data and
#    the encoded ids are contiguous (0 .. n_classes - 1). Fitting before the
#    filter would leave dropped classes inside the encoder and inflate any
#    class count derived from it.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# 5. Stratified train/validation split (80/20, fixed seed for reproducibility)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label_encoded'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded']
)

# 6. Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 7. Tokenize both splits, truncating to at most 512 tokens and padding each
#    split to a common length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# 7. Custom dataset
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            sequence (it is iterated with ``.items()`` and indexed by example).
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so the Trainer can index them
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# 8. Load model with one output per class known to the label encoder
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_)
)

# 9. Create the output folder on Google Drive
output_dir = '/content/drive/MyDrive/TRAM/Output_BERT'
os.makedirs(output_dir, exist_ok=True)

# 10. Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    # `eval_strategy` is the current argument name; transformers releases
    # older than 4.41 call it `evaluation_strategy` instead.
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    logging_strategy="epoch",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{output_dir}/logs',
    save_total_limit=2,
    load_best_model_at_end=True,
    # 'accuracy' is the key returned by compute_metrics
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Supported way to disable logging integrations (e.g. W&B); the
    # WANDB_DISABLED environment variable is deprecated.
    report_to="none"
)

# 11. Metric (evaluation metric handed to the Trainer)
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true class id of each example).

    Returns:
        dict with a single key ``'accuracy'``: the fraction of examples whose
        highest-scoring class equals the true label.
    """
    predicted_ids = np.argmax(pred.predictions, axis=1)
    return {'accuracy': (predicted_ids == pred.label_ids).mean()}

# 12. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 13. Train
trainer.train()

# 14. Evaluate on the validation split
preds_output = trainer.predict(val_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

# Report only on label ids that occur in the validation labels or in the
# predictions, so `labels` and `target_names` always have matching lengths.
val_label_ids = unique_labels(val_labels, predictions)

# zero_division=0 scores a class with no predicted (or no true) samples as 0
# explicitly, instead of emitting an undefined-metric warning for each one.
print(classification_report(
    val_labels,
    predictions,
    labels=val_label_ids,
    target_names=label_encoder.inverse_transform(val_label_ids),
    zero_division=0
))


# 15. Save the model, tokenizer and label encoder
# Model and tokenizer go into the same folder so they can be reloaded together.
model_dir = f'{output_dir}/bert_model'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# The fitted encoder is needed later to map predicted ids back to label names.
with open(f'{output_dir}/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Model dan tokenizer berhasil disimpan di Google Drive")


5089


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,2.9283,1.656733,0.641454
2,1.0454,0.743562,0.834971
3,0.4469,0.598152,0.859528
4,0.2323,0.57881,0.86444


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

   T1003.001       1.00      1.00      1.00        22
       T1005       0.83      0.77      0.80        13
       T1012       1.00      0.20      0.33         5
       T1016       0.89      0.73      0.80        11
   T1021.001       0.85      0.89      0.87        19
       T1027       0.86      0.91      0.88       137
       T1033       0.80      0.80      0.80        10
   T1036.005       0.77      0.71      0.74        14
       T1041       0.65      0.69      0.67        16
       T1047       1.00      1.00      1.00        15
   T1053.005       1.00      1.00      1.00        21
       T1055       0.95      0.95      0.95        57
   T1056.001       1.00      0.92      0.96        13
       T1057       0.86      0.75      0.80        16
   T1059.003       0.93      0.94      0.94        69
       T1068       0.00      0.00      0.00         2
   T1070.004       0.65      0.88      0.75        17
   T1071.001       0.82    