In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
import optuna
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_a = pd.read_csv('../Dataset/Data_A/data_train_A.csv')
dev_a = pd.read_csv('../Dataset/Data_A/data_dev_A.csv')
test_a = pd.read_csv('../Dataset/Data_A/data_test_A.csv')
train_b = pd.read_csv('../Dataset/Data_B/data_train_B.csv')
dev_b = pd.read_csv('../Dataset/Data_B/data_dev_B.csv')
test_b = pd.read_csv('../Dataset/Data_B/data_test_B.csv')


In [3]:
stimulus_a = ["Pemanasan global terjadi karena peningkatan produksi karbon dioksida yang dihasilkan oleh pembakaran fosil dan konsumsi bahan bakar yang tinggi.",
"Salah satu akibat adalah mencairnya es abadi di kutub utara dan selatan yang menimbulkan naiknya ketinggian air laut.",
"kenaikan air laut akan terjadi terus menerus meskipun dalam hitungan centimeter akan mengakibatkan perubahan yang signifikan.",
"Film “Waterworld”, adalah film fiksi ilmiah yang menunjukkan akibat adanya pemanasan global yang sangat besar sehingga menyebabkan bumi menjadi tertutup oleh lautan.",
"Negara-negara dan daratan yang dulunya kering menjadi tengelamn karena terjadi kenaikan permukaan air laut.",
"Penduduk yang dulunya bisa berkehidupan bebas menjadi terpaksa mengungsi ke daratan yang lebih tinggi atau tinggal diatas air.",
"Apa yang akan menjadi tantangan bagi suatu penduduk ketika terjadi situasi daratan tidak dapat ditinggali kembali karena tengelam oleh naiknya air laut."]

stimulus_b = ["Sebuah toko baju berkonsep self-service menawarkan promosi dua buah baju bertema tahun baru seharga Rp50.000,00. sebelum baju bertema tahun baru dibagikan kepada pembeli, sebuah layar akan menampilkan tampilan gambar yang menampilkan kondisi kerja di dalam sebuah pabrik konveksi/pembuatan baju. ",
"Kemudian pembeli diberi program pilihan untuk menyelesaikan pembeliannya atau menyumpangkan Rp50.000,00 untuk dijadikan donasi pembagian baju musim dingin di suatu daerah yang membutuhkan.",
"Delapan dari sepuluh pembeli memilih untuk memberikan donasi.",
"Menurut anda mengapa banyak dari pembeli yang memilih berdonasi?"]

In [4]:
stimulus_a_text = " ".join(stimulus_a)
stimulus_b_text = " ".join(stimulus_b)

for df in [train_a, dev_a, test_a]:
    df["TEXT"] = stimulus_a_text + " [SEP] " + df["RESPONSE"]

for df in [train_b, dev_b, test_b]:
    df["TEXT"] = stimulus_b_text + " [SEP] " + df["RESPONSE"]

In [5]:
stopwords_ukara = {'yang', 'lebih', 'untuk', 'akan', 'mereka', 'dan'}

def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords_ukara]
    return " ".join(tokens)

train_a["clean_text"] = train_a["TEXT"].apply(preprocess)
train_b["clean_text"] = train_b["TEXT"].apply(preprocess)
test_a["clean_text"] = test_a["TEXT"].apply(preprocess)
test_b["clean_text"] = test_b["TEXT"].apply(preprocess)
dev_a["clean_text"] = dev_a["TEXT"].apply(preprocess)
dev_b["clean_text"] = dev_b["TEXT"].apply(preprocess)


In [6]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')


In [None]:

def encode_data(texts, labels, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    
    return input_ids, attention_masks, labels

In [None]:
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=4):
    best_val_f1 = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
        
        for batch in progress_bar:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            
            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})
        
        avg_train_loss = total_loss / len(train_dataloader)
        
        val_f1, val_accuracy = evaluate_model(model, val_dataloader, device)
        
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train loss: {avg_train_loss:.4f}")
        print(f"Validation F1: {val_f1:.4f}, Accuracy: {val_accuracy:.4f}")
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    return model

In [9]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())
    
    f1 = f1_score(true_labels, predictions)
    accuracy = accuracy_score(true_labels, predictions)
    return f1, accuracy


In [None]:
def run_experiment(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels, params):
    train_input_ids, train_attention_masks, train_labels = encode_data(train_texts, train_labels, tokenizer)
    val_input_ids, val_attention_masks, val_labels = encode_data(val_texts, val_labels, tokenizer)
    test_input_ids, test_attention_masks, test_labels = encode_data(test_texts, test_labels, tokenizer)
    
    train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=params['batch_size'])
    
    val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=params['batch_size'])
    
    test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=params['batch_size'])
    
    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = XLMRobertaForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], eps=1e-8)
    total_steps = len(train_dataloader) * params['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps=0,
                                              num_training_steps=total_steps)
    
    model = train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, params['epochs'])
    
    model.load_state_dict(torch.load('best_model.pt'))
    
    test_f1, test_accuracy = evaluate_model(model, test_dataloader, device)
    
    print("\nClassification Report on Test Set:")
    test_preds, test_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())
    
    print(classification_report(test_labels, test_preds, target_names=['Incorrect', 'Correct']))
    
    return {
        'test_accuracy': test_accuracy,
        'test_f1': test_f1,
        'precision': precision_score(test_labels, test_preds),
        'recall': recall_score(test_labels, test_preds)
    }

In [None]:
def objective(trial, train_texts, train_labels, val_texts, val_labels):
    params = {
        'batch_size': trial.suggest_categorical('batch_size', [8, 16, 32]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        'epochs': trial.suggest_int('epochs', 3, 5)
    }
    
    results = run_experiment(
        train_texts, train_labels, 
        val_texts, val_labels,
        val_texts, val_labels, 
        params
    )
    
    return results['test_f1']

---


In [12]:
train_texts_a = train_a['clean_text'].tolist()
train_labels_a = train_a['LABEL'].values
val_texts_a = dev_a['clean_text'].tolist()
val_labels_a = dev_a['LABEL'].values
test_texts_a = test_a['clean_text'].tolist()
test_labels_a = test_a['LABEL'].values

In [13]:
study_a = optuna.create_study(direction='maximize')
study_a.optimize(lambda trial: objective(trial, train_texts_a, train_labels_a, val_texts_a, val_labels_a), n_trials=10)
best_params_a = study_a.best_params
print("\nBest hyperparameters for Dataset A:", best_params_a)

[I 2025-04-22 20:28:58,036] A new study created in memory with name: no-name-e6fd03ba-efa5-474f-8c47-55bec470d509
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.55it/s]                    


Epoch 1/4
Train loss: 0.6656
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.52it/s]                    


Epoch 2/4
Train loss: 0.6125
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.54it/s]                    


Epoch 3/4
Train loss: 0.6228
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.54it/s]                    
  model.load_state_dict(torch.load('best_model.pt'))


Epoch 4/4
Train loss: 0.5899
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.22it/s]



Classification Report on Test Set:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-22 20:33:28,412] Trial 0 finished with value: 0.8315217391304348 and parameters: {'batch_size': 32, 'learning_rate': 2.2430921386899696e-05, 'epochs': 4}. Best is trial 0 with value: 0.8315217391304348.


              precision    recall  f1-score   support

   Incorrect       0.00      0.00      0.00        62
     Correct       0.71      1.00      0.83       153

    accuracy                           0.71       215
   macro avg       0.36      0.50      0.42       215
weighted avg       0.51      0.71      0.59       215



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 27/27 [00:01<00:00, 16.83it/s]                    


Epoch 1/4
Train loss: 0.6695
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 27/27 [00:01<00:00, 16.79it/s]                    


Epoch 2/4
Train loss: 0.6301
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 27/27 [00:01<00:00, 16.89it/s]                    


Epoch 3/4
Train loss: 0.6139
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 27/27 [00:01<00:00, 16.73it/s]                    
  model.load_state_dict(torch.load('best_model.pt'))


Epoch 4/4
Train loss: 0.6373
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 27/27 [00:01<00:00, 15.59it/s]



Classification Report on Test Set:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-22 20:38:28,265] Trial 1 finished with value: 0.8315217391304348 and parameters: {'batch_size': 8, 'learning_rate': 2.0054156790524582e-05, 'epochs': 4}. Best is trial 0 with value: 0.8315217391304348.


              precision    recall  f1-score   support

   Incorrect       0.00      0.00      0.00        62
     Correct       0.71      1.00      0.83       153

    accuracy                           0.71       215
   macro avg       0.36      0.50      0.42       215
weighted avg       0.51      0.71      0.59       215



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.92it/s]                    


Epoch 1/4
Train loss: 0.7115
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.92it/s]                    


Epoch 2/4
Train loss: 0.6045
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.90it/s]                    


Epoch 3/4
Train loss: 0.6179
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.95it/s]                    
  model.load_state_dict(torch.load('best_model.pt'))


Epoch 4/4
Train loss: 0.6211
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.34it/s]



Classification Report on Test Set:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-22 20:41:17,032] Trial 2 finished with value: 0.8315217391304348 and parameters: {'batch_size': 16, 'learning_rate': 1.3066230115645936e-05, 'epochs': 4}. Best is trial 0 with value: 0.8315217391304348.


              precision    recall  f1-score   support

   Incorrect       0.00      0.00      0.00        62
     Correct       0.71      1.00      0.83       153

    accuracy                           0.71       215
   macro avg       0.36      0.50      0.42       215
weighted avg       0.51      0.71      0.59       215



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.88it/s]                    


Epoch 1/4
Train loss: 0.6579
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.88it/s]                    


Epoch 2/4
Train loss: 0.6177
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.90it/s]                    


Epoch 3/4
Train loss: 0.6218
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.89it/s]                    
  model.load_state_dict(torch.load('best_model.pt'))


Epoch 4/4
Train loss: 0.6084
Validation F1: 0.8315, Accuracy: 0.7116


Evaluating: 100%|██████████| 14/14 [00:01<00:00,  8.18it/s]



Classification Report on Test Set:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-22 20:44:05,455] Trial 3 finished with value: 0.8315217391304348 and parameters: {'batch_size': 16, 'learning_rate': 1.3446537961997179e-05, 'epochs': 4}. Best is trial 0 with value: 0.8315217391304348.


              precision    recall  f1-score   support

   Incorrect       0.00      0.00      0.00        62
     Correct       0.71      1.00      0.83       153

    accuracy                           0.71       215
   macro avg       0.36      0.50      0.42       215
weighted avg       0.51      0.71      0.59       215



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.47it/s]                    


Epoch 1/5
Train loss: 0.6286
Validation F1: 0.8315, Accuracy: 0.7116


[W 2025-04-22 20:45:45,016] Trial 4 failed with parameters: {'batch_size': 32, 'learning_rate': 3.0314947698427283e-05, 'epochs': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\ahmad\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_15480\4016788140.py", line 2, in <lambda>
    study_a.optimize(lambda trial: objective(trial, train_texts_a, train_labels_a, val_texts_a, val_labels_a), n_trials=10)
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_15480\2225642489.py", line 9, in objective
    results = run_experiment(
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_15480\1346242000.py", line 38, in run_experiment
    model = train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, params['epochs'])
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_1548

KeyboardInterrupt: 

In [None]:
print("\nTraining final model for Dataset A with best hyperparameters...")
final_results_a = run_experiment(
    train_texts_a, train_labels_a,
    val_texts_a, val_labels_a,
    test_texts_a, test_labels_a,
    best_params_a
)


---


In [None]:
rain_texts_b = train_b['clean_text'].tolist()
train_labels_b = train_b['LABEL'].values
val_texts_b = dev_b['clean_text'].tolist()
val_labels_b = dev_b['LABEL'].values
test_texts_b = test_b['clean_text'].tolist()
test_labels_b = test_b['LABEL'].values

In [None]:
study_b = optuna.create_study(direction='maximize')
study_b.optimize(lambda trial: objective(trial, train_texts_b, train_labels_b, val_texts_b, val_labels_b), n_trials=10)
best_params_b = study_b.best_params
print("\nBest hyperparameters for Dataset B:", best_params_b)

In [None]:
print("\nTraining final model for Dataset B with best hyperparameters...")
final_results_b = run_experiment(
    train_texts_b, train_labels_b,
    val_texts_b, val_labels_b,
    test_texts_b, test_labels_b,
    best_params_b
)

---


In [None]:
print("\nFinal Results:")
print("Dataset A:")
print(f"- Test Accuracy: {final_results_a['test_accuracy']:.4f}")
print(f"- Test F1: {final_results_a['test_f1']:.4f}")
print(f"- Precision: {final_results_a['precision']:.4f}")
print(f"- Recall: {final_results_a['recall']:.4f}")

print("\nDataset B:")
print(f"- Test Accuracy: {final_results_b['test_accuracy']:.4f}")
print(f"- Test F1: {final_results_b['test_f1']:.4f}")
print(f"- Precision: {final_results_b['precision']:.4f}")
print(f"- Recall: {final_results_b['recall']:.4f}")