# **Import Libraries & Data Loading**

In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.auto import tqdm
import numpy as np
import os

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load Data**

In [3]:
mainPath = "/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data"
dataPath = os.path.join(mainPath, "Dataset-BDC-SatriaData-2024")
cleanDataPath = os.path.join(mainPath, "Clean Dataset")

In [4]:
train = pd.read_csv(dataPath + "/dataset_penyisihan_bdc_2024.csv", sep=";")
test = pd.read_csv(dataPath + "/dataset_unlabeled_penyisihan_bdc_2024.csv", sep=";")
submissions = pd.read_csv(dataPath + "/template_jawaban_penyisihan_bdc_2024.csv", sep=";")

In [5]:
train1 = pd.read_excel(cleanDataPath + "/balanced train.xlsx").drop(columns = "Unnamed: 0")
train2 = pd.read_csv(cleanDataPath + "/Processing-Data-clean-text-5.csv")
train3 = pd.read_csv(cleanDataPath + "/Processing-Data-clean-text-6.csv")

# Preparing Data

In [6]:
# Encode labels
label_encoder = LabelEncoder()

train3['label'] = label_encoder.fit_transform(train3['label'])

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Munculkan panduan label setelah encoding
label_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
print("Panduan Label setelah Encoding:")
for key, value in label_mapping.items():
    print(f"Encoded {key} untuk label {value}")

Panduan Label setelah Encoding:
Encoded 0 untuk label Demografi
Encoded 1 untuk label Ekonomi
Encoded 2 untuk label Geografi
Encoded 3 untuk label Ideologi
Encoded 4 untuk label Pertahanan dan Keamanan
Encoded 5 untuk label Politik
Encoded 6 untuk label Sosial Budaya
Encoded 7 untuk label Sumber Daya Alam


In [8]:
class YourDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels.astype(int)  # Ensure labels are integers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Cross-validation setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters
batch_size = 16
num_epochs = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholder for balanced accuracy scores
balanced_acc_scores = []

# Fine Tune Distilbert: Cross-validation Method

In [9]:
# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(train3['clean_text_5'], train3['label'])):
    print(f"Fold {fold + 1}")

    X_train, X_val = train3['clean_text_5'].iloc[train_index], train3['clean_text_5'].iloc[test_index]
    y_train, y_val = train3['label'].iloc[train_index], train3['label'].iloc[test_index]

    train_dataset = YourDataset(X_train, y_train, tokenizer)
    val_dataset = YourDataset(X_val, y_val, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    model = AutoModelForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=len(label_encoder.classes_))
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        total_train_accuracy = 0

        for batch in tqdm(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            accuracy = (preds == labels).float().mean()
            total_train_accuracy += accuracy.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_train_accuracy = total_train_accuracy / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f} | Train Accuracy: {avg_train_accuracy:.4f}")

    # Evaluation
    model.eval()
    total_val_loss = 0
    total_val_accuracy = 0
    all_preds = []
    all_labels = []

    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_val_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        accuracy = (preds == labels).float().mean()
        total_val_accuracy += accuracy.item()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_val_accuracy / len(val_dataloader)
    balanced_acc = balanced_accuracy_score(all_labels, all_preds)
    balanced_acc_scores.append(balanced_acc)

    print(f"Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {avg_val_accuracy:.4f} | Balanced Accuracy: {balanced_acc:.4f}")

Fold 1


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 0.9743 | Train Accuracy: 0.6989


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.6810 | Train Accuracy: 0.7854


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.4656 | Train Accuracy: 0.8571


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.3045 | Train Accuracy: 0.9110
Validation Loss: 0.8323 | Validation Accuracy: 0.7448 | Balanced Accuracy: 0.4701
Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.0555 | Train Accuracy: 0.6832


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.8817 | Train Accuracy: 0.7202


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.7678 | Train Accuracy: 0.7600


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.6293 | Train Accuracy: 0.8044
Validation Loss: 0.7815 | Validation Accuracy: 0.7619 | Balanced Accuracy: 0.4525
Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.0686 | Train Accuracy: 0.6791


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.7262 | Train Accuracy: 0.7624


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.5237 | Train Accuracy: 0.8377


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.3625 | Train Accuracy: 0.8937
Validation Loss: 0.8348 | Validation Accuracy: 0.7638 | Balanced Accuracy: 0.4605
Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.0342 | Train Accuracy: 0.6805


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.7353 | Train Accuracy: 0.7652


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.5741 | Train Accuracy: 0.8190


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.4268 | Train Accuracy: 0.8666
Validation Loss: 0.8131 | Validation Accuracy: 0.7513 | Balanced Accuracy: 0.4514
Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 0.9780 | Train Accuracy: 0.6989


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.7040 | Train Accuracy: 0.7773


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.5090 | Train Accuracy: 0.8363


  0%|          | 0/231 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.3537 | Train Accuracy: 0.8926
Validation Loss: 0.8385 | Validation Accuracy: 0.7466 | Balanced Accuracy: 0.4689


## Model Evaluasi

In [10]:
# Average balanced accuracy score across all folds
average_balanced_accuracy = np.mean(balanced_acc_scores)
print(f'Average Balanced Accuracy: {average_balanced_accuracy:.3f}')

Average Balanced Accuracy: 0.461


In [11]:
# Predict function
def predict(texts, tokenizer, model, max_length=128):
    # Ensure texts is a list of strings
    texts = [str(text) for text in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

# Predict on the test set
texts_to_predict = X_val.tolist()
predictions = predict(texts_to_predict, tokenizer, model)
decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

# Evaluation function
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    report = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Print the evaluation metrics
    print("Precision Score: ", precision)
    print("Recall Score: ", recall)
    print("F1 Score: ", f1)
    print("\nClassification Report:")
    print(report)
    print("Accuracy Score: ", accuracy)
    print("Balanced Accuracy Score: ", balanced_accuracy)

In [12]:
evaluate_model(y_val, predictions.tolist())

Precision Score:  0.7368764670168505
Recall Score:  0.7462039045553145
F1 Score:  0.7393534619042212

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.08      0.12        12
           1       0.80      0.70      0.75        61
           2       0.00      0.00      0.00         4
           3       0.67      0.51      0.58        69
           4       0.63      0.70      0.66        66
           5       0.82      0.86      0.84       594
           6       0.42      0.45      0.43        85
           7       0.54      0.45      0.49        31

    accuracy                           0.75       922
   macro avg       0.52      0.47      0.48       922
weighted avg       0.74      0.75      0.74       922

Accuracy Score:  0.7462039045553145
Balanced Accuracy Score:  0.468926065865761


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Saving Model

In [13]:
# Save the model - First Saved
modelPath = os.path.join(mainPath, "Model_Trained")

In [14]:
model.save_pretrained(modelPath + "/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet")
tokenizer.save_pretrained(modelPath + "/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet")

('/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet/tokenizer.json')

# Load Model

In [15]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler, AutoTokenizer
from tqdm.auto import tqdm
from sklearn.metrics import balanced_accuracy_score

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = os.path.join(modelPath, "[Delete Name and stopword and Lexicon Tambahan]Trained-IndoBERTweet")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# **Model Inference to Submissions**

In [17]:
# Memastikan model dan tokenizer sudah dimuat sebelumnya
# model, tokenizer = load_model_and_tokenizer(model_save_path)

# Fungsi untuk melakukan prediksi
def predict(texts, tokenizer, model, max_length=128):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

In [18]:
from collections import Counter

In [19]:
y_pred_indices = predict(test["Text"].tolist(), tokenizer, model).cpu().numpy() # udah run tpi lama, jadi ku copy aja hasil output nya taruh di cell bawah

In [20]:
# Mengubah indeks kelas menjadi label asli
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

In [21]:
Counter(y_pred_labels)

Counter({'Politik': 876,
         'Ideologi': 4,
         'Sosial Budaya': 49,
         'Ekonomi': 46,
         'Pertahanan dan Keamanan': 19,
         'Sumber Daya Alam': 5,
         'Demografi': 1})

In [None]:
submissions["Kelas"] = y_pred_labels

In [None]:
test

Unnamed: 0,IDText,Text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...
...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan..."
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...


In [None]:
submissions

Unnamed: 0,IDText,Kelas
0,TXT0001,Demografi
1,TXT0002,Demografi
2,TXT0003,Demografi
3,TXT0004,Demografi
4,TXT0005,Demografi
...,...,...
995,TXT0996,Demografi
996,TXT0997,Demografi
997,TXT0998,Sosial Budaya
998,TXT0999,Demografi


In [None]:
# submissions.to_csv("/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Submissions/[IndoBERTweet [Delete Name and stopword and Lexicon Tambahan] - Clean_Text_5]SD2024040000208.csv", index = False)