# **Import Libraries & Data Loading**

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.auto import tqdm
import numpy as np
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load Data**

In [None]:
mainPath = "/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data"
dataPath = os.path.join(mainPath, "Dataset-BDC-SatriaData-2024")
cleanDataPath = os.path.join(mainPath, "Clean Dataset")

In [None]:
train = pd.read_csv(dataPath + "/dataset_penyisihan_bdc_2024.csv", sep=";")
test = pd.read_csv(dataPath + "/dataset_unlabeled_penyisihan_bdc_2024.csv", sep=";")
submissions = pd.read_csv(dataPath + "/template_jawaban_penyisihan_bdc_2024.csv", sep=";")

In [None]:
train1 = pd.read_excel(cleanDataPath + "/balanced train.xlsx").drop(columns = "Unnamed: 0")
train2 = pd.read_csv(cleanDataPath + "/Processing-Data-clean-text-5.csv")

In [None]:
Geo_Aug = pd.read_excel(cleanDataPath + "/final_geografi_augmentasi.xlsx").drop(columns = "Unnamed: 0").rename(columns = {"clean_text_5": "text"})
Demo_Aug = pd.read_excel(cleanDataPath + "/final_demografi_augmentasi.xlsx").drop(columns = "Unnamed: 0").rename(columns = {"clean_text_5": "text"})

In [None]:
Geo_Aug.head()

Unnamed: 0,label,text
0,Geografi,malam ganjar doa bareng puluh ribu warga saran...
1,Geografi,golput pilih calon presiden ganjar pranowo mah...
2,Geografi,tiktok bilang anies bangun kota jakarta provin...
3,Geografi,kuliah gratis dampak anak tinggal kota pelosok...
4,Geografi,jokowi orgnya panen uang kota negara tanah pra...


In [None]:
Demo_Aug.head()

Unnamed: 0,label,text
0,Demografi,dukung goblok dukung ridwan kamil skema mayori...
1,Demografi,konsisten suara lawan radikalisme toleransi pi...
2,Demografi,ganjar mahfud program kuliah gratis anak tenta...
3,Demografi,jumat agenda ganjar pranowo mahfud kampanye ja...
4,Demografi,anies gila grup bangga anak bangsa indonesia p...


# Preparing Data

In [11]:
# Encode labels
label_encoder = LabelEncoder()

train2['label'] = label_encoder.fit_transform(train2['label'])
Demo_Aug['label'] = label_encoder.transform(Demo_Aug['label'])
Geo_Aug['label'] = label_encoder.transform(Geo_Aug['label'])

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
# Munculkan panduan label setelah encoding
label_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
print("Panduan Label setelah Encoding:")
for key, value in label_mapping.items():
    print(f"Encoded {key} untuk label {value}")

Panduan Label setelah Encoding:
Encoded 0 untuk label Demografi
Encoded 1 untuk label Ekonomi
Encoded 2 untuk label Geografi
Encoded 3 untuk label Ideologi
Encoded 4 untuk label Pertahanan dan Keamanan
Encoded 5 untuk label Politik
Encoded 6 untuk label Sosial Budaya
Encoded 7 untuk label Sumber Daya Alam


In [13]:
class YourDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels.astype(int)  # Ensure labels are integers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Cross-validation setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters
batch_size = 16
num_epochs = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholder for balanced accuracy scores
balanced_acc_scores = []

In [14]:
# Function to augment data (simple example)
def augment_data(train_df, geo_aug_df, demo_aug_df):
    # Duplicate the existing data
    augmented_data = train_df.copy()
    # Append Geo_Aug and Demo_Aug data
    augmented_data = pd.concat([augmented_data, geo_aug_df, demo_aug_df], ignore_index=True)
    return augmented_data

In [15]:
# # Cross-validation loop
# for fold, (train_index, test_index) in enumerate(skf.split(train2['clean_text_5'], train2['label'])):
#     print(f"Fold {fold + 1}")

#     X_train, X_val = train2['clean_text_5'].iloc[train_index], train2['clean_text_5'].iloc[test_index]
#     y_train, y_val = train2['label'].iloc[train_index], train2['label'].iloc[test_index]
#     print(X_train.shape,X_val.shape)
#     print(X_train,"\n")
#     print(y_train.value_counts())

#     # Augment the training data
#     train_aug = augment_data(pd.DataFrame({'text': X_train, 'label': y_train}), Geo_Aug, Demo_Aug)

#     X_train_aug = train_aug['text']
#     y_train_aug = train_aug['label']

# Fine Tune Distilbert: Cross-validation Method

In [16]:
# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(train2['clean_text_5'], train2['label'])):
    print(f"Fold {fold + 1}")

    X_train, X_val = train2['clean_text_5'].iloc[train_index], train2['clean_text_5'].iloc[test_index]
    y_train, y_val = train2['label'].iloc[train_index], train2['label'].iloc[test_index]

    # Augment the training data
    train_aug = augment_data(pd.DataFrame({'text': X_train, 'label': y_train}), Geo_Aug, Demo_Aug).drop_duplicates()

    X_train_aug = train_aug['text']
    y_train_aug = train_aug['label']

    train_dataset = YourDataset(X_train_aug, y_train_aug, tokenizer)
    val_dataset = YourDataset(X_val, y_val, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    model = AutoModelForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=len(label_encoder.classes_))
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        total_train_accuracy = 0

        for batch in tqdm(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            accuracy = (preds == labels).float().mean()
            total_train_accuracy += accuracy.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_train_accuracy = total_train_accuracy / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f} | Train Accuracy: {avg_train_accuracy:.4f}")

    # Evaluation
    model.eval()
    total_val_loss = 0
    total_val_accuracy = 0
    all_preds = []
    all_labels = []

    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_val_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        accuracy = (preds == labels).float().mean()
        total_val_accuracy += accuracy.item()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_val_accuracy / len(val_dataloader)
    balanced_acc = balanced_accuracy_score(all_labels, all_preds)
    balanced_acc_scores.append(balanced_acc)

    print(f"Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {avg_val_accuracy:.4f} | Balanced Accuracy: {balanced_acc:.4f}")

Fold 1


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.3208 | Train Accuracy: 0.6123


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 1.4483 | Train Accuracy: 0.5963


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 1.4455 | Train Accuracy: 0.5940


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 1.4436 | Train Accuracy: 0.5943
Validation Loss: 1.3444 | Validation Accuracy: 0.6426 | Balanced Accuracy: 0.1250
Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.1080 | Train Accuracy: 0.6523


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.6849 | Train Accuracy: 0.7798


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.4655 | Train Accuracy: 0.8534


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.3155 | Train Accuracy: 0.9100
Validation Loss: 0.7803 | Validation Accuracy: 0.7780 | Balanced Accuracy: 0.6795
Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.0988 | Train Accuracy: 0.6594


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.6562 | Train Accuracy: 0.7939


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.4223 | Train Accuracy: 0.8760


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.2753 | Train Accuracy: 0.9226
Validation Loss: 0.8521 | Validation Accuracy: 0.7496 | Balanced Accuracy: 0.6410
Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/242 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.1926 | Train Accuracy: 0.6379


  0%|          | 0/242 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.7548 | Train Accuracy: 0.7548


  0%|          | 0/242 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.5112 | Train Accuracy: 0.8432


  0%|          | 0/242 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.3602 | Train Accuracy: 0.8931
Validation Loss: 0.8107 | Validation Accuracy: 0.7464 | Balanced Accuracy: 0.6940
Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.1710 | Train Accuracy: 0.6396


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.7777 | Train Accuracy: 0.7487


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.5575 | Train Accuracy: 0.8245


  0%|          | 0/243 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.4038 | Train Accuracy: 0.8752
Validation Loss: 0.7313 | Validation Accuracy: 0.7664 | Balanced Accuracy: 0.6583


## Model Evaluasi

In [17]:
# Average balanced accuracy score across all folds
average_balanced_accuracy = np.mean(balanced_acc_scores)
print(f'Average Balanced Accuracy: {average_balanced_accuracy:.3f}')

Average Balanced Accuracy: 0.560


In [18]:
# Predict function
def predict(texts, tokenizer, model, max_length=128):
    # Ensure texts is a list of strings
    texts = [str(text) for text in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

# Predict on the test set
texts_to_predict = X_val.tolist()
predictions = predict(texts_to_predict, tokenizer, model)
decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

# Evaluation function
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    report = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Print the evaluation metrics
    print("Precision Score: ", precision)
    print("Recall Score: ", recall)
    print("F1 Score: ", f1)
    print("\nClassification Report:")
    print(report)
    print("Accuracy Score: ", accuracy)
    print("Balanced Accuracy Score: ", balanced_accuracy)

In [19]:
evaluate_model(y_val, predictions.tolist())

Precision Score:  0.7604177402551622
Recall Score:  0.7670639219934995
F1 Score:  0.7593367490392976

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67        13
           1       0.81      0.63      0.71        62
           2       0.75      0.75      0.75         4
           3       0.62      0.44      0.52        68
           4       0.73      0.68      0.70        66
           5       0.83      0.89      0.86       594
           6       0.49      0.42      0.46        85
           7       0.58      0.45      0.51        31

    accuracy                           0.77       923
   macro avg       0.66      0.66      0.65       923
weighted avg       0.76      0.77      0.76       923

Accuracy Score:  0.7670639219934995
Balanced Accuracy Score:  0.6582572642937918


# Saving Model

In [20]:
# Save the model - First Saved
modelPath = os.path.join(mainPath, "Model_Trained")

In [21]:
model.save_pretrained(modelPath + "/[Augmented Data]Trained-IndoBERTweet")
tokenizer.save_pretrained(modelPath + "/[Augmented Data]Trained-IndoBERTweet")

('/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Augmented Data]Trained-IndoBERTweet/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Augmented Data]Trained-IndoBERTweet/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Augmented Data]Trained-IndoBERTweet/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Augmented Data]Trained-IndoBERTweet/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/[Augmented Data]Trained-IndoBERTweet/tokenizer.json')

# Load Model

In [22]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler, AutoTokenizer
from tqdm.auto import tqdm
from sklearn.metrics import balanced_accuracy_score

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = os.path.join(modelPath, "[Augmented Data]Trained-IndoBERTweet")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# **Model Inference to Submissions**

In [24]:
# Memastikan model dan tokenizer sudah dimuat sebelumnya
# model, tokenizer = load_model_and_tokenizer(model_save_path)

# Fungsi untuk melakukan prediksi
def predict(texts, tokenizer, model, max_length=128):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

In [25]:
from collections import Counter

In [26]:
y_pred_indices = predict(test["Text"].tolist(), tokenizer, model).cpu().numpy() # udah run tpi lama, jadi ku copy aja hasil output nya taruh di cell bawah

In [27]:
# Mengubah indeks kelas menjadi label asli
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

In [28]:
Counter(y_pred_labels)

Counter({'Demografi': 729,
         'Politik': 200,
         'Geografi': 14,
         'Ekonomi': 36,
         'Pertahanan dan Keamanan': 10,
         'Sosial Budaya': 9,
         'Sumber Daya Alam': 2})

In [29]:
submissions["Kelas"] = y_pred_labels

In [32]:
test

Unnamed: 0,IDText,Text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...
...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan..."
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...


In [30]:
submissions

Unnamed: 0,IDText,Kelas
0,TXT0001,Demografi
1,TXT0002,Demografi
2,TXT0003,Demografi
3,TXT0004,Demografi
4,TXT0005,Demografi
...,...,...
995,TXT0996,Demografi
996,TXT0997,Demografi
997,TXT0998,Sosial Budaya
998,TXT0999,Demografi


In [31]:
submissions.to_csv("/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Submissions/[IndoBERTweet [Augmented Data] - Clean_Text_5]SD2024040000208.csv", index = False)