# **Import Libraries & Data Loading**

In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re # if u want to learn regex [https://regex101.com/]
import string
import random # Random number generators - Library for generating random numbers, selecting random elements, shuffling sequences, etc.
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

pd.set_option("display.max_columns", None)
# pd.set_option("display.max_row", None)

In [38]:
!pip install sastrawi nlp-id



In [39]:
import nltk # Natural Language Toolkit - Library for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and more.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm # A Fast, Extensible Progress Bar - Library for creating progress bars to monitor the progress of iterations or tasks.
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nlp_id.lemmatizer import Lemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, BatchNormalization, Dropout, GRU, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import keras
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, # Precision score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total predicted positives.
    recall_score, # Recall score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total actual positives.
    f1_score, # F1 score - Harmonic mean of precision and recall, a metric for evaluating classification models.
    classification_report, # Classification report - Summary of the precision, recall, F1 score, and support for each class in a classification problem.
    accuracy_score, # Accuracy score - Metric for evaluating classification models, measuring the proportion of correct predictions to the total number of predictions.
    balanced_accuracy_score # Balanced accuracy score - Metric for evaluating classification models, measuring the accuracy of the model while accounting for imbalanced data.
)

In [40]:
def check_duplicates(dataframe):
    print("Duplicate Values (Top 10):")
    duplicate_values = dataframe[dataframe.duplicated()]
    print(f"Number of Duplicate Rows: {duplicate_values.shape[0]}")
    display(duplicate_values.head(10))

def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

def basic_data_info(dataframe):
    print("Data Preview:")
    print("---------------------------")
    display(dataframe.head())

    print("\nGeneral Info:")
    print("---------------------------")
    print(dataframe.info())

    print("\nDescriptive Statistics:")
    print("---------------------------")
    display(dataframe.describe().T)

# **Load Data**

In [41]:
mainPath = "/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data"
dataPath = os.path.join(mainPath, "Dataset-BDC-SatriaData-2024")
cleanDataPath = os.path.join(mainPath, "Clean Dataset")

In [42]:
train = pd.read_csv(dataPath + "/dataset_penyisihan_bdc_2024.csv", sep=";")
test = pd.read_csv(dataPath + "/dataset_unlabeled_penyisihan_bdc_2024.csv", sep=";")
submissions = pd.read_csv(dataPath + "/template_jawaban_penyisihan_bdc_2024.csv", sep=";")

# **Simple Explore the Data**

In [43]:
train.label.value_counts()

label
Politik                    2972
Sosial Budaya               587
Pertahanan dan Keamanan     400
Ideologi                    400
Ekonomi                     367
Sumber Daya Alam            192
Demografi                    62
Geografi                     20
Name: count, dtype: int64

# Splitting Data For Modeling

In [44]:
train = pd.read_excel(cleanDataPath + "/balanced train.xlsx").drop(columns = "Unnamed: 0")

In [45]:
train.label.value_counts()

label
Politik                    2969
Demografi                   427
Sosial Budaya               422
Ideologi                    343
Pertahanan dan Keamanan     331
Ekonomi                     309
Sumber Daya Alam            156
Geografi                    133
Name: count, dtype: int64

In [46]:
train.head()

Unnamed: 0,label,clean_text_5
0,Sumber Daya Alam,kunjung prabowo resmi serah proyek bantu air b...
1,Politik,anies tepuk tangan riah rektor wajib mata kuli...
2,Demografi,dukung goblok dukung ridwan kamil skema mayori...
3,Politik,anies sikap kritis kerja prabowo anggap sopan ...
4,Politik,anies baswedan harap aparatur sipil negara ten...


In [47]:
# Split data into 85% train and 15% test
X_train, X_test, y_train, y_test = train_test_split(train.clean_text_5, train.label, test_size=0.15, random_state=42, stratify=train.label)

print('Train Size : ', X_train.shape)
print('Test Size  : ', X_test.shape)

Train Size :  (4326,)
Test Size  :  (764,)


In [48]:
y_train.value_counts(), y_test.value_counts()

(label
 Politik                    2523
 Demografi                   363
 Sosial Budaya               359
 Ideologi                    291
 Pertahanan dan Keamanan     281
 Ekonomi                     263
 Sumber Daya Alam            133
 Geografi                    113
 Name: count, dtype: int64,
 label
 Politik                    446
 Demografi                   64
 Sosial Budaya               63
 Ideologi                    52
 Pertahanan dan Keamanan     50
 Ekonomi                     46
 Sumber Daya Alam            23
 Geografi                    20
 Name: count, dtype: int64)

In [49]:
# Change Target to One Hot Encoding
from tensorflow.keras.utils import to_categorical

# One-hot encoding pada target
y_train_ohe = to_categorical(y_train.factorize()[0])
y_test_ohe = to_categorical(y_test.factorize()[0])

print('One-Hot Encoded y_train:')
print(y_train_ohe)
print('One-Hot Encoded y_test:')
print(y_test_ohe)

One-Hot Encoded y_train:
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
One-Hot Encoded y_test:
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [50]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, get_scheduler, AutoModelForSequenceClassification, AdamW
from tqdm.auto import tqdm

# Encode labels as integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert to pandas Series if necessary
if not isinstance(y_train_encoded, pd.Series):
    y_train_encoded = pd.Series(y_train_encoded)
    y_test_encoded = pd.Series(y_test_encoded)

In [51]:
# Munculkan panduan label setelah encoding
label_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
print("Panduan Label setelah Encoding:")
for key, value in label_mapping.items():
    print(f"Encoded {key} untuk label {value}")

Panduan Label setelah Encoding:
Encoded 0 untuk label Demografi
Encoded 1 untuk label Ekonomi
Encoded 2 untuk label Geografi
Encoded 3 untuk label Ideologi
Encoded 4 untuk label Pertahanan dan Keamanan
Encoded 5 untuk label Politik
Encoded 6 untuk label Sosial Budaya
Encoded 7 untuk label Sumber Daya Alam


# Indobertweet Embedding - Finetune

In [52]:
# Define the Dataset class
class YourDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels.astype(int)  # Ensure labels are integers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [53]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

# Create datasets
train_dataset = YourDataset(X_train, y_train_encoded, tokenizer)
test_dataset = YourDataset(X_test, y_test_encoded, tokenizer)

# Define DataLoader parameters
batch_size = 16

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

print('DataLoader Created')

# Define model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=num_labels)
model.to(device)  # Move model to GPU if available
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the learning rate scheduler
num_epochs = 4
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

DataLoader Created


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Define evaluation function
def evaluate(model, dataloader):
    model.eval()
    total_eval_loss = 0
    total_eval_accuracy = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_eval_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        accuracy = (preds == labels).float().mean()
        total_eval_accuracy += accuracy.item()

    avg_loss = total_eval_loss / len(dataloader)
    avg_accuracy = total_eval_accuracy / len(dataloader)
    return avg_loss, avg_accuracy

In [55]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    total_train_accuracy = 0

    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        accuracy = (preds == labels).float().mean()
        total_train_accuracy += accuracy.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)

    test_loss, test_accuracy = evaluate(model, test_dataloader)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Accuracy: {avg_train_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

print("Training Completed")

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 1.0651 | Train Accuracy: 0.6700
Test Loss: 0.8339 | Test Accuracy: 0.7313


  0%|          | 0/271 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.6496 | Train Accuracy: 0.7879
Test Loss: 0.6988 | Test Accuracy: 0.7717


  0%|          | 0/271 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.4223 | Train Accuracy: 0.8736
Test Loss: 0.7308 | Test Accuracy: 0.7704


  0%|          | 0/271 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.2764 | Train Accuracy: 0.9150
Test Loss: 0.7615 | Test Accuracy: 0.7826
Training Completed


# Evaluasi Model

In [56]:
# Predict function
def predict(texts, tokenizer, model, max_length=128):
    # Ensure texts is a list of strings
    texts = [str(text) for text in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

# Predict on the test set
texts_to_predict = X_test.tolist()
predictions = predict(texts_to_predict, tokenizer, model)
decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

# Evaluation function
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    report = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Print the evaluation metrics
    print("Precision Score: ", precision)
    print("Recall Score: ", recall)
    print("F1 Score: ", f1)
    print("\nClassification Report:")
    print(report)
    print("Accuracy Score: ", accuracy)
    print("Balanced Accuracy Score: ", balanced_accuracy)

evaluate_model(y_test, decoded_predictions)

Precision Score:  0.783556853436667
Recall Score:  0.7827225130890052
F1 Score:  0.7820297974443329

Classification Report:
                         precision    recall  f1-score   support

              Demografi       0.90      0.94      0.92        64
                Ekonomi       0.70      0.67      0.69        46
               Geografi       0.95      0.90      0.92        20
               Ideologi       0.72      0.56      0.63        52
Pertahanan dan Keamanan       0.73      0.66      0.69        50
                Politik       0.84      0.86      0.85       446
          Sosial Budaya       0.43      0.48      0.45        63
       Sumber Daya Alam       0.61      0.61      0.61        23

               accuracy                           0.78       764
              macro avg       0.74      0.71      0.72       764
           weighted avg       0.78      0.78      0.78       764

Accuracy Score:  0.7827225130890052
Balanced Accuracy Score:  0.709091984269224


# **Saving Model**

In [63]:
# Save the model - First Saved
modelPath = os.path.join(mainPath, "Model_Trained")

In [64]:
model.save_pretrained(modelPath + "/Trained-indobertweet_Balanced_Dataset")
tokenizer.save_pretrained(modelPath + "/Trained-indobertweet_Balanced_Dataset")

('/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet_Balanced_Dataset/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet_Balanced_Dataset/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet_Balanced_Dataset/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet_Balanced_Dataset/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet_Balanced_Dataset/tokenizer.json')

In [None]:
# # Save the model - Second Saved
# model_save_path = os.path.join(modelPath, "indobertweet_model.pth")
# torch.save(model.state_dict(), model_save_path)
# print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/indobertweet_model.pth


https://huggingface.co/docs/transformers/v4.41.3/en/tasks/multiple_choice#inference

# **Load Model**

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("Rendika/tweets-election-classification")
# model = AutoModelForSequenceClassification.from_pretrained("Rendika/tweets-election-classification")

In [65]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = os.path.join(modelPath, "Trained-indobertweet_Balanced_Dataset")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# **Predict to Test**

In [None]:
# Memastikan model dan tokenizer sudah dimuat sebelumnya
# model, tokenizer = load_model_and_tokenizer(model_save_path)

# Fungsi untuk melakukan prediksi
def predict(texts, tokenizer, model, max_length=128):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    return preds

# Lakukan prediksi pada kolom "Text" di DataFrame
test["label"] = predict(test["Text"].tolist(), tokenizer, model).cpu().numpy()

# Decode predictions to original labels
test["label"] = label_encoder.inverse_transform(test["label"])

In [None]:
# Menampilkan beberapa hasil prediksi
test.head()

Unnamed: 0,IDText,Text,label
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,Politik
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,Politik
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,Ideologi
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,Politik
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,Politik


In [None]:
submissions["Kelas"] = test["label"]

In [None]:
# submissions.to_csv("/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Submissions/IndoBERTweet_Predict.csv")