# **Import Libraries & Data Loading**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re # if u want to learn regex [https://regex101.com/]
import string
import random # Random number generators - Library for generating random numbers, selecting random elements, shuffling sequences, etc.
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Uncomment to also lift the limit on displayed rows.
# pd.set_option("display.max_row", None)

In [None]:
!pip install sastrawi nlp-id

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nlp-id
  Downloading nlp_id-0.1.15.0.tar.gz (54.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget==3.2 (from nlp-id)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytest==7.3.1 (from nlp-id)
  Downloading pytest-7.3.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.5/320.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: nlp-id, wget
  Building wheel for nlp-id (setup.py) ... [?25l[?25hdone
  Created wheel for nlp-id: filename=nlp_id-0.1.15.0-py3-none-any.whl size=58153892 sha256=4a4847a

In [None]:
import nltk # Natural Language Toolkit - Library for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and more.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm # A Fast, Extensible Progress Bar - Library for creating progress bars to monitor the progress of iterations or tasks.
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nlp_id.lemmatizer import Lemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, BatchNormalization, Dropout, GRU, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import keras
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, # Precision score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total predicted positives.
    recall_score, # Recall score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total actual positives.
    f1_score, # F1 score - Harmonic mean of precision and recall, a metric for evaluating classification models.
    classification_report, # Classification report - Summary of the precision, recall, F1 score, and support for each class in a classification problem.
    accuracy_score, # Accuracy score - Metric for evaluating classification models, measuring the proportion of correct predictions to the total number of predictions.
    balanced_accuracy_score # Balanced accuracy score - Metric for evaluating classification models, measuring the accuracy of the model while accounting for imbalanced data.
)

In [None]:
def check_duplicates(dataframe):
    """Report fully duplicated rows of ``dataframe``.

    Prints a header and the number of rows flagged by
    ``DataFrame.duplicated()`` (rows equal to an earlier row across all
    columns), then displays the first ten of them. Returns nothing.
    """
    print("Duplicate Values (Top 10):")
    is_repeat = dataframe.duplicated()
    repeated_rows = dataframe.loc[is_repeat]
    n_repeats = len(repeated_rows)
    print(f"Number of Duplicate Rows: {n_repeats}")
    display(repeated_rows.iloc[:10])

def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns: ``Total``
    (number of null cells) and ``Percent`` (share of null cells, 0-100).
    Both series are derived from ``data.isnull()`` and sorted in descending
    order before being joined side by side.
    """
    null_flags = data.isnull()
    null_counts = null_flags.sum()
    row_counts = null_flags.count()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts * 100).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary

def basic_data_info(dataframe):
    """Print a quick overview of ``dataframe``.

    Shows, in order: the first rows (``head()``), the ``info()`` report
    (index, columns, non-null counts, dtypes, memory usage) and the
    transposed ``describe()`` statistics. Returns nothing.
    """
    print("Data Preview:")
    print("---------------------------")
    display(dataframe.head())

    print("\nGeneral Info:")
    print("---------------------------")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() only appended a stray "None" line to the output.
    dataframe.info()

    print("\nDescriptive Statistics:")
    print("---------------------------")
    display(dataframe.describe().T)

# **Load Data**

In [None]:
# Project root (presumably on a mounted Google Drive); the paths below are derived from it.
mainPath = "/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data"
# Folder the raw competition CSV files are read from.
dataPath = os.path.join(mainPath, "Dataset-BDC-SatriaData-2024")
# Folder the pre-processed ("clean") dataset is read from.
cleanDataPath = os.path.join(mainPath, "Clean Dataset")

In [None]:
# The competition files are semicolon-separated. Paths are built with
# os.path.join, the same way dataPath itself is constructed.
train = pd.read_csv(os.path.join(dataPath, "dataset_penyisihan_bdc_2024.csv"), sep=";")
test = pd.read_csv(os.path.join(dataPath, "dataset_unlabeled_penyisihan_bdc_2024.csv"), sep=";")
submissions = pd.read_csv(os.path.join(dataPath, "template_jawaban_penyisihan_bdc_2024.csv"), sep=";")

# **Simple Data Exploration**

In [None]:
# Report rows that are exact duplicates of an earlier row.
check_duplicates(train)
# drop duplicate entries considering all columns
train = train.drop_duplicates()

Duplicate Values (Top 10):
Number of Duplicate Rows: 381


Unnamed: 0,text,label
57,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
104,"RT Anak Muda Indonesia, the future of this nat...",Ideologi
145,"RT Pupuk bersubsidi langka, Tim Prabowo Gibran...",Ekonomi
146,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
189,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
220,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
234,"RT al fatihah buat Alm. Lambang Babar Purnomo,...",Pertahanan dan Keamanan
257,RT Kapitalisme neoliberal tak cocok untuk Indo...,Ideologi
298,"RT Anak Muda Indonesia, the future of this nat...",Ideologi
350,"RT According to Prabowo, Gaza is opressed beca...",Pertahanan dan Keamanan


In [None]:
# Per-column count and percentage of missing values in the training data.
missing_data(train)

Unnamed: 0,Total,Percent
text,0,0.0
label,0,0.0


In [None]:
# Preview, info() report and descriptive statistics of the training data.
basic_data_info(train)

Data Preview:
---------------------------


Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik



General Info:
---------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 4619 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4619 non-null   object
 1   label   4619 non-null   object
dtypes: object(2)
memory usage: 108.3+ KB
None

Descriptive Statistics:
---------------------------


Unnamed: 0,count,unique,top,freq
text,4619,4583,RT Abah Anies Janji Bakal Revisi UU KPK untuk ...,2
label,4619,8,Politik,2972


In [None]:
# Class distribution of the target column.
train.label.value_counts()

label
Politik                    2972
Sosial Budaya               425
Ideologi                    343
Pertahanan dan Keamanan     331
Ekonomi                     310
Sumber Daya Alam            157
Demografi                    61
Geografi                     20
Name: count, dtype: int64

# Splitting Data For Modeling

In [None]:
# Replace the raw frame with the pre-cleaned dataset; the split below reads its
# `clean_text_4` and `label` columns. The path is built with os.path.join, the
# same way cleanDataPath itself is constructed.
train = pd.read_csv(os.path.join(cleanDataPath, "Processing-Data-clean-text-4.csv"))

In [None]:
# Stratified hold-out split: 85% of the rows for training, 15% for evaluation.
# Stratifying on the label keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    train["clean_text_4"],
    train["label"],
    test_size=0.15,
    random_state=42,
    stratify=train["label"],
)

print('Train Size : ', X_train.shape)
print('Test Size  : ', X_test.shape)

Train Size :  (3926,)
Test Size  :  (693,)


In [None]:
# Per-class counts in both splits, to check the class balance after the stratified split.
y_train.value_counts(), y_test.value_counts()

(label
 Politik                    2526
 Sosial Budaya               361
 Ideologi                    292
 Pertahanan dan Keamanan     281
 Ekonomi                     264
 Sumber Daya Alam            133
 Demografi                    52
 Geografi                     17
 Name: count, dtype: int64,
 label
 Politik                    446
 Sosial Budaya               64
 Ideologi                    51
 Pertahanan dan Keamanan     50
 Ekonomi                     46
 Sumber Daya Alam            24
 Demografi                    9
 Geografi                     3
 Name: count, dtype: int64)

In [None]:
# Change Target to One Hot Encoding
from tensorflow.keras.utils import to_categorical

# Factorize the training labels once and reuse that label -> index mapping for
# the test labels. Factorizing each split separately numbers the classes by
# order of first appearance in that split, so the same one-hot column could
# stand for different classes in y_train_ohe and y_test_ohe.
train_codes, ohe_classes = y_train.factorize()
test_codes = ohe_classes.get_indexer(y_test)  # -1 marks a label unseen in y_train
if (test_codes < 0).any():
    raise ValueError("y_test contains labels that do not occur in y_train")

# One-hot encode the target; the width is fixed explicitly so both matrices
# always have exactly one column per class.
y_train_ohe = to_categorical(train_codes, num_classes=len(ohe_classes))
y_test_ohe = to_categorical(test_codes, num_classes=len(ohe_classes))

print('One-Hot Encoded y_train:')
print(y_train_ohe)
print('One-Hot Encoded y_test:')
print(y_test_ohe)

One-Hot Encoded y_train:
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
One-Hot Encoded y_test:
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, get_scheduler, AutoModelForSequenceClassification, AdamW
from tqdm.auto import tqdm

# Encode labels as integers. The encoder is fitted on the training labels only
# and then reused for the test labels so both share one label -> index mapping.
label_encoder = LabelEncoder()

# fit_transform()/transform() return NumPy arrays; wrap them in pandas Series
# unconditionally (the former isinstance() guard was always true and only
# inspected y_train_encoded). The Series get a fresh 0..n-1 index.
y_train_encoded = pd.Series(label_encoder.fit_transform(y_train))
y_test_encoded = pd.Series(label_encoder.transform(y_test))

In [None]:
# Show which integer code the encoder assigned to each class label.
label_mapping = dict(enumerate(label_encoder.classes_))
print("Panduan Label setelah Encoding:")
for key in label_mapping:
    value = label_mapping[key]
    print(f"Encoded {key} untuk label {value}")

Panduan Label setelah Encoding:
Encoded 0 untuk label Demografi
Encoded 1 untuk label Ekonomi
Encoded 2 untuk label Geografi
Encoded 3 untuk label Ideologi
Encoded 4 untuk label Pertahanan dan Keamanan
Encoded 5 untuk label Politik
Encoded 6 untuk label Sosial Budaya
Encoded 7 untuk label Sumber Daya Alam


# Indobertweet Embedding - Finetune

In [None]:
# Define the Dataset class
class YourDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of texts (pandas Series, list or array). Each item is
            passed through ``str()`` before tokenization.
        labels: Sequence of integer class ids, aligned positionally with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` whose
            result provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Wrapping in a Series lets plain lists/arrays work as well; items are
        # read positionally (.iloc), so the index a Series carries is irrelevant.
        self.texts = pd.Series(texts)
        self.labels = pd.Series(labels).astype(int)  # Ensure labels are integers
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label at position ``idx`` as a dict of tensors."""
        text = str(self.texts.iloc[idx])
        label = int(self.labels.iloc[idx])
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # flatten() turns the tokenizer's tensors into 1-D tensors so the
            # DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Checkpoint shared by the tokenizer and the model.
PRETRAINED_NAME = "indolem/indobertweet-base-uncased"

# Seed torch's global RNG so the newly initialised classifier head and the
# shuffling order of the training DataLoader are repeatable between runs.
torch.manual_seed(42)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

# Create datasets
train_dataset = YourDataset(X_train, y_train_encoded, tokenizer)
test_dataset = YourDataset(X_test, y_test_encoded, tokenizer)

# Define DataLoader parameters
batch_size = 16

# Create DataLoaders (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

print('DataLoader Created')

# Define model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=num_labels)
model.to(device)  # Move model to GPU if available
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set to the values the transformers class used by default
# so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Define the learning rate scheduler: linear decay over all optimizer steps,
# without warm-up.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

DataLoader Created


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define evaluation function
def evaluate(model, dataloader):
    """Compute the mean loss and accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
            It is switched to eval mode and left in that mode.
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(avg_loss, avg_accuracy)`` as floats, averaged over samples.
        ``(nan, nan)`` when the dataloader yields no samples.
    """
    model.eval()
    # Move batches to wherever the model's weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Weight by batch size: averaging per-batch means would give the
            # (usually smaller) last batch the same weight as a full one.
            # outputs.loss is treated as the mean loss of the batch.
            batch_len = labels.size(0)
            loss_sum += outputs.loss.item() * batch_len
            preds = torch.argmax(outputs.logits, dim=-1)
            correct += (preds == labels).sum().item()
            seen += batch_len

    if seen == 0:
        return float("nan"), float("nan")
    return loss_sum / seen, correct / seen

In [None]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    # Accumulate per-sample totals so the epoch metrics are true means even
    # when the last batch is smaller than batch_size.
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_samples = 0

    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        batch_len = labels.size(0)
        # loss is treated as the mean over the batch, hence the re-weighting.
        total_train_loss += loss.item() * batch_len

        preds = torch.argmax(logits, dim=-1)
        total_train_correct += (preds == labels).sum().item()
        total_train_samples += batch_len

        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per optimizer step

    avg_train_loss = total_train_loss / total_train_samples
    avg_train_accuracy = total_train_correct / total_train_samples

    # Metrics on the held-out split after every epoch.
    test_loss, test_accuracy = evaluate(model, test_dataloader)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Accuracy: {avg_train_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

print("Training Completed")

  0%|          | 0/246 [00:00<?, ?it/s]

Epoch 1/3
Train Loss: 0.9223 | Train Accuracy: 0.7168
Test Loss: 0.7288 | Test Accuracy: 0.7577


  0%|          | 0/246 [00:00<?, ?it/s]

Epoch 2/3
Train Loss: 0.5725 | Train Accuracy: 0.8236
Test Loss: 0.6996 | Test Accuracy: 0.7821


  0%|          | 0/246 [00:00<?, ?it/s]

Epoch 3/3
Train Loss: 0.3612 | Train Accuracy: 0.8968
Test Loss: 0.7106 | Test Accuracy: 0.7963
Training Completed


# Model Evaluation

In [None]:
# Predict function
def predict(texts, tokenizer, model, max_length=128, batch_size=32):
    """Predict a class index for every text.

    Args:
        texts: Iterable of texts; every item is converted with ``str()``.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, padding=True,
            truncation=True, max_length=..., return_tensors='pt')`` whose
            result provides ``'input_ids'`` and ``'attention_mask'``.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``. It is switched to eval mode and left
            in that mode.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D ``torch.long`` tensor of predicted class indices, in input order,
        on the model's device.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Ensure texts is a list of strings
    texts = [str(text) for text in texts]

    model.eval()
    # Move inputs to wherever the model's weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    if not texts:
        return torch.empty(0, dtype=torch.long, device=target_device)

    # Run in chunks: a single forward pass over every text at once makes
    # memory use grow with the size of the input.
    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = tokenizer(chunk, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
            input_ids = inputs['input_ids'].to(target_device)
            attention_mask = inputs['attention_mask'].to(target_device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_preds.append(torch.argmax(logits, dim=-1))
    return torch.cat(batch_preds)

# Predict on the test set
texts_to_predict = X_test.tolist()
predictions = predict(texts_to_predict, tokenizer, model)
# Map predicted class indices back to label strings (moved to CPU/NumPy first).
decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

# Evaluation function
def evaluate_model(y_true, y_pred):
    """Print classification metrics for ``y_pred`` against ``y_true``.

    Prints weighted precision, recall and F1, the per-class classification
    report, plain accuracy and balanced accuracy. Returns nothing.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # zero_division=0 scores a class with an undefined metric (e.g. one that
    # is never predicted) as 0 - the value sklearn falls back to anyway -
    # without emitting an UndefinedMetricWarning for every call.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Print the evaluation metrics
    print("Precision Score: ", precision)
    print("Recall Score: ", recall)
    print("F1 Score: ", f1)
    print("\nClassification Report:")
    print(report)
    print("Accuracy Score: ", accuracy)
    print("Balanced Accuracy Score: ", balanced_accuracy)

evaluate_model(y_test, decoded_predictions)

Precision Score:  0.7931762329593315
Recall Score:  0.7994227994227994
F1 Score:  0.7901133255762076

Classification Report:
                         precision    recall  f1-score   support

              Demografi       1.00      0.11      0.20         9
                Ekonomi       0.70      0.65      0.67        46
               Geografi       0.00      0.00      0.00         3
               Ideologi       0.74      0.61      0.67        51
Pertahanan dan Keamanan       0.82      0.74      0.78        50
                Politik       0.85      0.91      0.88       446
          Sosial Budaya       0.62      0.56      0.59        64
       Sumber Daya Alam       0.46      0.46      0.46        24

               accuracy                           0.80       693
              macro avg       0.65      0.51      0.53       693
           weighted avg       0.79      0.80      0.79       693

Accuracy Score:  0.7994227994227994
Balanced Accuracy Score:  0.5058449626276065


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Saving Model**

In [None]:
# Save the model - First Saved
# Directory under the project root that the trained model is saved into.
modelPath = os.path.join(mainPath, "Model_Trained")

In [None]:
# Save the model and the tokenizer into the same directory so it can be
# reloaded with from_pretrained(). The path is built with os.path.join, like
# the other paths in this notebook.
trained_model_dir = os.path.join(modelPath, "Trained-indobertweet2")
model.save_pretrained(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)

('/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet2/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet2/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet2/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet2/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/Trained-indobertweet2/tokenizer.json')

In [None]:
# # Save the model - Second Saved
# model_save_path = os.path.join(modelPath, "indobertweet_model.pth")
# torch.save(model.state_dict(), model_save_path)
# print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data/Model_Trained/indobertweet_model.pth


https://huggingface.co/docs/transformers/v4.41.3/en/tasks/multiple_choice#inference

# **Load Model**

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("Rendika/tweets-election-classification")
# model = AutoModelForSequenceClassification.from_pretrained("Rendika/tweets-election-classification")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Directory the fine-tuned model and tokenizer are loaded from.
model_path = os.path.join(modelPath, "Trained-indobertweet2")

# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# Switch to eval mode for inference; as the cell's last expression this also
# displays the model architecture.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# **Model Inference to Submissions**

In [None]:
# Assumes the model and tokenizer have already been loaded in an earlier cell
# model, tokenizer = load_model_and_tokenizer(model_save_path)

# Prediction function
def predict(texts, tokenizer, model, max_length=128, batch_size=32):
    """Predict a class index for every text.

    Args:
        texts: Iterable of texts; every item is converted with ``str()`` so
            non-string values do not break the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, padding=True,
            truncation=True, max_length=..., return_tensors='pt')`` whose
            result provides ``'input_ids'`` and ``'attention_mask'``.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``. It is switched to eval mode and left
            in that mode.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D ``torch.long`` tensor of predicted class indices, in input order,
        on the model's device.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = [str(text) for text in texts]

    model.eval()
    # Move inputs to wherever the model's weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    if not texts:
        return torch.empty(0, dtype=torch.long, device=target_device)

    # Run in chunks: a single forward pass over every text at once makes
    # memory use grow with the size of the input.
    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = tokenizer(chunk, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
            input_ids = inputs['input_ids'].to(target_device)
            attention_mask = inputs['attention_mask'].to(target_device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_preds.append(torch.argmax(logits, dim=-1))
    return torch.cat(batch_preds)

In [None]:
from collections import Counter

In [None]:
# Run inference on the unlabeled competition texts. This call was slow, so its
# printed output had been pasted by hand into the next cell; keeping the result
# in a variable makes it reusable without copy-pasting.
y_pred_indices = predict(test["Text"].tolist(), tokenizer, model).cpu().numpy()
y_pred_indices

In [None]:
y_pred_indices = [5, 5, 3, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 6, 5, 5, 5,
       1, 7, 1, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1,
       5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 1,
       5, 5, 5, 5, 5, 5, 5, 4, 5, 1, 5, 1, 5, 5, 4, 4, 5, 5, 4, 1, 5, 6,
       5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 6, 5, 5, 6, 5, 5, 1, 5,
       5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 6, 5, 5, 5, 5, 5, 5, 5, 4, 5,
       5, 5, 6, 5, 5, 1, 5, 5, 1, 1, 5, 5, 5, 4, 1, 5, 1, 5, 5, 4, 5, 5,
       5, 1, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 4, 7, 5, 5, 5, 5, 4, 5, 5, 5,
       5, 5, 5, 5, 5, 1, 5, 7, 5, 4, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 6, 5,
       5, 5, 1, 5, 4, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 4, 5, 5, 5, 4,
       5, 6, 4, 5, 1, 1, 5, 1, 6, 5, 5, 5, 5, 1, 6, 5, 5, 5, 5, 5, 5, 5,
       1, 5, 1, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 1, 5, 5, 1, 4, 5, 1, 5,
       5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5,
       1, 5, 1, 1, 4, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5,
       4, 1, 5, 4, 6, 1, 5, 5, 3, 5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 4, 5, 5,
       5, 5, 1, 5, 4, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 4, 5, 5, 4, 5, 4,
       4, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 1, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 1, 7, 5, 5, 5, 5, 5, 5, 5,
       5, 1, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 6, 5, 1, 5, 5,
       6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 4, 5, 1, 5, 5, 5, 5,
       5, 5, 5, 5, 1, 5, 5, 1, 5, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5,
       3, 1, 5, 5, 5, 1, 5, 5, 5, 6, 5, 1, 5, 6, 5, 5, 5, 5, 4, 1, 5, 5,
       5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 1, 1, 5,
       7, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 1, 1, 5, 1, 4, 5, 5, 5,
       5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 1, 7, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 1, 6, 5, 5, 5, 5, 5, 5, 4, 5, 5, 1, 5, 5, 5, 5, 5, 6,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 5, 5,
       5, 5, 3, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 3, 5, 5, 6, 5, 5, 5, 4, 5,
       5, 5, 5, 1, 1, 5, 6, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 4, 5, 4, 5, 5,
       5, 5, 1, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 1, 5, 3, 6, 5, 5, 5, 5, 5,
       1, 1, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 1, 5, 5, 5, 5, 5,
       1, 5, 7, 5, 4, 7, 5, 5, 5, 7, 5, 5, 5, 5, 1, 5, 5, 1, 5, 1, 3, 5,
       5, 5, 5, 5, 5, 5, 7, 6, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5,
       1, 1, 5, 4, 7, 0, 6, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3,
       1, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 3, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5,
       5, 6, 6, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 4, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 1, 5, 5, 1, 1, 5,
       5, 5, 5, 5, 1, 4, 5, 5, 5, 5, 5, 1, 4, 4, 5, 5, 5, 5, 5, 5, 1, 6,
       5, 4, 5, 5, 4, 5, 7, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5,
       5, 5, 5, 5, 4, 5, 3, 5, 5, 5, 1, 1, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 1, 4, 6, 5, 5, 7, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       1, 5, 1, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 6, 5,
       5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 6, 5, 5, 5, 4, 5, 5, 5,
       5, 5, 5, 5, 5, 1, 5, 5, 1, 5]

# Convert the class indices back to the original label names
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

In [None]:
# Number of predictions per label.
Counter(y_pred_labels)

Counter({'Politik': 751,
         'Ideologi': 13,
         'Ekonomi': 119,
         'Sosial Budaya': 42,
         'Sumber Daya Alam': 14,
         'Pertahanan dan Keamanan': 60,
         'Demografi': 1})

In [None]:
# Write the predicted labels into the "Kelas" column of the submission template.
submissions["Kelas"] = y_pred_labels

In [None]:
# Inspect the filled-in submission table.
submissions

Unnamed: 0,IDText,Kelas
0,TXT0001,Politik
1,TXT0002,Politik
2,TXT0003,Ideologi
3,TXT0004,Politik
4,TXT0005,Politik
...,...,...
995,TXT0996,Ekonomi
996,TXT0997,Politik
997,TXT0998,Politik
998,TXT0999,Ekonomi


In [None]:
# Write the submission file; the location is derived from mainPath instead of
# repeating the absolute project path.
submission_path = os.path.join(mainPath, "Submissions", "[IndoBERTweet-Clean_Text_4]SD2024040000208.csv")
submissions.to_csv(submission_path, index=False)