# **Import Libraries & Data Loading**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re # if u want to learn regex [https://regex101.com/]
import string
import random # Random number generators - Library for generating random numbers, selecting random elements, shuffling sequences, etc.
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

pd.set_option("display.max_columns", None)
# pd.set_option("display.max_row", None)

In [2]:
!pip install sastrawi nlp-id catboost

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nlp-id
  Downloading nlp_id-0.1.15.0.tar.gz (54.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting wget==3.2 (from nlp-id)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytest==7.3.1 (from nlp-id)
  Downloading pytest-7.3.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.5/320.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00

In [3]:
import nltk # Natural Language Toolkit - Library for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and more.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm # A Fast, Extensible Progress Bar - Library for creating progress bars to monitor the progress of iterations or tasks.
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nlp_id.lemmatizer import Lemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, BatchNormalization, Dropout, GRU, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import keras
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_score, # Precision score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total predicted positives.
    recall_score, # Recall score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total actual positives.
    f1_score, # F1 score - Harmonic mean of precision and recall, a metric for evaluating classification models.
    classification_report, # Classification report - Summary of the precision, recall, F1 score, and support for each class in a classification problem.
    accuracy_score, # Accuracy score - Metric for evaluating classification models, measuring the proportion of correct predictions to the total number of predictions.
    roc_auc_score,
    confusion_matrix,
    balanced_accuracy_score)

In [4]:
def check_duplicates(dataframe):
    """Report fully duplicated rows of ``dataframe``.

    Prints a heading and the number of rows flagged by
    ``DataFrame.duplicated()``, then displays at most the first ten of them.
    Returns nothing.
    """
    dup_mask = dataframe.duplicated()
    dup_rows = dataframe.loc[dup_mask]
    print("Duplicate Values (Top 10):")
    print(f"Number of Duplicate Rows: {len(dup_rows)}")
    display(dup_rows.head(10))

def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of nulls) and 'Percent' (nulls as a percentage of rows),
    ordered from the most to the least incomplete column.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

def basic_data_info(dataframe):
    """Show a quick overview of a DataFrame.

    Displays the first rows, the ``info()`` report and the transposed
    ``describe()`` table, each under a printed heading.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is printed / displayed as a side effect.
    """
    print("Data Preview:")
    print("---------------------------")
    display(dataframe.head())

    print("\nGeneral Info:")
    print("---------------------------")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    dataframe.info()

    print("\nDescriptive Statistics:")
    print("---------------------------")
    display(dataframe.describe().T)

# **Load Data**

In [6]:
# Mount Google Drive at /content/drive; the dataset paths defined in the
# following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [86]:
# Project root on the mounted Drive. dataPath holds the competition CSVs and
# cleanDataPath the pre-cleaned CSV/Excel files read in the cells below.
mainPath = "/content/drive/MyDrive/Colab Notebooks/Satria Data Another Device"
dataPath = os.path.join(mainPath, "Dataset-BDC-SatriaData-2024")
cleanDataPath = os.path.join(mainPath, "Clean Dataset")

In [87]:
# Competition files (semicolon-separated). Paths are built with os.path.join,
# matching how dataPath itself is constructed.
# NOTE: `train` and `test` are rebound later by the train_test_split cell, so
# these frames are only reachable until that cell runs.
train = pd.read_csv(os.path.join(dataPath, "dataset_penyisihan_bdc_2024.csv"), sep=";")
test = pd.read_csv(os.path.join(dataPath, "dataset_unlabeled_penyisihan_bdc_2024.csv"), sep=";")
submissions = pd.read_csv(os.path.join(dataPath, "template_jawaban_penyisihan_bdc_2024.csv"), sep=";")

In [88]:
# Pre-cleaned training data; it provides the "clean_text_6" and "label"
# columns used by the modelling cells below.
train2 = pd.read_csv(os.path.join(cleanDataPath, "Processing-Data-clean-text-6.csv"))

In [89]:
# Augmentation workbooks. The exported index column is dropped and the text
# column is renamed to the common name "text" so both frames can be stacked
# with the training data later.
Geo_Aug = (
    pd.read_excel(os.path.join(cleanDataPath, "final_geografi_augmentasi.xlsx"))
    .drop(columns="Unnamed: 0")
    .rename(columns={"clean_text_5": "text"})
)
Demo_Aug = (
    pd.read_excel(os.path.join(cleanDataPath, "final_demografi_augmentasi_3.xlsx"))
    .drop(columns="Unnamed: 0")
    .rename(columns={"clean_text_6": "text"})
)

# **CatBoost with text feature**

In [90]:
# Class distribution of the cleaned training data
train2["label"].value_counts()

label
Politik                    2969
Sosial Budaya               422
Ideologi                    343
Pertahanan dan Keamanan     331
Ekonomi                     309
Sumber Daya Alam            156
Demografi                    61
Geografi                     19
Name: count, dtype: int64

In [91]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool
import time
import pickle

## Loading and Splitting the Clean Data

In [92]:
# Stratified 80/20 hold-out split on the label column.
# NOTE: this rebinds `train` / `test`, replacing the frames read from the
# competition CSVs earlier in the notebook.
train, test = train_test_split(train2, test_size=0.20, random_state=42, stratify=train2['label'])

# Training features and labels. The feature is the single cleaned-text column,
# so it is selected directly instead of dropping 'label' first.
X_train = train["clean_text_6"]
y_train = train['label']

# Hold-out features and labels
X_test = test["clean_text_6"]
y_test = test['label']

# Print the shapes to confirm the split is consistent
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3688,)
y_train shape: (3688,)
X_test shape: (922,)
y_test shape: (922,)


In [93]:
# Combine X_train and y_train into a single DataFrame (text column + label column)
train_temp = pd.concat([X_train, y_train], axis=1)

In [94]:
# Combine X_test and y_test into a single DataFrame (text column + label column)
test_temp = pd.concat([X_test, y_test], axis=1)

In [95]:
# Index labels of the rows to move out of the training frame.
# NOTE(review): these labels presumably belong to the training part only for
# the split made with random_state=42 — `.loc` raises KeyError if any label is
# missing, which also happens when this cell is run a second time.
index_to_select = [1128, 4102, 2629, 509, 3822]

# Select the rows with those index labels
selected_rows = train_temp.loc[index_to_select]

# Remove those rows from the training frame
train_temp = train_temp.drop(index_to_select)

In [96]:
# Inspect the training rows that remain for the "Geografi" class
train_temp.loc[train_temp["label"] == "Geografi"]

Unnamed: 0,clean_text_6,label
2469,semangat bentar puncak,Geografi
3480,periksa,Geografi
1526,sumber makan kebun gunung kalimantan tanggung ...,Geografi
2419,hapus jakarta bekas videotron muncul surabaya ...,Geografi
885,orgnya panen uang kota negara tanah adik bilan...,Geografi
3408,titip pikir gagas program tingkat sumber daya ...,Geografi
294,golput hebat guru juta,Geografi
2812,pulau sebira perhati zaman gubernur omong loka...,Geografi
3197,peta politik lihat periksa gubernur jakarta su...,Geografi
2022,teman komentar janji manis asli janji beda beb...,Geografi


In [97]:
# Append selected_rows to test_temp; ignore_index renumbers the result from 0.
# NOTE: not idempotent — re-running this cell appends the same rows again.
test_temp = pd.concat([test_temp, selected_rows], ignore_index=True)

In [98]:
# Class distribution of the training frame after moving rows out
train_temp["label"].value_counts()

label
Politik                    2375
Sosial Budaya               338
Ideologi                    274
Pertahanan dan Keamanan     265
Ekonomi                     247
Sumber Daya Alam            125
Demografi                    49
Geografi                     10
Name: count, dtype: int64

In [99]:
# Class distribution of the hold-out frame after appending the moved rows
test_temp["label"].value_counts()

label
Politik                    594
Sosial Budaya               84
Ideologi                    69
Pertahanan dan Keamanan     66
Ekonomi                     62
Sumber Daya Alam            31
Demografi                   12
Geografi                     9
Name: count, dtype: int64

In [100]:
# Keep label + text only, exposing the text under the generic name "text"
# that the augmentation frames use.
train_pakai = train_temp.loc[:, ["label", "clean_text_6"]].rename(columns={"clean_text_6": "text"})

In [101]:
def augment_data(train_df, geo_aug_df, demo_aug_df):
    """Stack the training frame with the two augmentation frames.

    The rows of ``train_df`` come first, followed by ``geo_aug_df`` and then
    ``demo_aug_df``. The result gets a fresh 0..n-1 index and the inputs are
    left untouched.
    """
    frames = [train_df.copy(), geo_aug_df, demo_aug_df]
    return pd.concat(frames, ignore_index=True)

# Build the augmented training set: stack the frames, drop exact duplicate
# rows, then restore the "clean_text_6" column name used by the later cells.
train_aug = augment_data(train_pakai, Geo_Aug, Demo_Aug)
train_aug = train_aug.drop_duplicates()
train_aug = train_aug.rename(columns={"text": "clean_text_6"})

In [102]:
# Drop rows containing any missing value. The names are rebound instead of
# using inplace=True, so no frame is mutated behind the reader's back.
train_aug = train_aug.dropna()
test_temp = test_temp.dropna()

In [103]:
# Training features and labels from the augmented frame. The feature is the
# single cleaned-text column, so it is selected directly instead of dropping
# 'label' first.
X_train = train_aug["clean_text_6"]
y_train = train_aug['label']

# Hold-out features and labels
X_test = test_temp["clean_text_6"]
y_test = test_temp['label']

In [104]:
# Print the shape of each part to confirm features and labels line up
for caption, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

X_train shape: (3621,)
y_train shape: (3621,)
X_test shape: (924,)
y_test shape: (924,)


## Encode Label(y)

In [105]:
# Label encoding: the mapping is fitted on the training labels and reused
# unchanged for the hold-out labels.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# One-hot encoding with the category set fixed to every encoder class, so both
# matrices always have one column per class in the same order. Plain
# get_dummies only creates columns for values that occur, which would drop a
# column (and misalign train and test) if a class were absent from a split.
class_ids = np.arange(len(label_encoder.classes_))
y_train_ohe = pd.get_dummies(pd.Categorical(y_train_enc, categories=class_ids)).values
y_test_ohe = pd.get_dummies(pd.Categorical(y_test_enc, categories=class_ids)).values

In [106]:
# Show the lookup from integer code to original label produced by the encoder
label_mapping = dict(enumerate(label_encoder.classes_))
print("Panduan Label setelah Encoding:")
for key in label_mapping:
    value = label_mapping[key]
    print(f"Encoded {key} untuk label {value}")

Panduan Label setelah Encoding:
Encoded 0 untuk label Demografi
Encoded 1 untuk label Ekonomi
Encoded 2 untuk label Geografi
Encoded 3 untuk label Ideologi
Encoded 4 untuk label Pertahanan dan Keamanan
Encoded 5 untuk label Politik
Encoded 6 untuk label Sosial Budaya
Encoded 7 untuk label Sumber Daya Alam


In [107]:
# Print both one-hot matrices, each under its own heading
for caption, encoded in (
    ('One-Hot Encoded y_train:', y_train_ohe),
    ('One-Hot Encoded y_test:', y_test_ohe),
):
    print(caption)
    print(encoded)

One-Hot Encoded y_train:
[[False False False ...  True False False]
 [False  True False ... False False False]
 [False False False ...  True False False]
 ...
 [ True False False ... False False False]
 [ True False False ... False False False]
 [ True False False ... False False False]]
One-Hot Encoded y_test:
[[False False False ...  True False False]
 [False False False ...  True False False]
 [False  True False ... False False False]
 ...
 [False False  True ... False False False]
 [False False  True ... False False False]
 [False False  True ... False False False]]


In [108]:
# Print both integer-encoded label arrays, each under its own heading
for caption, encoded in (
    ('Label Encoded y_train:', y_train_enc),
    ('Label Encoded y_test:', y_test_enc),
):
    print(caption)
    print(encoded)

Label Encoded y_train:
[5 1 5 ... 0 0 0]
Label Encoded y_test:
[5 5 1 5 5 0 5 5 5 5 1 6 1 5 6 5 5 5 5 5 1 0 5 1 6 5 7 5 5 5 5 4 5 7 5 5 5
 7 5 6 1 5 5 5 5 5 5 5 3 6 5 3 4 6 5 0 5 6 5 4 5 5 6 5 6 3 4 7 2 5 5 5 5 1
 5 1 5 5 6 5 3 3 5 5 5 5 5 5 5 1 3 0 5 5 5 5 5 5 5 1 5 1 1 5 6 4 5 5 5 3 5
 6 5 5 6 5 6 5 5 6 4 5 6 3 5 5 4 3 5 4 5 5 1 5 5 5 6 5 5 5 5 6 6 5 5 4 4 4
 5 5 7 1 4 5 5 5 5 5 5 5 5 6 5 5 1 6 5 5 6 4 7 5 5 5 3 5 5 7 5 4 4 5 3 5 5
 6 5 5 3 5 5 5 5 3 5 5 5 5 5 5 5 4 1 4 5 5 5 5 4 5 5 5 3 1 5 5 5 6 5 5 6 6
 3 5 5 5 5 5 5 5 3 5 6 7 4 5 3 5 5 6 4 6 5 5 5 5 1 5 5 6 5 3 1 5 5 5 5 5 5
 5 5 5 1 5 3 4 1 5 5 3 3 5 5 5 3 5 6 5 5 5 1 5 1 3 5 6 6 3 5 5 5 3 3 4 5 4
 5 5 5 5 5 1 5 5 5 5 4 5 0 5 5 4 5 6 7 0 5 5 5 5 1 5 5 5 5 5 5 5 5 5 3 5 5
 6 5 5 5 5 6 5 7 5 5 5 5 5 5 1 4 5 5 0 4 5 7 5 4 5 5 5 5 5 5 5 4 5 3 5 3 5
 5 5 5 5 5 1 5 6 5 4 5 5 5 5 0 3 5 7 5 5 4 5 5 5 5 5 5 5 4 5 5 5 5 7 3 5 5
 4 5 3 5 5 1 5 5 7 5 5 5 5 4 5 5 3 5 4 1 5 1 5 5 3 1 5 5 3 5 7 5 5 6 5 1 6
 5 6 0 5 6 5 5 7 4 5 5 5 5 5 5 5 3 5 

## Class Weight Definition

In [109]:
from collections import Counter

def counter_of_value(lst):
    """Count how often each distinct element occurs in ``lst``.

    Returns a DataFrame with the columns 'Value' (the distinct elements,
    ascending) and 'Count' (their number of occurrences), indexed 0..n-1.
    """
    tally = Counter(lst)
    return (
        pd.DataFrame.from_dict(tally, orient='index', columns=['Count'])
        .rename_axis('Value')
        .sort_values(by="Value")
        .reset_index()
    )

In [110]:
# Per-class counts of the encoded training and hold-out labels
counter_y_train_enc, counter_y_test_enc = (
    counter_of_value(encoded) for encoded in (y_train_enc, y_test_enc)
)

In [111]:
# Display the two count tables as a (train, test) tuple
counter_y_train_enc, counter_y_test_enc

(   Value  Count
 0      0    109
 1      1    219
 2      2    120
 3      3    213
 4      4    218
 5      5   2299
 6      6    326
 7      7    117,
    Value  Count
 0      0     12
 1      1     62
 2      2      9
 3      3     69
 4      4     66
 5      5    591
 6      6     84
 7      7     31)

In [112]:
# Compute "balanced" class weights from the encoded training labels, then key
# them by their position in the weight array for printing and saving.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_enc),
    y=y_train_enc,
)
class_weights_dict = dict(enumerate(class_weights))
print('Class Weights:', class_weights_dict)

Class Weights: {0: 4.152522935779817, 1: 2.066780821917808, 2: 3.771875, 3: 2.125, 4: 2.0762614678899083, 5: 0.1968790778599391, 6: 1.388420245398773, 7: 3.8685897435897436}


## Modeling CatBoost

In [113]:
# CatBoost hyper-parameters
catboost_params = dict(
    iterations=1000,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # requires a GPU runtime
    early_stopping_rounds=100,
    use_best_model=True,
    verbose=100,  # log every 100th iteration
    class_weights=class_weights.tolist(),  # plain list built from the weight array
)

In [114]:
# Train the model and measure the wall-clock time
start = time.time()

# Both pools wrap a single text column, registered as text feature "clean_text_6"
train_pool = Pool(X_train, y_train_enc, text_features=['clean_text_6'], feature_names=['clean_text_6'])
test_pool = Pool(X_test, y_test_enc, text_features=['clean_text_6'], feature_names=['clean_text_6'])

model = CatBoostClassifier(**catboost_params)
# NOTE(review): the hold-out pool is passed as eval_set while the parameters
# enable early stopping and use_best_model, so the number of iterations is
# chosen on the same data the evaluation cell scores — treat the test metrics
# as optimistic.
model.fit(train_pool, eval_set=test_pool)

end = time.time()
print(f"Model training completed in {(end-start)/60:.2f} minutes")

0:	learn: 1.9079748	test: 1.9233294	best: 1.9233294 (0)	total: 35ms	remaining: 34.9s
100:	learn: 1.0706117	test: 1.2556968	best: 1.2552095 (97)	total: 1.29s	remaining: 11.5s
200:	learn: 0.9696675	test: 1.2445243	best: 1.2421989 (154)	total: 2.37s	remaining: 9.41s
300:	learn: 0.9009825	test: 1.2370703	best: 1.2370703 (300)	total: 3.51s	remaining: 8.14s
400:	learn: 0.8353138	test: 1.2338422	best: 1.2334834 (330)	total: 4.54s	remaining: 6.79s
bestTest = 1.233483367
bestIteration = 330
Shrink model to first 331 iterations.
Model training completed in 0.13 minutes


## Evaluasi Model

In [115]:
# Predict class ids for both splits. The predictions are flattened to 1-D
# because the multiclass predict output can come back as a column vector,
# which sklearn utilities only accept with a conversion warning.
y_pred_train = np.ravel(model.predict(X_train.to_frame()))
y_pred_test = np.ravel(model.predict(X_test.to_frame()))

In [116]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, balanced_accuracy_score, f1_score, recall_score, precision_score

def _classification_metrics(y_true, y_pred):
    """Compute the evaluation metrics for one split.

    Returns a 7-tuple: accuracy, confusion matrix, classification report
    (text), balanced accuracy, and the weighted-average F1, recall and
    precision — in that order.
    """
    return (
        accuracy_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
        classification_report(y_true, y_pred),
        balanced_accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        precision_score(y_true, y_pred, average='weighted'),
    )


# Evaluation metrics for the training set
(train_acc, train_conf_matrix, train_class_report, train_balanced_acc,
 train_f1, train_recall, train_precision) = _classification_metrics(y_train_enc, y_pred_train)

# Evaluation metrics for the test set
(test_acc, test_conf_matrix, test_class_report, test_balanced_acc,
 test_f1, test_recall, test_precision) = _classification_metrics(y_test_enc, y_pred_test)

# Collect the evaluation results in one dictionary
evaluation_results = {
    'Train Accuracy': train_acc,
    'Train Confusion Matrix': train_conf_matrix,
    'Train Classification Report': train_class_report,
    'Train Balanced Accuracy': train_balanced_acc,
    'Train F1 Score': train_f1,
    'Train Recall': train_recall,
    'Train Precision': train_precision,
    'Test Accuracy': test_acc,
    'Test Confusion Matrix': test_conf_matrix,
    'Test Classification Report': test_class_report,
    'Test Balanced Accuracy': test_balanced_acc,
    'Test F1 Score': test_f1,
    'Test Recall': test_recall,
    'Test Precision': test_precision
}

# Print the evaluation results
print("Train Accuracy:", train_acc)
print("Train Confusion Matrix:\n", train_conf_matrix)
print("Train Classification Report:\n", train_class_report)
print("Train Balanced Accuracy:", train_balanced_acc)
print("Train F1 Score:", train_f1)
print("Train Recall:", train_recall)
print("Train Precision:", train_precision, "\n")
print("="*50, "\n")
print("Test Accuracy:", test_acc)
print("Test Confusion Matrix:\n", test_conf_matrix)
print("Test Classification Report:\n", test_class_report)
print("Test Balanced Accuracy:", test_balanced_acc)
print("Test F1 Score:", test_f1)
print("Test Recall:", test_recall)
print("Test Precision:", test_precision)

Train Accuracy: 0.6940071803369235
Train Confusion Matrix:
 [[ 102    0    0    0    0    3    3    1]
 [   2  193    0    0    0    8    7    9]
 [   2    0  117    1    0    0    0    0]
 [   5    6    0  161    4   14   15    8]
 [   3    1    0    5  196    2    7    4]
 [  76  128    4  230  145 1381  255   80]
 [  10   13    1   10    8   11  259   14]
 [   3    3    0    1    0    4    2  104]]
Train Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.94      0.65       109
           1       0.56      0.88      0.69       219
           2       0.96      0.97      0.97       120
           3       0.39      0.76      0.52       213
           4       0.56      0.90      0.69       218
           5       0.97      0.60      0.74      2299
           6       0.47      0.79      0.59       326
           7       0.47      0.89      0.62       117

    accuracy                           0.69      3621
   macro avg       0.61

# Saving and Loading the Model

In [74]:
# Example base path for saving and loading the model
model_base_path = os.path.join(mainPath, "Model_Trained/")

In [75]:
# Build the full directory path for this model variant
# NOTE(review): the save and load cells write to / read from model_base_path,
# not this directory — confirm which location is intended.
model_dir_path = os.path.join(model_base_path, 'catboost_model_balanced/')

In [76]:
# Create the directory (and any missing parents) if it does not exist yet.
# exist_ok=True replaces the separate existence check, so there is no gap
# between checking and creating and the cell is safe to re-run.
os.makedirs(model_dir_path, exist_ok=True)

In [77]:
import pickle

# Save the model to a file. The context manager closes the handle (and flushes
# the data) even if pickling fails part-way.
model_filename = os.path.join(model_base_path, 'catboost_model.pkl')
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the class weights to a file
class_weights_filename = os.path.join(model_base_path, 'class_weights.pkl')
with open(class_weights_filename, 'wb') as weights_file:
    pickle.dump(class_weights_dict, weights_file)

print("Model dan class weights telah disimpan.")

Model dan class weights telah disimpan.


In [78]:
# Load the model back from disk; the context manager closes the handle promptly.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that this notebook wrote itself.
model_filename = os.path.join(model_base_path, 'catboost_model.pkl')
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load the class weights back from disk
class_weights_filename = os.path.join(model_base_path, 'class_weights.pkl')
with open(class_weights_filename, 'rb') as weights_file:
    loaded_class_weights = pickle.load(weights_file)

print("Model dan class weights telah dimuat.")

Model dan class weights telah dimuat.


# Model Inference to Submissions

In [121]:
# Cleaned, unlabeled data to predict for the submission. The path is built
# from cleanDataPath instead of repeating the absolute Drive path.
final_test = pd.read_csv(os.path.join(cleanDataPath, "Final Test for Submissions Data.csv"))

In [125]:
# Extra stopwords chosen after inspecting the word cloud
# NOTE(review): this rebinds `stopwords`, which the import cell bound to
# nltk.corpus.stopwords — the nltk corpus is unreachable under that name
# after this cell runs.
stopwords = [
    "ridwan", "kamil", "indonesia", "presiden", "pranowo",
    "pilih", "dukung", "calon", "prabowo", "ganjar",
    "kaesang", "anies", "mahfud", "baswedan", "mohammad",
    "pangarep", "jokowi"
]

def remove_stopwords(text, stopwords):
    """Remove every whitespace-separated word of ``text`` found in ``stopwords``.

    Parameters
    ----------
    text : str, None or float NaN
        Text to filter. Missing values (None / NaN, as pandas produces for
        empty cells) yield an empty string instead of raising.
    stopwords : collection of str
        Words to remove. Matching is exact and case-sensitive.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # A list/tuple costs a linear scan per word; a frozenset makes each
    # membership test constant time. Other containers are used as given.
    blocked = frozenset(stopwords) if isinstance(stopwords, (list, tuple)) else stopwords
    return ' '.join(word for word in text.split() if word not in blocked)

# Apply the helper to column clean_Text_5 and store the result as clean_text_6
final_test['clean_text_6'] = final_test['clean_Text_5'].apply(remove_stopwords, args=(stopwords,))

In [126]:
# One-column DataFrame holding the text feature the model predicts on
submission_text = final_test[['clean_text_6']]

In [127]:
# Preview the texts that will be fed to the model
submission_text

Unnamed: 0,clean_text_6
0,orang prodemokrasi negara lawan oligarki amin ...
1,hutang negeri hutang negeri hitung rendah khaw...
2,beliau sosok agung nilai nilai pancasila sadar...
3,kumpar gibran sejahtera rakyat
4,sambung junjung omong etika katai omong busuk ...
...,...
995,bikin bangga alokasi belanja perintah badan us...
996,rangkul utuh rakyat pecah damai kalah ego bangsa
997,got debat serang delapan puluh triliun polusi ...
998,rembuk musyawarah gaya pimpin gubernur jawa ga...


In [137]:
# Predict class ids for the submission texts (this reuses the name of the
# hold-out predictions). The result is flattened to 1-D because
# LabelEncoder.inverse_transform expects 1-D input and emits a conversion
# warning when given a column vector.
y_pred_test = np.ravel(model.predict(submission_text))

In [138]:
# Map the predicted class ids back to the original label names
y_pred_labels = label_encoder.inverse_transform(y_pred_test)

  y = column_or_1d(y, warn=True)


In [139]:
# Distribution of the predicted labels for the submission data
Counter(y_pred_labels)

Counter({'Politik': 418,
         'Pertahanan dan Keamanan': 134,
         'Ideologi': 119,
         'Ekonomi': 158,
         'Sosial Budaya': 99,
         'Demografi': 24,
         'Sumber Daya Alam': 45,
         'Geografi': 3})

In [140]:
# Put the predicted labels into the "Kelas" column of the answer template
submissions = submissions.assign(Kelas=y_pred_labels)

In [141]:
# Write the submission file into the project's "Submissions" folder. The path
# is built from mainPath instead of repeating the absolute Drive path.
# NOTE(review): the bracketed number in the file name looks like a score that
# is typed in by hand — confirm it matches the model being submitted.
submissions.to_csv(
    os.path.join(mainPath, "Submissions", "[0.6275187883568056]-SD2024040000208.csv"),
    index=False,
)