# **Import Libraries & Data Loading**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re # if u want to learn regex [https://regex101.com/]
import string
import random # Random number generators - Library for generating random numbers, selecting random elements, shuffling sequences, etc.
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

pd.set_option("display.max_columns", None)
# pd.set_option("display.max_row", None)

In [None]:
!pip install sastrawi nlp-id catboost

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nlp-id
  Downloading nlp_id-0.1.15.0.tar.gz (54.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting wget==3.2 (from nlp-id)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytest==7.3.1 (from nlp-id)
  Downloading pytest-7.3.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.5/320.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:0

In [None]:
import nltk # Natural Language Toolkit - Library for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and more.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm # A Fast, Extensible Progress Bar - Library for creating progress bars to monitor the progress of iterations or tasks.
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nlp_id.lemmatizer import Lemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, BatchNormalization, Dropout, GRU, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import keras
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_score, # Precision score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total predicted positives.
    recall_score, # Recall score - Metric for evaluating classification models, measuring the ratio of correctly predicted positive observations to the total actual positives.
    f1_score, # F1 score - Harmonic mean of precision and recall, a metric for evaluating classification models.
    classification_report, # Classification report - Summary of the precision, recall, F1 score, and support for each class in a classification problem.
    accuracy_score, # Accuracy score - Metric for evaluating classification models, measuring the proportion of correct predictions to the total number of predictions.
    roc_auc_score,
    confusion_matrix,
    balanced_accuracy_score)

In [None]:
def check_duplicates(dataframe):
    """Report the fully duplicated rows of ``dataframe``.

    Prints a heading and the number of rows flagged by ``DataFrame.duplicated()``
    (all columns compared), then displays at most ten of those rows.
    Returns nothing; the frame itself is not modified.
    """
    print("Duplicate Values (Top 10):")
    is_repeat = dataframe.duplicated()
    repeated_rows = dataframe.loc[is_repeat]
    print(f"Number of Duplicate Rows: {len(repeated_rows)}")
    display(repeated_rows.head(10))

def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null cells) and 'Percent' (nulls as a percentage of
    the row count). Each is sorted in descending order before being joined.
    """
    null_mask = data.isnull()
    null_total = null_mask.sum().sort_values(ascending=False)
    null_share = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    return pd.concat([null_total, null_share], axis=1, keys=['Total', 'Percent'])

def basic_data_info(dataframe):
    """Print a three-part overview of ``dataframe``.

    Sections, each preceded by a heading and a dashed rule:
      1. Data Preview: the first rows (``head()``), shown with ``display``.
      2. General Info: the summary written by ``DataFrame.info()``.
      3. Descriptive Statistics: ``describe()`` transposed, shown with ``display``.

    Returns nothing.
    """
    print("Data Preview:")
    print("---------------------------")
    display(dataframe.head())

    print("\nGeneral Info:")
    print("---------------------------")
    # info() writes its summary to stdout itself and returns None, so wrapping it
    # in print() would add a stray "None" line after the summary.
    dataframe.info()

    print("\nDescriptive Statistics:")
    print("---------------------------")
    display(dataframe.describe().T)

# **Load Data**

In [None]:
# Project root (the path looks like a Colab-mounted Drive folder); the raw data,
# cleaned data and submission folders are all direct sub-folders of it.
mainPath = "/content/drive/MyDrive/Colab Notebooks/2. Satria Data 2024/Pengerjaan Satria Data"
dataPath, cleanDataPath, submissionsPath = (
    os.path.join(mainPath, folder)
    for folder in ("Dataset-BDC-SatriaData-2024", "Clean Dataset", "Submissions")
)

In [None]:
# Raw competition files (labelled training set, unlabelled set, answer template).
# All three are read as ';'-separated CSVs; paths are built with os.path.join,
# the same way the folder variables themselves are built.
train = pd.read_csv(os.path.join(dataPath, "dataset_penyisihan_bdc_2024.csv"), sep=";")
test = pd.read_csv(os.path.join(dataPath, "dataset_unlabeled_penyisihan_bdc_2024.csv"), sep=";")
submissions = pd.read_csv(os.path.join(dataPath, "template_jawaban_penyisihan_bdc_2024.csv"), sep=";")

# **Simple Data Exploration**

In [None]:
# Report the exact repeats first, then keep only the first occurrence of every
# row (all columns are compared).
check_duplicates(train)
train = train[~train.duplicated()]

Duplicate Values (Top 10):
Number of Duplicate Rows: 381


Unnamed: 0,text,label
57,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
104,"RT Anak Muda Indonesia, the future of this nat...",Ideologi
145,"RT Pupuk bersubsidi langka, Tim Prabowo Gibran...",Ekonomi
146,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
189,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
220,RT Abah Anies ingin mengangkat martabat petani...,Sosial Budaya
234,"RT al fatihah buat Alm. Lambang Babar Purnomo,...",Pertahanan dan Keamanan
257,RT Kapitalisme neoliberal tak cocok untuk Indo...,Ideologi
298,"RT Anak Muda Indonesia, the future of this nat...",Ideologi
350,"RT According to Prabowo, Gaza is opressed beca...",Pertahanan dan Keamanan


In [None]:
# Per-column count and percentage of missing values in the training frame.
missing_data(train)

Unnamed: 0,Total,Percent
text,0,0.0
label,0,0.0


In [None]:
# Preview, info() summary and descriptive statistics of the training frame.
basic_data_info(train)

Data Preview:
---------------------------


Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik



General Info:
---------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 4619 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4619 non-null   object
 1   label   4619 non-null   object
dtypes: object(2)
memory usage: 108.3+ KB
None

Descriptive Statistics:
---------------------------


Unnamed: 0,count,unique,top,freq
text,4619,4583,RT Abah Anies Janji Bakal Revisi UU KPK untuk ...,2
label,4619,8,Politik,2972


In [None]:
# Number of rows per target class (the recorded output is heavily skewed towards 'Politik').
train.label.value_counts()

label
Politik                    2972
Sosial Budaya               425
Ideologi                    343
Pertahanan dan Keamanan     331
Ekonomi                     310
Sumber Daya Alam            157
Demografi                    61
Geografi                     20
Name: count, dtype: int64

# **CatBoost with text feature**

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool
import time
import pickle

## Load and Split the Clean Data

In [None]:
# Load data
# Replaces the raw `train` frame with the pre-cleaned version; the path is built
# with os.path.join like the folder variables.
train = pd.read_csv(os.path.join(cleanDataPath, "Processing-Data-clean-text-4.csv"))

In [None]:
# Splitting Data
# First hold out 5% of the rows as the test split, then 10.53% of the remainder
# as the validation split. Both splits are stratified on the label and seeded.
text_column = train[['clean_text_3']]
target = train['label']

X_train_val, X_test, y_train_val, y_test = train_test_split(
    text_column, target, test_size=0.05, random_state=42, stratify=target
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1053, random_state=42, stratify=y_train_val
)

for split_caption, split_frame in (
    ('Train Size : ', X_train),
    ('Val Size   : ', X_val),
    ('Test Size  : ', X_test),
):
    print(split_caption, split_frame.shape)

Train Size :  (3925, 1)
Val Size   :  (463, 1)
Test Size  :  (231, 1)


## Encode Labels (y)

In [None]:
# Label Encoding
# The encoder is fitted on the training labels only; validation and test labels
# reuse that mapping.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_val_enc = label_encoder.transform(y_val)
y_test_enc = label_encoder.transform(y_test)

# One-Hot Encoding
# Index an identity matrix with the integer codes so that every split gets one
# column per class known to the encoder. pd.get_dummies() only creates columns for
# the codes present in the array it is given, so a split that happens to miss a
# rare class would come out narrower and misaligned with the other splits.
n_classes = len(label_encoder.classes_)
class_indicator = np.eye(n_classes, dtype=bool)
y_train_ohe = class_indicator[y_train_enc]
y_val_ohe = class_indicator[y_val_enc]
y_test_ohe = class_indicator[y_test_enc]

In [None]:
# Print the one-hot label matrix of each split.
for split_name, one_hot in (
    ('y_train', y_train_ohe),
    ('y_val', y_val_ohe),
    ('y_test', y_test_ohe),
):
    print(f'One-Hot Encoded {split_name}:')
    print(one_hot)

One-Hot Encoded y_train:
[[False False False ... False  True False]
 [False False False ... False  True False]
 [False False False ...  True False False]
 ...
 [False False False ...  True False False]
 [False False False ...  True False False]
 [False False False ...  True False False]]
One-Hot Encoded y_val:
[[False False False ...  True False False]
 [False False False ... False False  True]
 [False False False ...  True False False]
 ...
 [False False False ... False False False]
 [False False False ...  True False False]
 [False  True False ... False False False]]
One-Hot Encoded y_test:
[[False False False ...  True False False]
 [False False False ...  True False False]
 [False False False ...  True False False]
 ...
 [False False False ... False False False]
 [False False False ...  True False False]
 [False False False ...  True False False]]


In [None]:
# Print the integer-encoded labels of each split.
for split_name, encoded in (
    ('y_train', y_train_enc),
    ('y_val', y_val_enc),
    ('y_test', y_test_enc),
):
    print(f'Label Encoded {split_name}:')
    print(encoded)

Label Encoded y_train:
[6 6 5 ... 5 5 5]
Label Encoded y_val:
[5 7 5 4 5 4 5 6 4 0 5 5 5 5 5 5 5 5 5 5 5 5 6 3 5 5 3 5 5 5 6 4 6 4 5 5 5
 5 5 5 3 5 5 5 5 5 6 5 6 1 5 5 6 5 5 5 1 5 3 5 4 5 5 5 5 7 5 5 7 5 5 5 6 3
 5 5 5 5 6 5 5 6 5 7 6 5 5 5 5 5 5 1 5 4 5 5 5 5 5 5 5 5 5 5 4 5 4 3 5 5 5
 3 3 7 5 5 5 5 7 3 5 5 5 5 6 5 5 5 0 3 5 3 5 5 3 1 5 5 5 5 5 6 5 4 5 7 5 5
 5 5 4 5 4 5 7 5 5 6 5 5 3 6 5 5 5 5 1 5 5 4 5 5 1 5 5 5 1 6 5 1 6 5 5 3 3
 4 0 1 5 3 7 4 5 5 5 1 1 6 4 5 5 5 5 3 5 7 5 5 5 6 6 5 6 5 5 5 5 5 5 5 5 5
 3 5 5 5 5 5 5 5 6 5 5 7 5 1 5 3 5 3 5 5 1 0 5 0 5 3 6 5 5 5 5 5 1 6 5 2 5
 1 5 3 1 4 0 5 6 4 5 1 5 6 7 5 5 5 6 5 5 5 5 4 3 5 5 5 5 5 5 4 5 5 5 1 1 3
 5 5 5 5 4 5 7 5 5 5 5 5 1 5 6 5 5 5 3 5 5 5 5 3 5 5 5 5 5 5 5 1 5 5 5 5 5
 5 6 5 1 3 4 4 5 4 5 5 5 6 7 4 5 5 5 6 5 5 5 5 5 1 5 5 5 6 3 4 1 5 5 5 5 5
 4 4 5 3 5 4 5 6 5 5 5 3 5 5 5 5 6 5 5 6 5 3 3 5 5 5 5 5 5 2 5 5 6 5 5 5 1
 5 5 6 5 5 6 5 1 6 5 5 5 5 4 5 3 1 5 6 5 5 1 7 4 5 5 5 5 5 6 5 6 7 4 5 1 1
 3 6 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 1

## Class Weight Definition

In [None]:
from collections import Counter

def counter_of_value(lst):
    """Count how often each distinct element occurs in ``lst``.

    Returns a DataFrame with a 'Value' column (the distinct elements, in
    ascending order) and a 'Count' column (their number of occurrences),
    indexed 0..n-1.
    """
    # Tally the occurrences of every element.
    tally = Counter(lst)

    # One row per distinct value, ordered by the value itself.
    return (
        pd.DataFrame.from_dict(tally, orient='index', columns=['Count'])
        .rename_axis('Value')
        .sort_values(by="Value")
        .reset_index()
    )

In [None]:
# Tabulate how often each encoded class occurs in the train, validation and test labels.
counter_y_train_enc, counter_y_val_enc, counter_y_test_enc = map(
    counter_of_value, (y_train_enc, y_val_enc, y_test_enc)
)

In [None]:
# Occurrences of each encoded class in the training labels.
counter_y_train_enc

Unnamed: 0,Value,Count
0,0,52
1,1,264
2,2,17
3,3,292
4,4,281
5,5,2525
6,6,361
7,7,133


In [None]:
# Compute class weights
# 'balanced' weights are derived from the encoded training labels only; the dict
# maps the position of each weight in the array to its value.
train_classes = np.unique(y_train_enc)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_enc)
class_weights_dict = dict(enumerate(class_weights))
print('Class Weights:', class_weights_dict)

Class Weights: {0: 9.435096153846153, 1: 1.8584280303030303, 2: 28.860294117647058, 3: 1.680222602739726, 4: 1.7459964412811388, 5: 0.1943069306930693, 6: 1.359072022160665, 7: 3.68890977443609}


## Modeling CatBoost

In [None]:
# CatBoost parameters, shared by every fold model
catboost_params = dict(
    iterations=1000,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',
    early_stopping_rounds=100,
    use_best_model=True,
    verbose=100,
    class_weights=class_weights.tolist(),  # plain Python list instead of a NumPy array
)

In [None]:
# Train one CatBoost model per cross-validation fold
start = time.time()
y_preds = []
models = []
# One row per training sample and one column per class; each row is filled in by
# the fold in which that sample was held out.
oof_train = np.zeros((len(X_train), y_train_ohe.shape[1]))

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# The single input column is declared as a text feature for both pools.
pool_options = {'text_features': ['clean_text_3'], 'feature_names': ['clean_text_3']}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train_enc)):
    X_tr, y_tr = X_train.iloc[train_index], y_train_enc[train_index]
    X_val_fold, y_val_fold = X_train.iloc[valid_index], y_train_enc[valid_index]

    train_pool = Pool(X_tr, y_tr, **pool_options)
    valid_pool = Pool(X_val_fold, y_val_fold, **pool_options)

    # The held-out fold serves as the eval set for this fold's model.
    model = CatBoostClassifier(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool)

    # Out-of-fold probabilities for the held-out rows.
    oof_train[valid_index] = model.predict_proba(X_val_fold)

    # Probabilities of this fold's model on the test split.
    y_pred = model.predict_proba(X_test)
    y_preds.append(y_pred)
    models.append(model)

end = time.time()
print(f"Model training completed in {(end-start)/60:.2f} minutes")

0:	learn: 1.9720716	test: 1.9713145	best: 1.9713145 (0)	total: 128ms	remaining: 2m 7s
100:	learn: 1.0318756	test: 1.2107298	best: 1.2107298 (100)	total: 7.23s	remaining: 1m 4s
200:	learn: 0.8952948	test: 1.1910471	best: 1.1852453 (185)	total: 8.4s	remaining: 33.4s
300:	learn: 0.8185139	test: 1.1833219	best: 1.1795911 (266)	total: 9.41s	remaining: 21.9s
400:	learn: 0.7565740	test: 1.1733174	best: 1.1729169 (339)	total: 10.4s	remaining: 15.6s
500:	learn: 0.7094552	test: 1.1809378	best: 1.1714519 (441)	total: 11.5s	remaining: 11.4s
bestTest = 1.17145186
bestIteration = 441
Shrink model to first 442 iterations.
0:	learn: 1.9359640	test: 1.9631237	best: 1.9631237 (0)	total: 13ms	remaining: 13s
100:	learn: 1.0169602	test: 1.5760127	best: 1.5601642 (48)	total: 2.98s	remaining: 26.6s
bestTest = 1.560164236
bestIteration = 48
Shrink model to first 49 iterations.
0:	learn: 1.9621864	test: 1.9395706	best: 1.9395706 (0)	total: 13ms	remaining: 13s
100:	learn: 1.0231187	test: 1.4624771	best: 1.44897

## Model Evaluation

In [None]:
# Evaluate every fold model on the fold it was validated on
val_accs = []
train_accs = []
conf_matrices = []
class_reports = []
balanced_accs = []
f1_scores = []
recalls = []
precisions = []

# Fixing the label set gives every confusion matrix the same shape even when a
# rare class is absent from a fold, so the matrices can be averaged afterwards.
all_class_ids = np.unique(y_train_enc)

# `cv` was created with shuffle=True and an integer random_state, so calling
# split() again yields the same folds as during training; zip() therefore pairs
# each model with its own train/validation indices.
for model, (train_index, valid_index) in zip(models, cv.split(X_train, y_train_enc)):
    X_tr, X_val_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val_fold = y_train_enc[train_index], y_train_enc[valid_index]

    y_val_pred = model.predict(X_val_fold)
    y_tr_pred = model.predict(X_tr)

    val_acc = accuracy_score(y_val_fold, y_val_pred)
    train_acc = accuracy_score(y_tr, y_tr_pred)

    conf_matrix = confusion_matrix(y_val_fold, y_val_pred, labels=all_class_ids)
    # zero_division=0 yields the same value sklearn already falls back to when a
    # class has no predicted/true samples, but without a warning on every call.
    class_report = classification_report(y_val_fold, y_val_pred, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_val_fold, y_val_pred)
    f1 = f1_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
    precision = precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)

    val_accs.append(val_acc)
    train_accs.append(train_acc)
    conf_matrices.append(conf_matrix)
    class_reports.append(class_report)
    balanced_accs.append(balanced_acc)
    f1_scores.append(f1)
    recalls.append(recall)
    precisions.append(precision)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Show the evaluation summary (means over the cross-validation folds)
print("Average Validation Accuracy:", np.mean(val_accs))
print("Average Train Accuracy:", np.mean(train_accs))
print("Average Balanced Accuracy:", np.mean(balanced_accs))
print("Average F1-score:", np.mean(f1_scores))
print("Average Recall:", np.mean(recalls))
print("Average Precision:", np.mean(precisions))

print("Average Confusion Matrix:")
print(np.mean(conf_matrices, axis=0))

# classification_report() returns formatted text, which cannot be averaged, so
# the reports are printed one per fold under a heading that says so.
print("Classification Report per Fold:")
for fold_id, fold_report in enumerate(class_reports):
    print(f"Fold {fold_id}:")
    print(fold_report)

# Show the out-of-fold predictions
print("Out-of-Fold Predictions:")
print(oof_train)

# Calculate global ROC AUC from the out-of-fold probabilities against the
# one-hot training labels
roc_auc = roc_auc_score(y_train_ohe, oof_train, multi_class='ovr', average='weighted')
print(f'Global ROC AUC: {roc_auc}')

roc_auc_micro = roc_auc_score(y_train_ohe, oof_train, multi_class='ovr', average='micro')
roc_auc_macro = roc_auc_score(y_train_ohe, oof_train, multi_class='ovr', average='macro')
print(f'Micro-average ROC AUC: {roc_auc_micro}')
print(f'Macro-average ROC AUC: {roc_auc_macro}')

Average Validation Accuracy: 0.6079023212338371
Average Train Accuracy: 0.7320603521753639
Average Balanced Accuracy: 0.5234656697615844
Average F1-score: 0.6331170291048862
Average Recall: 0.6079023212338371
Average Precision: 0.6974618004200894
Average Confusion Matrix:
[[1.800e+00 0.000e+00 0.000e+00 3.000e-01 0.000e+00 1.300e+00 1.300e+00
  5.000e-01]
 [0.000e+00 1.910e+01 0.000e+00 1.000e-01 4.000e-01 2.600e+00 2.300e+00
  1.900e+00]
 [0.000e+00 0.000e+00 5.000e-01 1.000e-01 0.000e+00 6.000e-01 2.000e-01
  3.000e-01]
 [1.200e+00 7.000e-01 1.000e-01 1.520e+01 6.000e-01 7.900e+00 2.500e+00
  1.000e+00]
 [4.000e-01 3.000e-01 0.000e+00 6.000e-01 2.070e+01 3.400e+00 2.100e+00
  6.000e-01]
 [7.200e+00 1.250e+01 1.000e+00 2.030e+01 1.720e+01 1.572e+02 2.620e+01
  1.090e+01]
 [1.700e+00 1.700e+00 6.000e-01 1.500e+00 1.700e+00 7.600e+00 1.840e+01
  2.900e+00]
 [5.000e-01 1.900e+00 5.000e-01 1.000e-01 4.000e-01 2.200e+00 2.000e+00
  5.700e+00]]
Average Classification Report:
              p

# Saving and Loading the Models

In [None]:
# Save models, out-of-fold predictions, and class weights
def save_artifacts(models, oof_train, class_weights_dict, model_base_path):
    """Persist the fold models, the out-of-fold predictions and the class weights.

    Files written inside ``model_base_path`` (created if it does not exist):
      * ``catboost_model_fold_<i>.pkl``: one pickle per entry of ``models``,
        numbered by its position in the list
      * ``oof_train.csv``: ``oof_train`` as comma-separated text
      * ``class_weights.pkl``: pickle of ``class_weights_dict``
    """
    def _pickle_to(path, payload):
        # Serialise a single object to `path`.
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    os.makedirs(model_base_path, exist_ok=True)

    # One file per fold model
    for fold_no, fold_model in enumerate(models):
        _pickle_to(f"{model_base_path}/catboost_model_fold_{fold_no}.pkl", fold_model)

    # Out-of-fold predictions as plain text
    np.savetxt(f"{model_base_path}/oof_train.csv", oof_train, delimiter=',')

    # Class weights
    _pickle_to(f"{model_base_path}/class_weights.pkl", class_weights_dict)

In [None]:
# Load models, out-of-fold predictions, and class weights
def load_artifacts(num_models, model_base_path):
    """Read back the files written by ``save_artifacts``.

    Loads ``catboost_model_fold_0.pkl`` .. ``catboost_model_fold_<num_models-1>.pkl``,
    ``oof_train.csv`` and ``class_weights.pkl`` from ``model_base_path``.

    Returns a tuple ``(models, oof_train, class_weights_dict)``.

    Note: the model and weight files are unpickled, and unpickling can execute
    arbitrary code, so only point this at files you created yourself.
    """
    def _unpickle(path):
        # Deserialise a single object from `path`.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    models = [
        _unpickle(f"{model_base_path}/catboost_model_fold_{idx}.pkl")
        for idx in range(num_models)
    ]
    oof_train = np.loadtxt(f"{model_base_path}/oof_train.csv", delimiter=',')
    class_weights_dict = _unpickle(f"{model_base_path}/class_weights.pkl")

    return models, oof_train, class_weights_dict

In [None]:
# Example path used for saving and loading the models
model_base_path = os.path.join(mainPath, "Model_Trained/Catboost_Model")

In [None]:
# Save artifacts after training: every fold model, the out-of-fold prediction
# matrix and the class-weight dict are written under model_base_path.
save_artifacts(models, oof_train, class_weights_dict, model_base_path)

In [None]:
# Load artifacts
# One file is written per fold model, so load exactly as many as were saved.
# Cross-validation uses 10 folds; a hard-coded 5 would silently drop half of
# the ensemble.
loaded_models, loaded_oof_train, loaded_class_weights = load_artifacts(len(models), model_base_path)

# Predict on the Test Split

In [None]:
import numpy as np

# Average the class probabilities predicted by all models
def predict_with_loaded_models(models, X_test):
    """Return the mean of ``predict_proba(X_test)`` over ``models``.

    The result has one row per row of ``X_test`` and one column per entry of
    the first model's ``classes_``. Each model contributes its probabilities
    divided by the number of models. No class labels are derived here.
    """
    n_models = len(models)
    n_rows = X_test.shape[0]
    n_classes = len(models[0].classes_)

    averaged = np.zeros((n_rows, n_classes))
    for fold_model in models:
        averaged += fold_model.predict_proba(X_test) / n_models
    return averaged

In [None]:
# Average the loaded models' probabilities on the held-out test split and take
# the most probable class of every row.
y_pred_proba = predict_with_loaded_models(loaded_models, X_test)
y_pred_indices = y_pred_proba.argmax(axis=1)

# Map the class indices back to the original label names
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

# Show the predictions
for caption, values in (
    ("Predicted probabilities:", y_pred_proba),
    ("Predicted class indices:", y_pred_indices),
    ("Predicted class labels:", y_pred_labels),
):
    print(caption, values)

Predicted probabilities: [[0.11597535 0.04952839 0.04909565 ... 0.38557923 0.11852996 0.0616654 ]
 [0.11245506 0.04176723 0.06281841 ... 0.38374108 0.09487796 0.05140884]
 [0.14354886 0.0794118  0.05885153 ... 0.11411064 0.31213895 0.13015763]
 ...
 [0.01476337 0.01445453 0.00699064 ... 0.04300016 0.03679253 0.02331172]
 [0.10781529 0.04573287 0.05105667 ... 0.24484451 0.09560803 0.06558173]
 [0.09928381 0.04887496 0.08093339 ... 0.37146278 0.12706164 0.06544911]]
Predicted class indices: [5 5 6 5 5 5 3 5 1 6 3 5 5 5 1 4 3 5 5 5 5 5 5 7 5 3 5 5 6 6 1 7 5 5 6 6 3
 1 6 5 0 5 6 6 7 5 4 3 1 3 5 1 7 4 5 3 6 5 7 5 5 1 5 5 5 6 5 1 7 1 5 5 0 6
 4 4 3 1 5 0 5 7 5 5 5 5 5 4 2 5 5 6 5 6 1 5 6 5 7 5 5 3 5 5 3 5 1 6 5 5 5
 5 3 5 0 5 5 5 7 3 1 4 3 4 3 1 4 5 6 6 7 1 5 5 0 5 5 5 6 0 5 5 5 7 5 3 5 5
 1 7 6 6 5 5 1 5 5 4 7 1 1 0 5 1 4 7 6 5 1 7 5 6 6 5 5 4 5 5 5 5 5 3 4 1 5
 1 1 1 1 1 5 3 4 3 3 5 5 0 5 5 6 1 4 5 6 5 7 3 5 7 5 5 3 5 5 6 6 4 5 7 5 3
 5 6 5 5 5 5 4 3 5]
Predicted class labels: ['Politik' '

In [None]:
# Balanced accuracy of the ensemble's labels against the held-out test labels.
# NOTE(review): y_pred_labels is reassigned further down by the submission
# inference cell, so this cell must run before that one to score the test split.
balanced_acc = balanced_accuracy_score(y_test, y_pred_labels)
balanced_acc

0.4521549475494896

In [None]:
from collections import Counter
# Frequency of each predicted label.
# NOTE(review): the recorded output sums to 1000 predictions while X_test has 231
# rows, which suggests this cell was last run after y_pred_labels had been
# overwritten by the submission inference cell. Re-run in order to confirm.
label_counts = Counter(y_pred_labels)
label_counts

Counter({'Politik': 244,
         'Ekonomi': 88,
         'Ideologi': 38,
         'Sumber Daya Alam': 195,
         'Sosial Budaya': 153,
         'Geografi': 137,
         'Demografi': 13,
         'Pertahanan dan Keamanan': 132})

# Model Inference for the Submission

In [None]:
# Inspect the held-out test features (a single 'clean_text_3' column).
X_test

Unnamed: 0,clean_text_3
3371,eks panglima gam aceh labuh prabowo deklarasi ...
1161,prabowo jabar semangat maju bangsa berbuatbaik...
4543,kaget prabowo jabat tangan warga blora rasa en...
2927,anies baswedan pimpin rindu rakyat solusi pj h...
3252,anies tinggal ahy pilih cak imin isi kepala pe...
...,...
2143,sebar luas pelosok negeri ngeri banget inidika...
1816,sinyal kuat prabowo subianto putus cuti dukung...
2160,januari serang kubu dkk sengit doxxing akun ak...
1333,prabowo gibran pimpin penuh dedikasi komitmen ...


In [None]:
# Run the ensemble on the unlabelled competition data. The models were fitted on
# a column named 'clean_text_3', so the 'Text' column is exposed under that name.
# NOTE(review): `test` is read from the raw CSV earlier in this notebook, while
# the models were trained on the pre-cleaned text. Confirm that the same cleaning
# has been applied to `test` before this cell runs.
submission_features = test.rename(columns={"Text": "clean_text_3"})["clean_text_3"].to_frame()
y_pred_proba = predict_with_loaded_models(loaded_models, submission_features)
y_pred_indices = y_pred_proba.argmax(axis=1)

# Map the class indices back to the original label names
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

In [None]:
# Frequency of each predicted label for the submission data.
Counter(y_pred_labels)

Counter({'Politik': 244,
         'Ekonomi': 88,
         'Ideologi': 38,
         'Sumber Daya Alam': 195,
         'Sosial Budaya': 153,
         'Geografi': 137,
         'Demografi': 13,
         'Pertahanan dan Keamanan': 132})

In [None]:
# Store the predicted labels in the 'Kelas' column of the answer template.
submissions["Kelas"] = y_pred_labels

In [None]:
# Make sure the output folder exists (to_csv does not create it), then write the
# submission file without the index column.
os.makedirs(submissionsPath, exist_ok=True)
submissions.to_csv(os.path.join(submissionsPath, "[Catboost-Clean_Text_4]SD2024040000208.csv"), index=False)