In [6]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import backend as K
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import train_test_split

import os
import sys

In [7]:
def create_submission(predicted, path = "submission.csv", template_path = "Data/Submission_Format.xlsx"):
    """Fill the submission template with predictions and save it as CSV.

    Parameters
    ----------
    predicted : array-like
        Values assigned to the ``label`` column of the template, one per
        template row.
    path : str
        Destination CSV file. Any missing parent directory is created first.
    template_path : str
        Excel workbook that provides the submission layout. Defaults to the
        location that used to be hard-coded in this function.
    """
    # os.path.dirname understands the platform's separators, unlike a manual
    # split on "/"; it returns '' when `path` has no directory component.
    folder_loc = os.path.dirname(path)
    if folder_loc:
        # exist_ok avoids a check-then-create race and makes re-runs harmless.
        os.makedirs(folder_loc, exist_ok=True)
    df = pd.read_excel(template_path)
    df["label"] = predicted
    df.to_csv(path, index=False)

In [14]:
# Input data: two preprocessed variants of the corpus, each loaded from CSV.
# The "handling" variant discards its first column via .iloc[:, 1:]
# (presumably an index column written by an earlier to_csv — TODO confirm).
df_handling = pd.read_csv("../Temp/Handling/nodrop_v2_translated_lemarized_stopwords.csv").iloc[:,1:]
# The "non-handling" variant is used exactly as read.
df_nonhandling = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

# TensorFlow Bi-LSTM

In [30]:
# Record the environment this notebook was run on: the Python version string
# and the last entry of TensorFlow's physical-device list.
print(f'PY version   : {sys.version}\nHardware     : {tf.config.list_physical_devices()[-1]}')

PY version   : 3.9.18 (main, Sep 11 2023, 14:09:26) [MSC v.1916 64 bit (AMD64)]
Hardware     : PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Handling

In [15]:
# Encode the class labels of the handling dataset as integers, overwriting the
# 'label' column in place.
# NOTE(review): because the column is overwritten, re-running this cell fits the
# encoder on the already-encoded values, so the printed mapping would no longer
# show the class names.
label_encoder = LabelEncoder()
df_handling['label'] = label_encoder.fit_transform(df_handling['label'])

# Print label mapping (original class value -> encoded integer)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [16]:
# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
# Shared settings for the Keras tokenizer, the padding step and the Bi-LSTM model.
vocab_size = 1000  # Tokenizer num_words and Embedding input_dim
embedding_dim = 16  # Embedding output_dim
max_length = 120  # pad_sequences maxlen and Embedding input_length
trunc_type='post'  # pad_sequences truncating mode
padding_type='post'  # pad_sequences padding mode
oov_tok = "<OOV>"  # token the Tokenizer uses for out-of-vocabulary words
training_size = 20000  # NOTE(review): not referenced anywhere in the visible notebook

In [17]:
# Hold out 20% of the handling dataset for validation (fixed seed for a repeatable split).
# NOTE(review): if the "handling" CSV contains oversampled/augmented rows, a plain
# random split may put near-duplicates on both sides and inflate validation scores — confirm.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_handling["text"], df_handling["label"], test_size=0.2, random_state=42)

In [18]:
# Keras word tokenizer limited to `vocab_size` words, with `oov_tok` standing in
# for unknown words. Its vocabulary is fit on the training sentences only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    """Convert texts to integer sequences and pad/truncate them to `max_length`.

    Uses the module-level `tokenizer` together with the module-level
    `max_length`, `padding_type` and `trunc_type` settings.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

# Turn both splits into fixed-length integer sequences for the model.
training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

In [19]:
def get_ds(padsequences, labels, batch_size=32):
    """Build a cached, batched and prefetched tf.data pipeline.

    Parameters
    ----------
    padsequences : array-like
        Padded integer sequences, sliced along the first axis.
    labels : array-like
        One label per sequence, sliced in parallel with `padsequences`.
    batch_size : int
        Number of examples per batch. Defaults to 32, the value that was
        previously hard-coded.

    Returns
    -------
    tf.data.Dataset
        Dataset yielding ``(sequences, labels)`` batches. No shuffling is
        applied, so elements keep their input order.
    """
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    # Cache before batching so individual examples are stored once.
    ds = ds.cache()
    ds = ds.batch(batch_size)
    return ds.prefetch(tf.data.AUTOTUNE)

# Wrap the padded sequences and their labels into tf.data pipelines for fit().
train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [20]:
def get_model(num_classes=8):
    """Build and compile the Bi-LSTM text classifier.

    Architecture: Embedding -> BatchNorm -> Bidirectional LSTM (64 units per
    direction, full sequence returned) -> global max pooling -> BatchNorm ->
    Dense(32) -> Dropout(0.3) -> Dense(64) -> Dropout(0.4) -> softmax.

    The embedding is sized from the module-level `vocab_size`,
    `embedding_dim` and `max_length` settings.

    Parameters
    ----------
    num_classes : int
        Number of output classes (units of the final softmax layer).
        Defaults to 8, the value that was previously hard-coded.

    Returns
    -------
    keras.Model
        A freshly initialised model compiled with Adam (learning rate 0.001),
        sparse categorical cross-entropy and the accuracy metric.
    """
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),

        # return_sequences=True keeps every timestep so the pooling layer
        # below can take the maximum over the sequence axis.
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),

        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),

        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),

        keras.layers.Dense(num_classes, activation='softmax'),
    ])

    # Sparse loss: labels are integer class ids, not one-hot vectors.
    model.compile(
        optimizer=keras.optimizers.Adam(0.001),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    return model

In [22]:
# Build a fresh model and train it for 10 epochs on the handling split,
# evaluating on val_ds during training. The History object returned by fit()
# is this cell's displayed value.
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14ff1194760>

In [23]:
# Score the handling model on its validation split: take the arg-max over the
# model outputs as the predicted class id, then compute balanced accuracy
# against the true labels.
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.9383464754103183

## Non Handling

In [24]:
# Encode the class labels of the non-handling dataset as integers, overwriting
# the 'label' column in place. This rebinds `label_encoder` and `label_mapping`
# from the handling section.
# NOTE(review): because the column is overwritten, re-running this cell fits the
# encoder on the already-encoded values, so the printed mapping would no longer
# show the class names.
label_encoder = LabelEncoder()
df_nonhandling['label'] = label_encoder.fit_transform(df_nonhandling['label'])

# Print label mapping (original class value -> encoded integer)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [25]:
# Hold out 20% of the non-handling dataset for validation (same seed as the
# handling split). This rebinds the split variables from the handling section.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_nonhandling["text"], df_nonhandling["label"], test_size=0.2, random_state=42)

In [26]:
# Pad the non-handling splits and wrap them in tf.data pipelines.
# NOTE(review): get_paddedsequences reads the module-level `tokenizer`, which was
# fit on the handling training sentences and is not refit on this split —
# confirm that sharing the vocabulary across both experiments is intended.
training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [27]:
# Build a fresh model (replacing the handling one bound to `model`) and train it
# for 10 epochs on the non-handling split, evaluating on val_ds during training.
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14fffc5a7f0>

In [29]:
# Score the non-handling model on its validation split: arg-max over the model
# outputs gives the predicted class id, then balanced accuracy is computed
# against the true labels.
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.46413441482977946

# IndoBERT

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

## Non Handling

In [None]:
# Re-read the non-handling dataset for the IndoBERT experiment. The directory is
# spelled "Temp" to match the earlier data-loading cell; the previous "TEMP"
# spelling only resolves on case-insensitive filesystems.
# Note: this fresh frame still carries the raw (un-encoded) 'label' column.
df = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

In [None]:
# Load IndoBERT tokenizer and model; the classification head gets one output per
# entry in `label_mapping`.
# NOTE: this rebinds `tokenizer` and `model`, which the Bi-LSTM section used for
# the Keras tokenizer and Keras model. get_paddedsequences reads the global
# `tokenizer`, so it will not work after this cell until the Keras tokenizer
# cell is run again.
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=len(label_mapping))

# Check if GPU is available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move model to the same device
model.to(device)

In [None]:
# Tokenize the input texts (padded to the longest text, truncated at 512 tokens)
inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
# `df` was freshly read from CSV, so its 'label' column still holds the class-name
# strings; torch.tensor cannot convert an object array. Encode with the
# LabelEncoder fitted on the same non-handling labels before building the tensor.
labels = torch.tensor(label_encoder.transform(df['label']), dtype=torch.long)

# Create a dataset class with correct label type
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Parameters
    ----------
    encodings : mapping
        Maps each field name to an indexable batch (tensor, array or list)
        whose first axis is the example index.
    labels : sequence
        One integer class id per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field plus a long 'labels' tensor for example `idx`."""
        # torch.tensor() on an existing tensor emits a copy-construct UserWarning
        # for every item. as_tensor accepts tensors, arrays and lists alike, and
        # detach().clone() keeps the original behavior of returning an independent copy.
        item = {
            key: torch.as_tensor(val[idx]).detach().clone()
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.long).detach().clone()
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

# Create the dataset from the tokenized texts and their labels
dataset = SimpleDataset(inputs, labels)

In [None]:
# Define training arguments
# Checkpoints go to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size for training
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
)

# Initialize Trainer
# Only a training dataset is supplied: the full `dataset` is used for training
# and no eval_dataset is passed to the Trainer.
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments
    train_dataset=dataset,               # Training dataset
)

# Train the model
trainer.train()

In [None]:
# Classify a single example sentence with the fine-tuned model.
# Tokenize input text (rebinds `inputs`, previously the training encodings)
text = "jokowi menerapkan kebiakan"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Move input tensors to the same device as the model
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Perform inference
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)

# Get predictions: index of the largest logit for each input row
predictions = torch.argmax(outputs.logits, dim=1)

# Map the predicted class id back to its class name; predictions are moved to
# the CPU before being handed to the scikit-learn encoder.
predicted_label = label_encoder.inverse_transform(predictions.cpu())[0]
predicted_label