In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import backend as K
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import train_test_split

import os
import sys
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

In [2]:
# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [3]:
def get_paddedsequences(sentences, tokenizer):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

def get_ds(padsequences, labels):
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    ds = ds.cache()
    ds = ds.batch(32)
    return ds.prefetch(tf.data.AUTOTUNE)

In [4]:
def get_model():
    model = tf.keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),
        
        keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
        
        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),
        
        keras.layers.Dense(8, activation='softmax')
    ])
    
    model.compile(optimizer = tf.keras.optimizers.Adam(0.001), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])
    return model

In [5]:
# input Data
df_handled =  pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords_Handled.csv").iloc[:,1:]
df_unhandled = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

### Unhandled

In [8]:
label_encoder = LabelEncoder()
df_unhandled['label'] = label_encoder.fit_transform(df_unhandled['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [9]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_unhandled["text"], df_unhandled["label"], test_size=0.2, random_state=42)

In [19]:
#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

training_pad_sequences=get_paddedsequences(train_sentences, tokenizer)
validation_pad_sequences=get_paddedsequences(val_sentences, tokenizer)

In [20]:
train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [23]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x170a38444f0>

In [24]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.4359616264333467

### Handled + SMOTE

In [18]:
label_encoder = LabelEncoder()
df_handled['label'] = label_encoder.fit_transform(df_handled['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7}


In [19]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_handled["text"], df_handled["label"], test_size=0.2, random_state=42)

In [20]:
#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

training_pad_sequences=get_paddedsequences(train_sentences, tokenizer)
validation_pad_sequences=get_paddedsequences(val_sentences, tokenizer)

In [21]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(training_pad_sequences, train_labels)

In [25]:
train_ds=get_ds(X_resampled, y_resampled)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [26]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ac11b0b8e0>

In [27]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.12083333333333333

### Handled

In [7]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_handled["text"], df_handled["label"], test_size=0.2, random_state=42)

In [8]:
#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

training_pad_sequences=get_paddedsequences(train_sentences, tokenizer)
validation_pad_sequences=get_paddedsequences(val_sentences, tokenizer)

In [9]:
train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [16]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1abe3d679d0>

In [17]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.27259712157809984