# Dependencies

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Folder prefix prepended to the train.csv / dev.csv / test.csv file names when loading data.
# NOTE(review): this is a relative path while Drive is mounted at /content/drive,
# so it presumably only resolves when the working directory is /content — confirm.
root_path = "drive/MyDrive/IF/Sem7/NLP/2/"

In [3]:
! pip install Sastrawi
! pip install tensorflow_text
! pip install glove-python-binary
! pip install keras_tuner --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Libraries

In [4]:
import numpy as np
import pandas as pd
import keras_tuner as kt
from glove import Corpus, Glove
import tensorflow_text as tftext
from tensorflow.ragged import constant
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.metrics import Accuracy, Precision, Recall
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tensorflow.keras.layers import Embedding, Dropout, Bidirectional, LSTM, Dense, InputLayer, LeakyReLU

# Data + Preprocessing

## Reading the Data

In [5]:
def tokenize(sent):
    """Split a sentence on whitespace and keep tokens longer than one character.

    Args:
        sent: the sentence string to split.

    Returns:
        A list of the whitespace-separated tokens whose length is greater than 1.
    """
    return [word for word in sent.split() if len(word) > 1]

In [6]:
def split_dataframe(df):
    """Separate a raw dataframe into preprocessed text features and labels.

    The "text_a" column is lower-cased, passed through the Sastrawi stop-word
    remover and then tokenized with `tokenize`.

    Args:
        df: dataframe holding a "text_a" column and a "label" column.

    Returns:
        A tuple `(features, labels)`: a Series of token lists and the
        untouched "label" column.
    """
    lowered = df.loc[:, "text_a"].str.lower()

    remover = StopWordRemoverFactory().create_stop_word_remover()

    without_stopwords = lowered.apply(remover.remove)
    tokenized = without_stopwords.apply(tokenize)

    return tokenized, df.loc[:, "label"]

### Training Data

In [7]:
# Load the training split and discard the "Unnamed: 0" column stored in the CSV
df_train = pd.read_csv(f"{root_path}train.csv").drop(columns="Unnamed: 0")
df_train

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,no
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,e100ss gini buka informasi sejelas nya identit...,yes
3,neng solo wes ono terduga corona cobo neng ati...,no
4,midiahn nii akun gak takut takut nya isu coron...,no
...,...,...
21596,depok panas ga karuan kereta sampe pasming huj...,no
21597,oxfara arie kriting yg lebi goblo nya orang ke...,no
21598,virus corona menyaba depok cuci tangan makan n...,no
21599,mata sipit tinggal depok udah abis dah bahan c...,no


In [8]:
# Preprocessed (tokenized) training texts and their raw labels
X_train, y_train = split_dataframe(df_train)

### Validation Data

In [9]:
# Load the validation split. Unlike the training frame, the "Unnamed: 0" column
# is not dropped here; split_dataframe only reads "text_a" and "label".
df_val = pd.read_csv(f"{root_path}dev.csv")
df_val

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [10]:
# Preprocessed (tokenized) validation texts and their raw labels
X_val, y_val = split_dataframe(df_val)

### Test Data

In [11]:
# Load the test split.
# NOTE(review): the preview printed for this frame matches the validation
# preview row for row — confirm that test.csv is really distinct from dev.csv,
# otherwise the final evaluation is not on held-out data.
df_test = pd.read_csv(f"{root_path}test.csv")
df_test

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [12]:
# Preprocessed (tokenized) test texts and their raw labels
X_test, y_test = split_dataframe(df_test)

## EDA

In [13]:
# Average number of whitespace-separated words per raw training sentence
df_train["text_a"].map(lambda text: len(text.split())).mean()

15.61811953150317

# Glove Embeddings

Pada tugas kali ini, penulis sebelumnya berencana untuk mencoba membangun *word embeddings vector* sendiri dengan menggunakan *corpus* yang didapat dari Wikipedia Indonesia. Namun, penulis tidak bisa menemukan berkas-berkas yang dibutuhkan oleh GloVe untuk membangun *embeddings* tersebut. Selain itu, OS yang dimiliki penulis tidak kompatibel dan tidak mampu menjalankan *script* yang digunakan oleh GloVe dalam membangun *embeddings* tersebut.

Pendekatan lain yang pernah penulis lakukan sebelumnya adalah dengan membangun *embeddings* sendiri dengan menggunakan pustaka GloVe Python. Hal ini dapat dilakukan dengan menggunakan teks yang ada sebagai masukan untuk pembangunan *embeddings* dengan menggunakan kode program berikut.

In [14]:
def create_corpus(tokens, window=20):
  """Build a GloVe co-occurrence corpus from tokenized sentences.

  Args:
    tokens: the tokenized sentences handed to `Corpus.fit`.
    window: size of the context window used when counting co-occurrences,
      i.e. how many words to the left and right of a word are considered.
      The default of 20 was chosen because the average sentence length in
      this dataset is roughly 15 words.

  Returns:
    The fitted `Corpus` instance.
  """
  corpus = Corpus()

  # Populate the co-occurrence matrix and vocabulary from the sentences
  corpus.fit(tokens, window)

  return corpus

In [15]:
def embed_glove(c, num_of_components=100, lr=0.05, epochs=50, num_of_threads=30):
  """Train a GloVe model on a fitted corpus.

  Args:
    c: fitted corpus exposing `matrix` (co-occurrence matrix) and
      `dictionary` (vocabulary).
    num_of_components: dimension of the latent word vectors.
    lr: learning rate passed to the GloVe model.
    epochs: number of training epochs over the co-occurrence matrix.
    num_of_threads: number of threads used while training.

  Returns:
    The trained `Glove` model with the corpus vocabulary attached.
  """
  model = Glove(no_components=num_of_components, learning_rate=lr)

  # Learn the word vectors from the co-occurrence counts
  model.fit(c.matrix, epochs=epochs, no_threads=num_of_threads)

  # Attach the corpus vocabulary to the trained model
  model.add_dictionary(c.dictionary)

  return model

In [16]:
def flatten_tokens(lol_tokens):
    """Flatten a list of token lists into one flat list, preserving order.

    Args:
        lol_tokens: iterable whose items are themselves iterables of tokens.

    Returns:
        A single list containing every token in encounter order.
    """
    flat = []
    for sentence_tokens in lol_tokens:
        flat.extend(sentence_tokens)
    return flat

In [17]:
def construct_tokenizer(tokens, lower_text=False):
  """Fit a Keras `Tokenizer` on the given tokens and shift its indices down by one.

  Args:
    tokens: the preprocessed tokens the tokenizer is fitted on.
    lower_text: forwarded to the tokenizer's `lower` option.

  Returns:
    The fitted tokenizer, created with "<OOV>" as its out-of-vocabulary
    token, whose `word_index` values have each been decreased by 1.
  """
  tokenizer = Tokenizer(lower=lower_text, oov_token="<OOV>")
  tokenizer.fit_on_texts(tokens)

  # Decrease every index by one so the largest index stays below
  # len(word_index); this avoids an out-of-range embedding lookup.
  # NOTE(review): after the shift "<OOV>" presumably ends up at index 0, the
  # same value the model pads (and masks) with — confirm this is intended.
  vocab = tokenizer.word_index
  for word, index in list(vocab.items()):
    vocab[word] = index - 1

  return tokenizer

In [18]:
def generate_embed_matrix(embeddings, tokenizer):
  """Build the embedding matrix aligned with the tokenizer's word indices.

  Args:
    embeddings: trained embeddings exposing `no_components` (vector size),
      `dictionary` (word -> row) and `word_vectors` (row -> vector).
    tokenizer: tokenizer whose `word_index` maps each word to its row in the
      resulting matrix (the out-of-vocabulary entry is already part of it).

  Returns:
    A `(len(word_index), no_components)` numpy array. Row `i` holds the
    vector of the word with tokenizer index `i`; words missing from the
    embeddings dictionary keep an all-zero row.
  """
  vocab = tokenizer.word_index
  matrix = np.zeros((len(vocab), embeddings.no_components))

  glove_rows = embeddings.dictionary
  for token, row in vocab.items():
    source_row = glove_rows.get(token)
    if source_row is None:
      # Word unknown to the embeddings: leave its row as zeros
      continue
    matrix[row] = embeddings.word_vectors[source_row]

  return matrix

# Feature-Label Engineering

## Feature Engineering

In [19]:
def rejoin_sentences(tokens):
  """Join a sequence of tokens back into one space-separated sentence."""
  separator = ' '
  return separator.join(tokens)

In [20]:
def convert_to_seq(df, tokenizer):
    """Turn tokenized texts into integer sequences with the fitted tokenizer.

    Args:
        df: Series whose items are token lists.
        tokenizer: fitted tokenizer providing `texts_to_sequences`.

    Returns:
        The result of `tokenizer.texts_to_sequences` on the re-joined sentences.
    """
    sentences = df.apply(rejoin_sentences).to_list()
    return tokenizer.texts_to_sequences(sentences)

In [21]:
def generate_features(df, tokenizer):
    """Encode tokenized texts as a ragged tensor of word indices.

    Args:
        df: Series whose items are token lists.
        tokenizer: fitted tokenizer used to map words to indices.

    Returns:
        The value produced by `tensorflow.ragged.constant` for the sequences.
    """
    return constant(convert_to_seq(df, tokenizer))

## Label Encoding

In [22]:
def fit_encoder(df):
    """Create a `LabelEncoder` and fit it on the given labels.

    Args:
        df: the label values to learn the classes from.

    Returns:
        The fitted `LabelEncoder`.
    """
    # LabelEncoder.fit returns the fitted encoder itself
    return LabelEncoder().fit(df)

In [23]:
def generate_labels(df, encoder):
    """Encode labels with an already-fitted encoder.

    Args:
        df: the raw label values.
        encoder: fitted encoder exposing `transform`.

    Returns:
        Whatever `encoder.transform(df)` produces.
    """
    # The class count computed here previously was never used, so it is dropped.
    return encoder.transform(df)

# Modelling

In [24]:
def construct_model(hp):
    """Build and compile the BiLSTM classifier for one hyperparameter trial.

    Reads the notebook-level globals `tokenizer`, `glove` and
    `embeddings_matrix`, so those must exist before this is called.

    Args:
        hp: Keras Tuner hyperparameter object. Three values are sampled:
            "units" (int, 32..128 in steps of 32), "dropout" (0.2 or 0.4) and
            "learning_rate" (float, 1e-3..1e-2, log sampling).

    Returns:
        The compiled `Sequential` model (binary cross-entropy loss, Adam
        optimizer, accuracy/precision/recall metrics).
    """
    # Get global model properties
    vocab_size = len(tokenizer.word_index)      # Number of tokens or vocabularies
    embeddings_dim = glove.no_components        # The dimension of embeddings matrix vector

    # Hyperparameters explored by the tuner
    hp_units = hp.Int('units', min_value=32, max_value=128, step=32)
    hp_dropout = hp.Choice('dropout', values=[2e-1, 4e-1])
    hp_lr = hp.Float("learning_rate", min_value=1e-3, max_value=1e-2, sampling="log")

    # Initialize the Sequential Model
    seq_model = Sequential([
        # Ragged input layer: sequences of varying length
        InputLayer(input_shape=(None, ), ragged=True),
        # Convert the ragged batch into a dense tensor, padding with 0 and producing a mask
        # NOTE(review): index 0 is therefore treated as padding — confirm no real token uses it
        tftext.keras.layers.ToDense(pad_value=0, mask=True),
        # Initialize the Embedding layer from the precomputed embeddings matrix
        # NOTE(review): the matrix is supplied both as the initializer and via `weights`;
        # one of the two looks redundant — confirm
        Embedding(vocab_size, embeddings_dim, embeddings_initializer=Constant(embeddings_matrix), weights=[embeddings_matrix]),
        # Bidirectional LSTM
        # Bidirectional here means that the LSTM processes the sequence in both directions
        # return_sequences=True because the next LSTM layer needs the full sequence
        Bidirectional(LSTM(hp_units, return_sequences=True)),
        # Dropout Regularization
        Dropout(hp_dropout),
        # One directional LSTM layer
        LSTM(hp_units),
        # Transform the output with Leaky ReLU
        LeakyReLU(),
        # Final dense layer: one sigmoid unit with L2 kernel regularization
        Dense(1, "sigmoid", kernel_regularizer=l2(0.01))],
        name="nlp-non-ctx")
    
    seq_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=hp_lr), metrics=["accuracy", Precision(), Recall()])

    return seq_model

In [25]:
def create_tuner(max_epochs=4, factor=3, directory='models',
                 project_name='nlp-non-ctx', patience=5):
    """Create the Hyperband tuner and the early-stopping callback.

    All parameters default to the values that were previously hard-coded, so
    calling `create_tuner()` with no arguments behaves exactly as before.

    Args:
        max_epochs: `max_epochs` passed to `kt.Hyperband`.
        factor: `factor` passed to `kt.Hyperband`.
        directory: folder in which the tuner stores its results.
        project_name: sub-folder name for this search.
        patience: `patience` of the `EarlyStopping` callback on `val_loss`.

    Returns:
        A tuple `(tuner, callback)`.
    """
    tuner = kt.Hyperband(construct_model,
                         objective='val_accuracy',
                         max_epochs=max_epochs,
                         factor=factor,
                         directory=directory,
                         project_name=project_name)

    # NOTE(review): with the defaults, patience (5) exceeds max_epochs (4), so
    # this callback presumably never triggers during the search — confirm.
    cb = EarlyStopping(monitor='val_loss', patience=patience)

    return tuner, cb

In [26]:
def execute_tuning(tuner, cb, X_train, y_train, X_val, y_val, epochs=2):
    """Run the hyperparameter search and fetch the best configuration.

    Args:
        tuner: tuner exposing `search` and `get_best_hyperparameters`.
        cb: callback passed to the search.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        epochs: `epochs` argument forwarded to `tuner.search`.

    Returns:
        A tuple `(tuner, best_hps)` where `best_hps` is the first entry
        returned by `tuner.get_best_hyperparameters(num_trials=1)`.
    """
    tuner.search(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=[cb],
    )
    top_trials = tuner.get_best_hyperparameters(num_trials=1)
    return tuner, top_trials[0]

In [27]:
def show_model_summary(model):
    """Print the layer-by-layer summary of the given model."""
    model.summary()
    return None

In [28]:
def fit_train(tuner, best_hps, X_train, y_train, X_val, y_val, epochs=10):
    """Train the best configuration twice: once to find the best epoch, once for real.

    A first model is built from `best_hps` and trained for `epochs` epochs.
    The 1-based epoch with the highest `val_accuracy` is then used as the
    epoch count for a freshly built second model, which is returned.

    Args:
        tuner: tuner whose `hypermodel.build` creates a model from hyperparameters.
        best_hps: hyperparameters to build both models with.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        epochs: number of epochs for the first, exploratory run.

    Returns:
        The second model, trained for the best epoch count.
    """
    val_data = (X_val, y_val)

    # Exploratory run to find out which epoch validates best
    probe_model = tuner.hypermodel.build(best_hps)
    probe_history = probe_model.fit(X_train, y_train, epochs=epochs, validation_data=val_data)

    val_scores = probe_history.history['val_accuracy']
    best_epoch = 1 + val_scores.index(max(val_scores))

    # Fresh model, trained only up to the best epoch
    final_model = tuner.hypermodel.build(best_hps)
    final_model.fit(X_train, y_train, epochs=best_epoch, validation_data=val_data)

    return final_model

In [39]:
def eval_model(hypermodel, X_test, y_test):
    """Evaluate a trained model on the test data.

    Args:
        hypermodel: trained model exposing `evaluate`.
        X_test: test features.
        y_test: test labels.

    Returns:
        The value returned by `hypermodel.evaluate(X_test, y_test)`, so the
        scores can be displayed or reused instead of being discarded.
    """
    return hypermodel.evaluate(X_test, y_test)

# Execution

## Generate GloVe Embeddings

In [30]:
# Tokenized training sentences as a plain list of token lists
tokens = X_train.to_list()
# Co-occurrence corpus over the training sentences (default window)
corpus = create_corpus(tokens)
# GloVe vectors trained on that corpus (default hyperparameters)
glove = embed_glove(corpus)
# Tokenizer fitted on every training token
flattened_tokens = flatten_tokens(tokens)
tokenizer = construct_tokenizer(flattened_tokens)
# Embedding matrix whose rows follow the tokenizer's word indices
embeddings_matrix = generate_embed_matrix(glove, tokenizer)

## Data Generation

In [31]:
# Encode the training texts as word-index sequences and fit the label encoder
# on the training labels only.
# NOTE(review): X_train and y_train are rebound to their encoded versions, so
# re-running this cell without re-running the loading cells presumably fails — confirm.
X_train = generate_features(X_train, tokenizer)
label_encoder = fit_encoder(y_train)
y_train = generate_labels(y_train, label_encoder)

In [32]:
# Encode the validation split with the tokenizer and label encoder fitted on the training data
X_val = generate_features(X_val, tokenizer)
y_val = generate_labels(y_val, label_encoder)

In [33]:
# Encode the test split with the tokenizer and label encoder fitted on the training data
X_test = generate_features(X_test, tokenizer)
y_test = generate_labels(y_test, label_encoder)

## Hyperparameter Tuning + Training + Evaluation

In [34]:
# Build the Hyperband tuner and run the search on the training/validation data;
# best_hps holds the best hyperparameter set found.
tuner, cb = create_tuner()
tuner, best_hps = execute_tuning(tuner, cb, X_train, y_train, X_val, y_val)

Trial 10 Complete [00h 09m 31s]
val_accuracy: 0.8657143115997314

Best val_accuracy So Far: 0.866428554058075
Total elapsed time: 00h 56m 31s


In [35]:
# Train a model with the best hyperparameters; fit_train first finds the epoch
# with the highest validation accuracy, then retrains a fresh model for that many epochs.
hypermodel = fit_train(tuner, best_hps, X_train, y_train, X_val, y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
# Report the hyperparameter values selected by the search, one per line
for hp_name in ("units", "dropout", "learning_rate"):
    print(best_hps.get(hp_name))

96
0.2
0.0011980378163067161


In [40]:
# Evaluate the retrained model on the encoded test split
eval_model(hypermodel, X_test, y_test)



# References

1.   https://www.tensorflow.org/tutorials/keras/keras_tuner
2.   https://keras.io/guides/keras_tuner/getting_started/
3.   https://keras.io/api/keras_tuner/tuners/hyperband/
4.   https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e
5.   https://rifqifai.com/membuat-model-glove-dari-korpus-wikipedia-bahasa-indonesia/
6.   https://github.com/stanfordnlp/GloVe
7.   https://coderzcolumn.com/tutorials/artificial-intelligence/keras-glove-embeddings-for-text-classification
8.   https://www.kaggle.com/code/hamishdickson/bidirectional-lstm-in-keras-with-glove-embeddings/notebook
9.   https://ksnugroho.medium.com/dasar-text-preprocessing-dengan-python-a4fa52608ffe

