# Library

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
### Load Data

In [2]:
# Load the preprocessed reviews and keep only the text column and the
# 'akting' label column. The explicit copy gives an independent frame, so
# adding columns to it later does not touch (or warn about) the loaded data.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df = pd.read_pickle('preprocessed_df.pkl')[['detokenize', 'akting']].copy()
df.head()

Unnamed: 0,detokenize,akting
0,the desperate hour lakewood salah cerita suara...,1
1,edisi males review singkat tonton libur dp des...,1
2,plot utama orang deserter pursuit buru wamil j...,1
3,film hereditary horror thrill midsommar gatau ...,0
4,batman manusiawi tarung nya sadis scene pursui...,0


### Vectorize

In [3]:
# Number of space-separated tokens in each review.
token_counts = df['detokenize'].map(lambda text: len(text.split(" ")))
df['sent_len'] = token_counts

# Sequence length used for padding: mean review length plus two standard
# deviations, rounded to the nearest integer.
length_cap = token_counts.mean() + 2 * token_counts.std()
max_sequence_length = np.round(length_cap).astype(int)
max_sequence_length

32

In [4]:
# Upper bound on the number of words the tokenizer keeps.
# This must be an int: with the float literal 1e5, `min(MAX_NB_WORDS, ...)`
# returns a float as soon as the dictionary reaches the cap, and a float is
# not a valid vocabulary size for the Embedding layer.
MAX_NB_WORDS = 100_000
le = LabelEncoder()

reviews = df["detokenize"]
# Encode the raw 'akting' labels as consecutive integers, then one-hot them.
label = le.fit_transform(df['akting'])
label = np.asarray(to_categorical(label))

# Fit the word index on all reviews and turn every review into a list of ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(reviews)
# pad_sequences accepts the ragged list of id lists directly, so no
# intermediate object array is needed.
review_seq = tokenizer.texts_to_sequences(reviews)
review_seq = pad_sequences(review_seq, padding='post', maxlen=max_sequence_length)

word_index = tokenizer.word_index
print(f'Dictionary Size\t: {len(word_index)}')
print(f'Example Label\t:\n{label[:5]}')

Dictionary Size	: 17718
Example Label	:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]


# Variables

In [5]:
# Width of each word vector produced by the Embedding layer.
embedding_dim = 300
# Number of rows in the embedding matrix: the dictionary size plus one
# (presumably for the reserved padding id 0 — confirm against the Tokenizer
# docs), capped at MAX_NB_WORDS. The int() cast keeps this a valid layer
# dimension even when MAX_NB_WORDS is written as a float such as 1e5.
vocab_size = int(min(MAX_NB_WORDS, len(word_index) + 1))
# Fraction of the data held out by train_test_split.
test_size_split = 0.1

# Split Data

In [6]:
# Stratified hold-out split with a fixed seed; every resulting array is
# wrapped in a TensorFlow constant, in the order train_test_split returns
# them (x_train, x_test, y_train, y_test).
x_train, x_test, y_train, y_test = map(
    tf.constant,
    train_test_split(
        review_seq,
        label,
        test_size=test_size_split,
        stratify=label,
        random_state=42,
    ),
)

print(f'Train\t| X shape: {x_train.shape}\tY shape: {y_train.shape}')
print(f'Test\t| X shape: {x_test.shape}\tY shape: {y_test.shape}')

Train	| X shape: (15522, 32)	Y shape: (15522, 3)
Test	| X shape: (1725, 32)	Y shape: (1725, 3)


# Model RNN

In [7]:
def _build_rnn_model():
    """Build and compile the stacked SimpleRNN classifier.

    Reads the notebook-level settings ``vocab_size``, ``embedding_dim`` and
    ``max_sequence_length``. Returns a compiled ``Sequential`` model with a
    3-unit softmax output.
    """
    rnn = Sequential()
    # NOTE(review): the embedding is frozen (trainable=False) and no weights
    # are passed in, so it keeps its initial values for the whole training
    # run — confirm this is the intended baseline.
    rnn.add(Embedding(vocab_size,
                      embedding_dim,
                      input_length=max_sequence_length,
                      trainable=False))
    rnn.add(SimpleRNN(units=256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=128, activation='relu', return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=64, activation='relu'))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=3, activation='softmax'))

    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return rnn


def RNN(x_train_c, y_train_c, x_test_c, y_test_c,
        checkpoint_path='rnn_baseline_akting.h5', epochs=10, batch_size=128):
    """Train the RNN baseline once and evaluate its best checkpoint.

    Parameters
    ----------
    x_train_c, y_train_c
        Training inputs and one-hot training labels passed to ``fit``.
    x_test_c, y_test_c
        Inputs and one-hot labels used both as ``validation_data`` during
        training and for the final evaluation.
    checkpoint_path
        File the best model (by ``val_accuracy``) is saved to and reloaded from.
    epochs, batch_size
        Training settings forwarded to ``fit``.

    Returns
    -------
    list
        ``[f1, precision, recall, accuracy, cm]`` where the first three are
        macro-averaged scores and ``cm`` is the confusion matrix.
    """
    # Local import: the function is called repeatedly, and clearing the Keras
    # global state first keeps old models from accumulating in memory.
    from keras import backend as K
    K.clear_session()

    rnn = _build_rnn_model()

    early_stopping = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience=3, min_delta=0.0001)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

    # NOTE(review): the evaluation split doubles as validation data, so both
    # early stopping and checkpoint selection see the data the final metrics
    # are computed on — consider a separate validation split.
    rnn.fit(x_train_c, y_train_c, epochs=epochs, batch_size=batch_size,
            validation_data=(x_test_c, y_test_c), verbose = 1,
            callbacks=[early_stopping, model_checkpoint])

    # Evaluate the checkpointed model rather than the last-epoch weights.
    model = load_model(checkpoint_path)
    y_pred = model.predict(x_test_c)
    y_pred_cat = y_pred.argmax(axis=1)
    y_test_cat = np.argmax(y_test_c, axis=1)

    cm = confusion_matrix(y_test_cat, y_pred_cat)
    # zero_division=0 yields the same 0.0 scores for classes that are never
    # predicted, without emitting an UndefinedMetricWarning on every call.
    classreport = classification_report(y_test_cat, y_pred_cat, zero_division=0)
    f1 = f1_score(y_test_cat, y_pred_cat, average="macro", zero_division=0)
    recall = recall_score(y_test_cat, y_pred_cat, average="macro", zero_division=0)
    precision = precision_score(y_test_cat, y_pred_cat, average="macro", zero_division=0)
    accuracy = accuracy_score(y_test_cat, y_pred_cat)

    print(classreport)
    print("F1 Score : ", f1)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("Accuracy : ", accuracy)

    return [f1, precision, recall, accuracy, cm]

# Running

In [8]:
# Train and evaluate the model five times on the same split. Each row of
# `hasil` is the run id (as a string, "1".."5") followed by the values
# returned by RNN.
hasil = []

for run_id in map(str, range(1, 6)):
    hasil.append([run_id, *RNN(x_train, y_train, x_test, y_test)])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.79826, saving model to rnn_baseline_akting.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.79826
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.79826
Epoch 4/10
Epoch 4: val_accuracy improved from 0.79826 to 0.87420, saving model to rnn_baseline_akting.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.87420
Epoch 6/10
Epoch 6: val_accuracy improved from 0.87420 to 0.87826, saving model to rnn_baseline_akting.h5
Epoch 7/10
Epoch 7: val_accuracy improved from 0.87826 to 0.87942, saving model to rnn_baseline_akting.h5
Epoch 8/10
Epoch 8: val_accuracy improved from 0.87942 to 0.88116, saving model to rnn_baseline_akting.h5
Epoch 9/10
Epoch 9: val_accuracy improved from 0.88116 to 0.88870, saving model to rnn_baseline_akting.h5
Epoch 10/10
Epoch 10: val_accuracy improved from 0.88870 to 0.89217, saving model to rnn_baseline_akting.h5
              precision    recall  f1-score   support

           0      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.79826, saving model to rnn_baseline_akting.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.79826
Epoch 3/10
Epoch 3: val_accuracy improved from 0.79826 to 0.87826, saving model to rnn_baseline_akting.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.87826 to 0.89275, saving model to rnn_baseline_akting.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.89275
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.89275
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.89275
Epoch 8/10
Epoch 8: val_accuracy improved from 0.89275 to 0.89449, saving model to rnn_baseline_akting.h5
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.89449
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.89449
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.94      0.95      0.95      1377
           2       0.71      0.78      0.7

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.79826, saving model to rnn_baseline_akting.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.79826
Epoch 3/10
Epoch 3: val_accuracy improved from 0.79826 to 0.87826, saving model to rnn_baseline_akting.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.87826 to 0.87884, saving model to rnn_baseline_akting.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.87884 to 0.89217, saving model to rnn_baseline_akting.h5
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.89217
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.89217
Epoch 8/10
Epoch 8: val_accuracy improved from 0.89217 to 0.89565, saving model to rnn_baseline_akting.h5
Epoch 8: early stopping
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.92      0.97      0.95      1377
           2       0.75      0.73      0.74       291

    accuracy                           0.90  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.79826, saving model to rnn_baseline_akting.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.79826
Epoch 3/10
Epoch 3: val_accuracy improved from 0.79826 to 0.85565, saving model to rnn_baseline_akting.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.85565 to 0.87652, saving model to rnn_baseline_akting.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.87652 to 0.88232, saving model to rnn_baseline_akting.h5
Epoch 6/10
Epoch 6: val_accuracy improved from 0.88232 to 0.88638, saving model to rnn_baseline_akting.h5
Epoch 7/10
Epoch 7: val_accuracy improved from 0.88638 to 0.88928, saving model to rnn_baseline_akting.h5
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.88928
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.88928
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.88928
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.79826, saving model to rnn_baseline_akting.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.79826
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.79826
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.79826
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.79826
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.79826
Epoch 6: early stopping
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.80      1.00      0.89      1377
           2       0.00      0.00      0.00       291

    accuracy                           0.80      1725
   macro avg       0.27      0.33      0.30      1725
weighted avg       0.64      0.80      0.71      1725

F1 Score :  0.29593810444874274
Precision :  0.26608695652173914
Recall :  0.3333333333333333
Accuracy :  0.7982608695652174


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Save Output

In [9]:
# Tabulate the per-run results and export them to Excel.
# NOTE(review): this rebinds `df`, replacing the review DataFrame loaded at
# the top of the notebook; re-running earlier cells afterwards will see the
# results table instead — consider a dedicated name.
result_columns = ['iterasi', 'f1', 'precision', 'recall', 'accuracy', 'cm']
df = pd.DataFrame(data=hasil, columns=result_columns)
df.to_excel('rnn_baseline_akting.xlsx')
df.head()

Unnamed: 0,iterasi,f1,precision,recall,accuracy,cm
0,1,0.556714,0.560093,0.555036,0.892174,"[[0, 35, 22], [0, 1337, 40], [0, 89, 202]]"
1,2,0.563894,0.549481,0.579493,0.894493,"[[0, 27, 30], [0, 1315, 62], [0, 63, 228]]"
2,3,0.562392,0.558789,0.566426,0.895652,"[[0, 32, 25], [0, 1332, 45], [0, 78, 213]]"
3,4,0.548133,0.565774,0.537565,0.889275,"[[0, 36, 21], [0, 1350, 27], [0, 107, 184]]"
4,5,0.295938,0.266087,0.333333,0.798261,"[[0, 57, 0], [0, 1377, 0], [0, 291, 0]]"
