# Library

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
### Load Data

In [2]:
df = pd.read_pickle(r'C:\Users\HP Victus 16\Documents\TA_Code\Preprocessing\preprocessed_df.pkl')
df = pd.DataFrame(df[['detokenize','plot']])
df.head()

Unnamed: 0,detokenize,plot
0,the desperate hour lakewood salah cerita suara...,1
1,edisi males review singkat tonton libur dp des...,1
2,plot utama orang deserter pursuit buru wamil j...,1
3,film hereditary horror thrill midsommar gatau ...,0
4,batman manusiawi tarung nya sadis scene pursui...,1


In [3]:
reviews = df['detokenize'].astype(str)
label = pd.get_dummies(df['plot']).values

# Variables

In [4]:
#MAX_FEATURES = 10000
test_size_split = 0.1

# Split Data

In [5]:
x_train, x_test, y_train, y_test = (
    train_test_split(reviews, 
                     label, 
                     test_size=test_size_split, 
                     stratify = label, 
                     random_state=42)
)

print(f'Train\t| X shape: {x_train.shape}\tY shape: {y_train.shape}')
print(f'Test\t| X shape: {x_test.shape}\tY shape: {y_test.shape}')

Train	| X shape: (15522,)	Y shape: (15522, 3)
Test	| X shape: (1725,)	Y shape: (1725, 3)


# Feature Extraction

In [6]:
#vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features = MAX_FEATURES)
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(x_train)
x_train_tfidf = vectorizer.fit_transform(x_train).todense()
x_test_tfidf = vectorizer.transform(x_test).todense()

# Matrix to Array
x_train = np.squeeze(np.asarray(x_train_tfidf))
x_train = x_train.reshape(-1, 1, x_train_tfidf.shape[1])

# Matrix to Array
x_test = np.squeeze(np.asarray(x_test_tfidf))
x_test = x_test.reshape(-1, 1, x_test_tfidf.shape[1])

# Model RNN

In [7]:
def RNN(x_train_c, y_train_c, x_test_c, y_test_c):
    rnn = Sequential()
    rnn.add(SimpleRNN(units=256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=128, activation='relu', return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=64, activation='relu'))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=3, activation='softmax'))
    
    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    rnn.build(x_train_c.shape)
    
    early_stopping = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience=3, min_delta=0.0001)
    model_checkpoint = ModelCheckpoint('rnn_tfidf_plot.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)
    
    history = rnn.fit(x_train_c, y_train_c, epochs= 10, batch_size=128, 
                      validation_data=(x_test_c, y_test_c), verbose = 1,
                      callbacks=[early_stopping, model_checkpoint])
    
    model = load_model('rnn_tfidf_plot.h5')
    y_pred = model.predict(x_test_c)
    y_pred_cat = y_pred.argmax(axis=1)
    y_test_cat = np.argmax(y_test_c, axis=1)
   
    cm = confusion_matrix(y_test_cat, y_pred_cat)
    classreport = classification_report(y_test_cat, y_pred_cat)
    f1 = f1_score(y_test_cat, y_pred_cat,  average="macro")
    recall = recall_score(y_test_cat, y_pred_cat, average="macro")
    precision = precision_score(y_test_cat, y_pred_cat, average="macro")
    accuracy = accuracy_score(y_test_cat, y_pred_cat)

    print(classreport)
    print("F1 Score : ", f1)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("Accuracy : ", accuracy)

    return [f1, precision, recall, accuracy, cm]

# Running

In [8]:
hasil = []

for i in range(1, 6):
    hasil.append([str(i)] + RNN(x_train, y_train, x_test, y_test))

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.70087, saving model to rnn_tfidf_plot.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.70087
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.70087
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.70087
Epoch 4: early stopping
              precision    recall  f1-score   support

           0       0.66      0.57      0.61       368
           1       0.71      0.74      0.73       657
           2       0.71      0.73      0.72       700

    accuracy                           0.70      1725
   macro avg       0.69      0.68      0.69      1725
weighted avg       0.70      0.70      0.70      1725

F1 Score :  0.6854452340484434
Precision :  0.6934544458990577
Recall :  0.6802504010008477
Accuracy :  0.7008695652173913
Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.69391, saving model to rnn_tfidf_plot.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.69391
Epoch 3/10
Epoch 3: val_accu

MemoryError: Unable to allocate 8.83 GiB for an array with shape (15522, 1, 152719) and data type float32

# Save Output

In [None]:
df = pd.DataFrame(hasil, columns =['iterasi', 'f1', 'precision', 'recall', 'accuracy', 'cm'])
#df.to_excel('rnn_tfidf_plot.xlsx')
df.head()

In [None]:
print("Accuracy : %.2f" % (df['accuracy'].mean()*100))
print("F1-Score : %.2f" % (df['f1'].mean()*100))