# Library

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import FastText

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
### Load Data

In [2]:
df = pd.read_pickle('preprocessed_df.pkl')
df = pd.DataFrame(df[['detokenize','direktor']])
df.head()

Unnamed: 0,detokenize,direktor
0,the desperate hour lakewood salah cerita suara...,1
1,edisi males review singkat tonton libur dp des...,0
2,plot utama orang deserter pursuit buru wamil j...,0
3,film hereditary horror thrill midsommar gatau ...,0
4,batman manusiawi tarung nya sadis scene pursui...,0


In [3]:
reviews = df['detokenize'].astype(str)
label = pd.get_dummies(df['direktor']).values

# Variables

In [4]:
MAX_FEATURES = 1000
test_size_split = 0.2

# Split Data

In [5]:
x_train, x_test, y_train, y_test = (
    train_test_split(reviews, 
                     label, 
                     test_size=test_size_split, 
                     stratify = label, 
                     random_state=42)
)

print(f'Train\t| X shape: {x_train.shape}\tY shape: {y_train.shape}')
print(f'Test\t| X shape: {x_test.shape}\tY shape: {y_test.shape}')

Train	| X shape: (13797,)	Y shape: (13797, 3)
Test	| X shape: (3450,)	Y shape: (3450, 3)


# Feature Extraction

In [6]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features = MAX_FEATURES)
vectorizer.fit(x_train)

x_train_tfidf = vectorizer.fit_transform(x_train).todense()
x_test_tfidf = vectorizer.transform(x_test).todense()

# Feature Expansion

In [7]:
# Load trained fasttext model
model_exp = FastText.load('fasttext_news_tweet.model').wv

In [8]:
def feature_expansion(df,feature):
    for col in tqdm(df.columns): #loop per kolom
        try: 
            sim_word = model_exp.similar_by_word(col, topn = 20) #Mencari similarity berdasarkan nilai n
        except:
            sim_word = []
        if sim_word != []: #jika similarity tidak kosong
            for term in [sim_word[i][0] for i in range(len(sim_word))]: #loop per-word yang ada di Similarity
                if term in feature:
                    #untuk semua kolom yang mempunyai nilai 0 di kolom, tetapi mempunyai nilai yang bukan 0 pada kolom term
                    #nilainya diganti dengan nilai kolom term yang mempunyai nilai bukan 0
                    df[col][(df[col]==0) & (df[term]!=0)] = df[term][(df[col]==0) & (df[term]!=0)]
    return df

In [9]:
feature_ext = vectorizer.get_feature_names_out()

df_x_train = pd.DataFrame(x_train_tfidf, columns= feature_ext)
df_x_test = pd.DataFrame(x_test_tfidf, columns= feature_ext)

df_exp_x_train = feature_expansion(df_x_train, feature_ext)
df_exp_x_test = feature_expansion(df_x_test, feature_ext)

x_train = np.expand_dims(df_exp_x_train, 1)
x_test = np.expand_dims(df_exp_x_test, 1)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 72.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 75.28it/s]


# Model RNN

In [10]:
def RNN(x_train_c, y_train_c, x_test_c, y_test_c):
    rnn = Sequential()
    rnn.add(SimpleRNN(units=256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=128, activation='relu', return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=64, activation='relu'))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=3, activation='softmax'))
    
    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    rnn.build(x_train_c.shape)
    
    early_stopping = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience=3, min_delta=0.0001)
    model_checkpoint = ModelCheckpoint('rnn_tfidf_ft_direktor.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)
    
    history = rnn.fit(x_train_c, y_train_c, epochs= 10, batch_size=128, 
                      validation_data=(x_test_c, y_test_c), verbose = 1,
                      callbacks=[early_stopping, model_checkpoint])
    
    model = load_model('rnn_tfidf_ft_direktor.h5')
    y_pred = model.predict(x_test_c)
    y_pred_cat = y_pred.argmax(axis=1)
    y_test_cat = np.argmax(y_test_c, axis=1)
   
    cm = confusion_matrix(y_test_cat, y_pred_cat)
    classreport = classification_report(y_test_cat, y_pred_cat)
    f1 = f1_score(y_test_cat, y_pred_cat,  average="macro")
    recall = recall_score(y_test_cat, y_pred_cat, average="macro")
    precision = precision_score(y_test_cat, y_pred_cat, average="macro")
    accuracy = accuracy_score(y_test_cat, y_pred_cat)

    print(classreport)
    print("F1 Score : ", f1)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("Accuracy : ", accuracy)

    return [f1, precision, recall, accuracy, cm]

# Running

In [11]:
hasil = []

for i in range(1, 6):
    hasil.append([str(i)] + RNN(x_train, y_train, x_test, y_test))

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.85536, saving model to rnn_tfidf_ft_direktor.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.85536 to 0.87217, saving model to rnn_tfidf_ft_direktor.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.87217 to 0.87304, saving model to rnn_tfidf_ft_direktor.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.87304 to 0.87420, saving model to rnn_tfidf_ft_direktor.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.87420
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.87420
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.87420
Epoch 7: early stopping
              precision    recall  f1-score   support

           0       0.31      0.03      0.06       117
           1       0.90      0.97      0.93      2951
           2       0.61      0.39      0.48       382

    accuracy                           0.87      3450
   macro avg       0.61      0.46      0.49      3450
weighted avg       0.84      0.87  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.85536, saving model to rnn_tfidf_ft_direktor.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.85536 to 0.86667, saving model to rnn_tfidf_ft_direktor.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.86667 to 0.87652, saving model to rnn_tfidf_ft_direktor.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.87652 to 0.87971, saving model to rnn_tfidf_ft_direktor.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.87971
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.87971
Epoch 6: early stopping
              precision    recall  f1-score   support

           0       0.71      0.04      0.08       117
           1       0.89      0.98      0.94      2951
           2       0.67      0.35      0.46       382

    accuracy                           0.88      3450
   macro avg       0.76      0.46      0.49      3450
weighted avg       0.86      0.88      0.85      3450

F1 Score :  0.49041993195701034
Precision 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.85536, saving model to rnn_tfidf_ft_direktor.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.85536 to 0.86957, saving model to rnn_tfidf_ft_direktor.h5
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.86957
Epoch 4/10
Epoch 4: val_accuracy improved from 0.86957 to 0.87913, saving model to rnn_tfidf_ft_direktor.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.87913
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.87913
Epoch 6: early stopping
              precision    recall  f1-score   support

           0       0.75      0.03      0.05       117
           1       0.89      0.98      0.93      2951
           2       0.68      0.34      0.45       382

    accuracy                           0.88      3450
   macro avg       0.77      0.45      0.48      3450
weighted avg       0.86      0.88      0.85      3450

F1 Score :  0.47804016049904297
Precision :  0.7749830580227851
Recall :  0.448038339612

# Save Output

In [12]:
df = pd.DataFrame(hasil, columns =['iterasi', 'f1', 'precision', 'recall', 'accuracy', 'cm'])
#df.to_excel('rnn_tfidf_ft_direktor.xlsx')
df.head()

Unnamed: 0,iterasi,f1,precision,recall,accuracy,cm
0,1,0.489499,0.605973,0.464047,0.874203,"[[4, 101, 12], [6, 2864, 81], [3, 231, 148]]"
1,2,0.452449,0.518384,0.431525,0.875362,"[[0, 108, 9], [0, 2901, 50], [0, 263, 119]]"
2,3,0.49042,0.758006,0.456775,0.87971,"[[5, 98, 14], [1, 2898, 52], [1, 249, 132]]"
3,4,0.451902,0.548522,0.427255,0.88,"[[0, 108, 9], [0, 2925, 26], [0, 271, 111]]"
4,5,0.47804,0.774983,0.448038,0.87913,"[[3, 103, 11], [1, 2902, 48], [0, 254, 128]]"


In [13]:
print("Accuracy : %.2f" % (df['accuracy'].mean()*100))
print("F1-Score : %.2f" % (df['f1'].mean()*100))

Accuracy : 87.77
F1-Score : 47.25
