# Library

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import FastText

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
## Load Data

In [2]:
# Read the preprocessed corpus from disk, then keep only the two columns this
# notebook works with: the detokenized review text and the 'akting' label.
# NOTE(review): read_pickle executes whatever the pickle contains -- only load
# files from a trusted source.
preprocessed = pd.read_pickle(r'C:\Users\HP Victus 16\Documents\TA_Code\Preprocessing\preprocessed_df.pkl')
df = pd.DataFrame(preprocessed[['detokenize','akting']])
df.head()

Unnamed: 0,detokenize,akting
0,the desperate hour lakewood salah cerita suara...,1
1,edisi males review singkat tonton libur dp des...,1
2,plot utama orang deserter pursuit buru wamil j...,1
3,film hereditary horror thrill midsommar gatau ...,0
4,batman manusiawi tarung nya sadis scene pursui...,0


In [3]:
# Model input: the detokenized review text, coerced to plain strings.
review_text = df['detokenize']
reviews = review_text.astype(str)

# Model target: one indicator column per distinct 'akting' value, taken as the
# underlying NumPy array.
akting_onehot = pd.get_dummies(df['akting'])
label = akting_onehot.values

# Variables

In [4]:
# Upper bound on the number of TF-IDF features kept by the vectorizer.
MAX_FEATURES = 1_000

# Fraction of the samples held out as the test set.
test_size_split = 0.30

# Split Data

In [5]:
# Hold-out split: stratified on the one-hot label so both partitions keep the
# same class proportions; the fixed random_state makes the split repeatable.
split_options = dict(test_size=test_size_split, stratify=label, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(reviews, label, **split_options)

# Report the size of each partition.
print(f'Train\t| X shape: {x_train.shape}\tY shape: {y_train.shape}')
print(f'Test\t| X shape: {x_test.shape}\tY shape: {y_test.shape}')

Train	| X shape: (12072,)	Y shape: (12072, 3)
Test	| X shape: (5175,)	Y shape: (5175, 3)


# Feature Extraction

In [6]:
# TF-IDF over unigrams and bigrams, keeping at most MAX_FEATURES terms.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features = MAX_FEATURES)

# fit_transform learns the vocabulary/IDF weights from the training reviews and
# vectorises them in one pass, so a separate fit() beforehand is redundant work.
# toarray() yields a regular ndarray; todense() would return np.matrix, whose
# use NumPy discourages.
x_train_tfidf = vectorizer.fit_transform(x_train).toarray()

# The test reviews are only transformed with the vocabulary learned above.
x_test_tfidf = vectorizer.transform(x_test).toarray()

# Feature Expansion

In [7]:
# Restore the previously trained FastText model from disk and keep only its
# word-vector table (.wv); that is all the similarity lookups below need.
model_exp = FastText.load(
    r'C:\Users\HP Victus 16\Documents\TA_Code\Corpus_FastText\fasttext_news_tweet.model'
).wv

In [8]:
def feature_expansion(df, feature, topn=45):
    """Fill zero-valued features with the value of a semantically similar feature.

    For every column of ``df`` the ``topn`` most similar words are looked up in
    the module-level ``model_exp`` word vectors. For each similar word that is
    itself listed in ``feature``, every row where the column is 0 but the
    similar word's column is non-zero receives the similar word's value.
    Columns are processed in order, so later columns see values already filled
    into earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with one column per term. It is modified in place.
    feature : iterable of str
        Terms eligible as expansion sources; each must also be a column of
        ``df``.
    topn : int, default 45
        Number of nearest neighbours requested per column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after expansion.
    """
    # Hash-based membership test instead of scanning the feature array for
    # every candidate neighbour.
    vocabulary = set(feature)

    for col in tqdm(df.columns):  # one pass per feature column
        try:
            # Nearest neighbours of the column's term, as (word, similarity) pairs.
            neighbours = model_exp.similar_by_word(col, topn=topn)
        except KeyError:
            # The embedding model cannot build a vector for this term: there is
            # nothing to expand with, so the column is left untouched.
            continue

        for term, _similarity in neighbours:
            if term not in vocabulary:
                continue
            # Rows where this column is empty but the similar term is present.
            fill_mask = (df[col] == 0) & (df[term] != 0)
            # Single .loc assignment: chained indexing (df[col][mask] = ...)
            # may write to a temporary copy and is a no-op under pandas
            # Copy-on-Write.
            df.loc[fill_mask, col] = df.loc[fill_mask, term]
    return df

In [9]:
# Terms learned by the TF-IDF vectorizer; they double as the column labels.
feature_ext = vectorizer.get_feature_names_out()

# Wrap both TF-IDF matrices in DataFrames so every feature is addressable by name.
df_x_train, df_x_test = (
    pd.DataFrame(tfidf_matrix, columns=feature_ext)
    for tfidf_matrix in (x_train_tfidf, x_test_tfidf)
)

# feature_expansion fills the frame it receives and returns that same object,
# so df_exp_x_* and df_x_* refer to the same (expanded) data afterwards.
df_exp_x_train = feature_expansion(df_x_train, feature_ext)
df_exp_x_test = feature_expansion(df_x_test, feature_ext)

# Insert a length-1 axis at position 1: (samples, features) -> (samples, 1, features).
# Note that this rebinds x_train / x_test, which held the raw review text until now.
x_train = np.expand_dims(df_exp_x_train, axis=1)
x_test = np.expand_dims(df_exp_x_test, axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 81.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 82.62it/s]


# RNN Model

In [10]:
def RNN(x_train_c, y_train_c, x_test_c, y_test_c,
        checkpoint_path='rnn_tfidf_ft_akting.h5', epochs=10, batch_size=128):
    """Train a three-layer SimpleRNN classifier and score it on the test data.

    The network stacks SimpleRNN layers of 256, 128 and 64 units (each followed
    by 20% dropout) and ends in a 3-unit softmax layer. The checkpoint with the
    best validation accuracy is written to ``checkpoint_path``, reloaded after
    training and used for the final predictions.

    Parameters
    ----------
    x_train_c, x_test_c : array-like
        Model inputs; everything after the first (sample) axis defines the
        network's input shape.
    y_train_c, y_test_c : array-like
        One-hot encoded targets; the class index is recovered with argmax on
        axis 1.
    checkpoint_path : str, default 'rnn_tfidf_ft_akting.h5'
        File the best model is saved to and reloaded from.
    epochs : int, default 10
        Maximum number of training epochs (early stopping may end sooner).
    batch_size : int, default 128
        Mini-batch size used for training.

    Returns
    -------
    list
        ``[f1, precision, recall, accuracy, confusion_matrix]`` where f1,
        precision and recall are macro-averaged.
    """
    rnn = Sequential()
    rnn.add(SimpleRNN(units=256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=128, activation='relu', return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=64, activation='relu'))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=3, activation='softmax'))

    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Leave the batch dimension open (None). Passing the full training shape
    # would pin the model's declared batch size to the number of training
    # samples, which matches neither the training batches nor prediction inputs.
    rnn.build((None,) + tuple(x_train_c.shape[1:]))

    # Stop once val_loss has not improved by at least min_delta for 3 epochs.
    early_stopping = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience=3, min_delta=0.0001)
    # Keep only the weights of the epoch with the highest val_accuracy.
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

    # NOTE(review): the test data doubles as validation data here, so the
    # checkpoint is selected on the same samples it is later scored on. The
    # reported metrics are therefore optimistic; consider a separate
    # validation split.
    rnn.fit(x_train_c, y_train_c, epochs=epochs, batch_size=batch_size,
            validation_data=(x_test_c, y_test_c), verbose = 1,
            callbacks=[early_stopping, model_checkpoint])

    # Score the best checkpoint, not the weights of the last epoch.
    model = load_model(checkpoint_path)
    y_pred = model.predict(x_test_c)
    y_pred_cat = y_pred.argmax(axis=1)
    y_test_cat = np.argmax(y_test_c, axis=1)

    cm = confusion_matrix(y_test_cat, y_pred_cat)
    classreport = classification_report(y_test_cat, y_pred_cat)
    f1 = f1_score(y_test_cat, y_pred_cat,  average="macro")
    recall = recall_score(y_test_cat, y_pred_cat, average="macro")
    precision = precision_score(y_test_cat, y_pred_cat, average="macro")
    accuracy = accuracy_score(y_test_cat, y_pred_cat)

    print(classreport)
    print("F1 Score : ", f1)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("Accuracy : ", accuracy)

    return [f1, precision, recall, accuracy, cm]

# Running

In [11]:
# One row per run: [run label, f1, precision, recall, accuracy, confusion matrix].
hasil = []

# Train and evaluate the model five times; runs are labelled '1' .. '5'.
for run_label in map(str, range(1, 6)):
    hasil.append([run_label, *RNN(x_train, y_train, x_test, y_test)])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.87517, saving model to rnn_tfidf_ft_akting.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.87517 to 0.88329, saving model to rnn_tfidf_ft_akting.h5
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.88329
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.88329
Epoch 5/10
Epoch 5: val_accuracy improved from 0.88329 to 0.88502, saving model to rnn_tfidf_ft_akting.h5
Epoch 5: early stopping
              precision    recall  f1-score   support

           0       0.45      0.15      0.22       171
           1       0.92      0.96      0.94      4132
           2       0.73      0.67      0.70       872

    accuracy                           0.89      5175
   macro avg       0.70      0.59      0.62      5175
weighted avg       0.87      0.89      0.88      5175

F1 Score :  0.6209972053931256
Precision :  0.7023758347488611
Recall :  0.5935261736032112
Accuracy :  0.885024154589372
Epoch 1/10
Epoch 1: val_accuracy i

# Save Output

In [12]:
# Collect the per-run metrics in a table. Note that this rebinds `df`, which
# held the loaded review data earlier in the notebook.
result_columns = ['iterasi', 'f1', 'precision', 'recall', 'accuracy', 'cm']
df = pd.DataFrame(hasil, columns=result_columns)
#df.to_excel('rnn_tfidf_ft_akting.xlsx')
df.head()

Unnamed: 0,iterasi,f1,precision,recall,accuracy,cm
0,1,0.620997,0.702376,0.593526,0.885024,"[[25, 84, 62], [14, 3967, 151], [16, 268, 588]]"
1,2,0.61468,0.726127,0.573078,0.884444,"[[25, 97, 49], [16, 4031, 85], [11, 340, 521]]"
2,3,0.61239,0.784362,0.574757,0.887729,"[[20, 99, 52], [7, 4021, 104], [3, 316, 553]]"
3,4,0.599842,0.734577,0.579244,0.886184,"[[15, 88, 68], [8, 3970, 154], [4, 267, 601]]"
4,5,0.61376,0.727121,0.585253,0.888309,"[[20, 90, 61], [10, 3990, 132], [9, 276, 587]]"


In [13]:
# Average each metric over all runs and express it as a percentage.
mean_accuracy_pct = df['accuracy'].mean()*100
mean_f1_pct = df['f1'].mean()*100

print("Accuracy : %.2f" % mean_accuracy_pct)
print("F1-Score : %.2f" % mean_f1_pct)

Accuracy : 88.63
F1-Score : 61.23
