# Library

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import FastText
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
### Load Data

In [2]:
# Load the preprocessed reviews and keep only the two columns this notebook
# uses: the detokenized review text and the 'akting' label.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.DataFrame(
    pd.read_pickle(
        r'C:\Users\HP Victus 16\Documents\TA_Code\Preprocessing\preprocessed_df.pkl'
    )[['detokenize', 'akting']]
)
df.head()

Unnamed: 0,detokenize,akting
0,the desperate hour lakewood salah cerita suara...,1
1,edisi males review singkat tonton libur dp des...,1
2,plot utama orang deserter pursuit buru wamil j...,1
3,film hereditary horror thrill midsommar gatau ...,0
4,batman manusiawi tarung nya sadis scene pursui...,0


In [3]:
# Separate the model input (review text) from the target column.
reviews, label = df['detokenize'], df['akting']

# Variables

In [4]:
# Upper bound on the TF-IDF vocabulary size (passed as max_features).
MAX_FEATURES = 1_000
# Fraction of the samples held out by train_test_split.
test_size_split = 0.30

# Feature Extraction

In [5]:
# TF-IDF over unigrams and bigrams, vocabulary capped at MAX_FEATURES terms.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=(1, 2))

# Reviews are cast to str before vectorizing (presumably to guard against
# non-string entries such as NaN - TODO confirm).
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split further down.
reviews_as_text = reviews.astype(str)
tfidf = vectorizer.fit_transform(reviews_as_text)

# SMOTE

In [6]:
# Oversample with SMOTE so the classes are balanced before modelling.
# A fixed seed makes the synthetic samples - and therefore every metric
# reported below - reproducible, consistent with the seeded train/test split.
# NOTE(review): resampling is done before the train/test split, so synthetic
# rows in the test set can be interpolated from rows that end up in the
# training set. Consider splitting first and fitting SMOTE on the training
# portion only.
smote = SMOTE(random_state=42)

x_smote, y_smote = smote.fit_resample(tfidf, label)

# Split Data

In [7]:
# Stratified hold-out split of the resampled data; the seed keeps it repeatable.
split_options = dict(test_size=test_size_split, stratify=y_smote, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, **split_options)

# Show the shapes of the feature matrices and label vectors.
print(f'Train\t| X shape: {x_train.shape}\tY shape: {y_train.shape}')
print(f'Test\t| X shape: {x_test.shape}\tY shape: {y_test.shape}')

Train	| X shape: (28917, 1000)	Y shape: (28917,)
Test	| X shape: (12393, 1000)	Y shape: (12393,)


# Feature Expansion

In [8]:
# Load the pre-trained FastText model and keep only its word vectors (.wv);
# feature_expansion queries them for similar words.
# NOTE(review): absolute, machine-specific path.
model_exp = FastText.load(
    r'C:\Users\HP Victus 16\Documents\TA_Code\Corpus_FastText\fasttext_news_tweet.model'
).wv

In [9]:
def feature_expansion(df, feature):
    """Fill zero cells of each term column with the weight of a similar term.

    For every column of ``df`` the 40 most similar words are looked up in the
    module-level FastText vectors ``model_exp``. Each similar word that is also
    a feature column is visited in the order returned by the lookup; rows where
    the current column is 0 and the similar column is non-zero are overwritten
    with the similar column's value.

    Parameters
    ----------
    df : pandas.DataFrame
        Document-term matrix with one column per feature name. It is modified
        in place.
    feature : iterable of str
        Names of the available feature columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after expansion.
    """
    # Set lookup instead of scanning the feature array for every candidate.
    known_terms = set(feature)

    for col in tqdm(df.columns):  # one pass per column
        try:
            # Most similar words for this column's term.
            sim_word = model_exp.similar_by_word(col, topn=40)
        except KeyError:
            # No vector can be produced for this term: nothing to expand from.
            continue

        for term, _similarity in sim_word:
            if term not in known_terms:
                continue
            # Rows that are empty for `col` but populated for the similar term.
            mask = (df[col] == 0) & (df[term] != 0)
            # One .loc assignment instead of chained indexing: chained
            # assignment is not guaranteed to write back to `df` and is a
            # silent no-op under pandas Copy-on-Write.
            df.loc[mask, col] = df.loc[mask, term]
    return df

In [10]:
# Vocabulary terms, used as column labels for the dense feature frames.
# NOTE(review): this cell rebinds x_train/x_test/y_train/y_test, so it cannot
# be re-run without first re-running the split cell.
feature_ext = vectorizer.get_feature_names_out()

# Training features: densify -> expand (feature_expansion edits the frame in
# place and returns it) -> add a length-1 axis at position 1.
df_x_train = pd.DataFrame(x_train.todense(), columns=feature_ext)
df_x_train_exp = feature_expansion(df_x_train, feature_ext)
x_train = np.expand_dims(df_x_train_exp, axis=1)

# Test features: same three steps.
df_x_test = pd.DataFrame(x_test.todense(), columns=feature_ext)
df_x_test_exp = feature_expansion(df_x_test, feature_ext)
x_test = np.expand_dims(df_x_test_exp, axis=1)

# One-hot encode the labels (one indicator column per class value).
y_train = pd.get_dummies(y_train).to_numpy()
y_test = pd.get_dummies(y_test).to_numpy()

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 75.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 80.08it/s]


# Model RNN

In [11]:
def RNN(x_train_c, y_train_c, x_test_c, y_test_c,
        checkpoint_path='rnn_tfidf_ft_smote_akting.h5', epochs=10, batch_size=128):
    """Train a three-layer SimpleRNN classifier and score it on the test data.

    The model is trained with early stopping on ``val_loss``; the epoch with
    the best ``val_accuracy`` is checkpointed to ``checkpoint_path``, reloaded,
    and used for the final predictions.

    NOTE(review): ``x_test_c``/``y_test_c`` serve both as validation data (for
    early stopping and checkpoint selection) and as the evaluation set, so the
    reported metrics are selected on the data they are measured on. A separate
    validation split would give an unbiased estimate.

    Parameters
    ----------
    x_train_c, x_test_c : array-like
        Model inputs; assumed to be 3-D (samples, timesteps, features), as
        SimpleRNN requires - TODO confirm against the caller.
    y_train_c, y_test_c : array-like
        One-hot encoded labels, shape (samples, n_classes).
    checkpoint_path : str, optional
        File the best model is saved to and reloaded from.
    epochs : int, optional
        Maximum number of training epochs.
    batch_size : int, optional
        Mini-batch size used for training.

    Returns
    -------
    list
        ``[f1, precision, recall, accuracy, cm]`` where the first three are
        macro-averaged scores and ``cm`` is the confusion matrix.
    """
    # Size the softmax layer from the labels instead of hard-coding 3 classes.
    n_classes = np.asarray(y_train_c).shape[1]

    rnn = Sequential()
    rnn.add(SimpleRNN(units=256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=128, activation='relu', return_sequences=True))
    rnn.add(Dropout(0.2))
    rnn.add(SimpleRNN(units=64, activation='relu'))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=n_classes, activation='softmax'))

    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Build with an unspecified batch dimension; the sample count of the
    # training array is not part of the model's input shape.
    rnn.build((None,) + tuple(x_train_c.shape[1:]))

    early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                                   patience=3, min_delta=0.0001)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                                       mode='max', verbose=1, save_best_only=True)

    rnn.fit(x_train_c, y_train_c, epochs=epochs, batch_size=batch_size,
            validation_data=(x_test_c, y_test_c), verbose=1,
            callbacks=[early_stopping, model_checkpoint])

    # Evaluate the best checkpoint, not the weights from the last epoch.
    model = load_model(checkpoint_path)
    y_pred = model.predict(x_test_c)
    y_pred_cat = y_pred.argmax(axis=1)
    y_test_cat = np.argmax(y_test_c, axis=1)

    cm = confusion_matrix(y_test_cat, y_pred_cat)
    classreport = classification_report(y_test_cat, y_pred_cat)
    f1 = f1_score(y_test_cat, y_pred_cat, average="macro")
    recall = recall_score(y_test_cat, y_pred_cat, average="macro")
    precision = precision_score(y_test_cat, y_pred_cat, average="macro")
    accuracy = accuracy_score(y_test_cat, y_pred_cat)

    print(classreport)
    print("F1 Score : ", f1)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("Accuracy : ", accuracy)

    return [f1, precision, recall, accuracy, cm]

# Running

In [12]:
# Train and evaluate the model five times; each row stores the run number
# (as a string) followed by [f1, precision, recall, accuracy, cm].
hasil = []

for i in range(1, 6):
    run_metrics = RNN(x_train, y_train, x_test, y_test)
    hasil.append([str(i), *run_metrics])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.91511, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.91511 to 0.94360, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.94360 to 0.95675, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.95675 to 0.96482, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.96482
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.96482
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.96482
Epoch 7: early stopping
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4131
           1       0.96      0.94      0.95      4131
           2       0.95      0.96      0.95      4131

    accuracy                           0.96     12393
   macro avg       0.96      0.96      0.96     12393
weighted avg       

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.90874, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.90874 to 0.93617, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.93617 to 0.95489, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.95489 to 0.95796, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.95796 to 0.96304, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 6/10
Epoch 6: val_accuracy improved from 0.96304 to 0.96619, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.96619
Epoch 8/10
Epoch 8: val_accuracy improved from 0.96619 to 0.96756, saving model to rnn_tfidf_ft_smote_akting.h5
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.96756
Epoch 9: early stopping
              precision    recall  f1-score   support

           0    

# Save Output

In [13]:
# Tabulate the per-run metrics, one row per training run.
# NOTE: this rebinds `df`, which earlier held the review data.
result_columns = ['iterasi', 'f1', 'precision', 'recall', 'accuracy', 'cm']
df = pd.DataFrame(data=hasil, columns=result_columns)
# Excel export is currently disabled:
#df.to_excel('rnn_tfidf_ft_smote_akting.xlsx')
df.head()

Unnamed: 0,iterasi,f1,precision,recall,accuracy,cm
0,1,0.964716,0.964709,0.964819,0.964819,"[[4123, 4, 4], [46, 3888, 197], [24, 161, 3946]]"
1,2,0.963768,0.963731,0.963851,0.963851,"[[4117, 13, 1], [39, 3894, 198], [23, 174, 3934]]"
2,3,0.966826,0.966792,0.966917,0.966917,"[[4123, 8, 0], [55, 3918, 158], [16, 173, 3942]]"
3,4,0.967508,0.967534,0.967562,0.967562,"[[4113, 7, 11], [34, 3907, 190], [18, 142, 3971]]"
4,5,0.966503,0.966487,0.966594,0.966594,"[[4121, 10, 0], [49, 3898, 184], [16, 155, 3960]]"


In [14]:
# Mean accuracy and macro F1 over all runs, reported as percentages.
mean_accuracy_pct = df['accuracy'].mean() * 100
mean_f1_pct = df['f1'].mean() * 100
print("Accuracy : %.2f" % mean_accuracy_pct)
print("F1-Score : %.2f" % mean_f1_pct)

Accuracy : 96.59
F1-Score : 96.59
