# Hoax Detection Using RNN-LSTM
## Dataset from Satria Data 2020 - Big Data Challenge

## Convolutional Neural Network

In [None]:
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

## Needed Function

In [None]:
# transform image into array
def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image file and return it as an array divided by 255.

    Args:
        image_path: Path of the image file, handed to ``load_img``.
        target_size: ``(height, width)`` the image is resized to while
            loading. Defaults to ``(224, 224)``, the spatial size of the
            ``input_shape`` used by ``resnet50`` in this file.

    Returns:
        The array produced by ``img_to_array`` with every value divided
        by 255.
    """
    # load_img expects a (height, width) pair; the channel count is not part
    # of target_size.
    img = load_img(image_path, target_size=target_size)
    return img_to_array(img) / 255


# transform single image into array for prediction
def single_preprocess_image(image_path):
    """Load one image and wrap it in a leading batch axis of length one.

    The loading and scaling are delegated to ``preprocess_image`` so both
    helpers always apply the same pipeline.

    Args:
        image_path: Path of the image file.

    Returns:
        The ``preprocess_image`` result with a new axis inserted at
        position 0.
    """
    return np.expand_dims(preprocess_image(image_path), axis=0)

In [None]:
# shared loader used by train_data and test_data
def _load_labeled_images(data_path, banner, array_name):
    """Load every image under ``data_path`` and one-hot encode its folder name.

    ``data_path`` is expected to contain one sub-directory per class. Each
    file inside a sub-directory is loaded with ``preprocess_image`` and
    labelled with the name of that sub-directory.

    Args:
        data_path: Directory holding the per-class sub-directories.
        banner: Word inserted into the "-- PREPARE ... DATA --" header.
        array_name: Name printed in front of the array byte-size report.

    Returns:
        Tuple ``(images, labels, encoder)``: the stacked image array, the
        one-hot label matrix (one column per entry of ``data_path``) and
        the fitted ``LabelEncoder``.

    Raises:
        ValueError: If no image file is found under ``data_path``.
    """
    print(f"\n-- PREPARE {banner} DATA --")

    class_names = sorted(os.listdir(data_path))
    label_size = len(class_names)

    # (file path, class name) for every image, classes in sorted order
    samples = [
        (f"{data_path}/{sub}/{photo}", sub)
        for sub in class_names
        for photo in os.listdir(f"{data_path}/{sub}")
    ]
    if not samples:
        raise ValueError(f"no images found under {data_path!r}")

    images = None
    for index, (filename, _) in enumerate(tqdm(samples)):
        image_out = preprocess_image(filename)
        if images is None:
            # Allocate the destination once and fill it in place, so the
            # data is never re-copied while the array grows.
            images = np.empty((len(samples), *image_out.shape),
                              dtype=image_out.dtype)
        images[index] = image_out

    # encode the folder names as integers, then as one-hot rows
    encoder = LabelEncoder()
    encoded = encoder.fit_transform([sub for _, sub in samples])
    labels = to_categorical(encoded, label_size)

    print(f"{array_name} image np\t: ", images.nbytes)
    return (images, labels, encoder)


# read and preprocess training data
def train_data(train_data_path):
    """Load the training images and their one-hot labels.

    Args:
        train_data_path: Directory with one sub-directory per class.

    Returns:
        Tuple ``(X_train, y_train, le_train)``.
    """
    return _load_labeled_images(train_data_path, "TRAINING", "X_train")


# read and preprocess testing data
def test_data(test_data_path):
    """Load the testing images and their one-hot labels.

    Args:
        test_data_path: Directory with one sub-directory per class.

    Returns:
        Tuple ``(X_test, y_test, le_test)``.
    """
    return _load_labeled_images(test_data_path, "TESTING", "X_test")

In [None]:
# Build the ResNet-50 architecture shipped with tensorflow.keras
def resnet50(output_class):
    """Return a ResNet50 model with no pre-loaded weights.

    Args:
        output_class: Number of classes, forwarded as ``classes``.

    Returns:
        The ``ResNet50`` instance built for inputs of shape (224, 224, 3).
    """
    return ResNet50(
        input_shape=(224, 224, 3),
        weights=None,
        classes=output_class,
    )

# DEFINE MODEL CALLBACKS
def my_callbacks(MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Build the training callbacks for one run.

    All output files live in ``../Model/<MODEL_NAME>/`` and share a name
    stem made of the model name, label count, images per label, epoch
    count and batch size.

    Args:
        MODEL_NAME: Model name; also the output sub-directory name.
        label_size: Number of labels, embedded in the file names.
        image_per_label: Images per label, embedded in the file names.
        EPOCHS: Epoch count, embedded in the file names.
        BS: Batch size, embedded in the file names.

    Returns:
        List with a ``CSVLogger`` and two ``ModelCheckpoint`` callbacks,
        one keeping the best ``val_loss`` model and one keeping the best
        ``val_accuracy`` model.
    """
    output_dir = f"../Model/{MODEL_NAME}"
    # The callbacks write into this directory; create it up front so a fresh
    # checkout does not fail on a missing folder.
    os.makedirs(output_dir, exist_ok=True)

    run_tag = f"{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}"
    model_callbacks = [
        # EarlyStopping(monitor="val_loss", patience=5),
        CSVLogger(
            filename=f"{output_dir}/history_{run_tag}.csv", separator=",", append=False),
        ModelCheckpoint(
            filepath=f"{output_dir}/model_{run_tag}_val_loss.h5", monitor="val_loss", save_best_only=True),
        ModelCheckpoint(
            filepath=f"{output_dir}/model_{run_tag}_val_accuracy.h5", monitor="val_accuracy", save_best_only=True),
    ]
    return model_callbacks

# Load a saved model from disk
def load_my_model(MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Load the ``..._val_accuracy.h5`` model file of a run.

    Args:
        MODEL_NAME: Model name; also the sub-directory under ``../Model``.
        label_size: Number of labels used in the file name.
        image_per_label: Images per label used in the file name.
        EPOCHS: Epoch count used in the file name.
        BS: Batch size used in the file name.

    Returns:
        The object returned by ``load_model`` for that file.
    """
    checkpoint_name = (
        f"model_{MODEL_NAME}_label{label_size}_data{image_per_label}"
        f"_e{EPOCHS}_bs{BS}_val_accuracy.h5"
    )
    return load_model(f"../Model/{MODEL_NAME}/{checkpoint_name}")

In [None]:
# Text styling shared by the plot titles and the axis labels
title_font = dict(weight='medium', size='medium')
axis_font = dict(size='small')


# Visualize accuracy from training model
def visualize_accuracy(Historia, MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Plot training and validation accuracy per epoch and save the figure.

    Args:
        Historia: Object whose ``history`` mapping holds the "accuracy" and
            "val_accuracy" series (as returned by ``model.fit``).
        MODEL_NAME: Model name; also the output sub-directory name.
        label_size: Number of labels, shown in the caption and file name.
        image_per_label: Images per label, shown in the caption and file name.
        EPOCHS: Configured epoch count, shown in the caption and file name.
        BS: Batch size, shown in the caption and file name.

    The figure is written to
    ``../Model/<MODEL_NAME>/figure_accuracy_<run>.png`` at 600 dpi.
    """
    train_acc = Historia.history["accuracy"]
    val_acc = Historia.history["val_accuracy"]

    # Number the x axis from the epochs actually recorded: if fewer than
    # EPOCHS were recorded, an EPOCHS-long axis would not match the data.
    Numero = np.arange(1, len(train_acc) + 1, 1)

    # plot accuracy
    plt.figure()
    plt.plot(Numero, train_acc, label="train_acc")
    plt.plot(Numero, val_acc, label="val_acc")
    plt.title("Training and Validation Accuracy on Dataset", **title_font)
    plt.xlabel(
        f"Epoch {EPOCHS} Batch Size {BS} Label {label_size} Data {image_per_label}", **axis_font)
    plt.ylabel("Accuracy", **axis_font)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.grid()
    plt.savefig(
        f"../Model/{MODEL_NAME}/figure_accuracy_{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}.png", dpi=600)


# Visualize loss from training model
def visualize_loss(Historia, MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Plot training and validation loss per epoch and save the figure.

    Args:
        Historia: Object whose ``history`` mapping holds the "loss" and
            "val_loss" series (as returned by ``model.fit``).
        MODEL_NAME: Model name; also the output sub-directory name.
        label_size: Number of labels, shown in the caption and file name.
        image_per_label: Images per label, shown in the caption and file name.
        EPOCHS: Configured epoch count, shown in the caption and file name.
        BS: Batch size, shown in the caption and file name.

    The figure is written to
    ``../Model/<MODEL_NAME>/figure_loss_<run>.png`` at 600 dpi.
    """
    train_loss = Historia.history["loss"]
    val_loss = Historia.history["val_loss"]

    # Number the x axis from the epochs actually recorded: if fewer than
    # EPOCHS were recorded, an EPOCHS-long axis would not match the data.
    Numero = np.arange(1, len(train_loss) + 1, 1)

    # plot loss
    plt.figure()
    plt.plot(Numero, train_loss, label="train_loss")
    plt.plot(Numero, val_loss, label="val_loss")
    plt.title("Training and Validation Loss on Dataset", **title_font)
    plt.xlabel(
        f"Epoch {EPOCHS} Batch Size {BS} Label {label_size} Data {image_per_label}", **axis_font)
    plt.ylabel("Loss", **axis_font)
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.grid()
    plt.savefig(
        f"../Model/{MODEL_NAME}/figure_loss_{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}.png", dpi=600)


# Save the classification report as CSV and echo it to stdout
def print_class_reports(tes, predictions, target_names, MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Write the classification report to a CSV file and print it.

    Args:
        tes: True class indices.
        predictions: Predicted class indices.
        target_names: Display names passed to ``classification_report``.
        MODEL_NAME: Model name; also the output sub-directory name.
        label_size: Number of labels, embedded in the file name.
        image_per_label: Images per label, embedded in the file name.
        EPOCHS: Epoch count, embedded in the file name.
        BS: Batch size, embedded in the file name.
    """
    # dictionary form -> one CSV row per report entry after the transpose
    report_table = pd.DataFrame(
        classification_report(
            tes, predictions, target_names=target_names, output_dict=True)
    ).transpose()
    csv_path = (
        f"../Model/{MODEL_NAME}/reports_{MODEL_NAME}_label{label_size}"
        f"_data{image_per_label}_e{EPOCHS}_bs{BS}.csv"
    )
    report_table.to_csv(csv_path)

    # text form for the console
    text_report = classification_report(
        tes, predictions, target_names=target_names)
    print(text_report)


# Compute the confusion matrix, draw it as a heatmap and print it
def print_conf_matrix(tes, predictions, target_names, MODEL_NAME, label_size, image_per_label, EPOCHS, BS):
    """Plot, save and print the confusion matrix of the predictions.

    Args:
        tes: True class indices.
        predictions: Predicted class indices.
        target_names: Tick labels for both heatmap axes.
        MODEL_NAME: Model name; also the output sub-directory name.
        label_size: Number of labels, embedded in the file name.
        image_per_label: Images per label, embedded in the file name.
        EPOCHS: Epoch count, embedded in the file name.
        BS: Batch size, embedded in the file name.
    """
    confusion_mtx = confusion_matrix(tes, predictions)

    # Cell annotations stay off (no annot=True is passed).
    heatmap_options = {
        "xticklabels": target_names,
        "yticklabels": target_names,
        "fmt": 'g',
        "cbar_kws": {'label': 'Individual Image'},
    }

    plt.figure()
    sns.heatmap(confusion_mtx, **heatmap_options)
    plt.title("Confusion Matrix on Prediction", **title_font)

    # axis captions and tick sizes
    plt.xlabel('Prediction', **axis_font)
    plt.ylabel('Label', **axis_font)
    plt.xticks(fontsize=4)
    plt.yticks(fontsize=4)

    plt.tight_layout()
    figure_path = (
        f"../Model/{MODEL_NAME}/confusion_matrix_{MODEL_NAME}_label{label_size}"
        f"_data{image_per_label}_e{EPOCHS}_bs{BS}.png"
    )
    plt.savefig(figure_path, dpi=600)
    print(confusion_mtx)

## PREPROCESS

In [None]:
# Run configuration shared by every cell below
MODEL_NAME = "RESNET50"
SPLIT_SIZE = ""
EPOCHS = 150
BS = 10
LEARNING_RATE = 0.001

# Echo the hyperparameters so they appear in the run output
print("[INFO] Hyperparameter:")
print(f"Epoch: {EPOCHS}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Batch Size: {BS}")

In [None]:
# Locations of the training and testing splits
training_dataset = f"../Dataset/SplittedDataset{SPLIT_SIZE}/training_data"
testing_dataset = f"../Dataset/SplittedDataset{SPLIT_SIZE}/testing_data"

# One entry per class folder in each split
list_training = os.listdir(training_dataset)
list_testing = os.listdir(testing_dataset)

# Number of classes = number of entries in the training split
label_size = len(list_training)
print(f"Label Size: {label_size}")

# Image count is read from the first listed training folder only
image_per_label = len(os.listdir(f"{training_dataset}/{list_training[0]}"))
print(f"Image Per Label: {image_per_label}")

In [None]:
# READ TRAIN AND TEST DATA
X_train, y_train, le_train = train_data(training_dataset)
X_test, y_test, le_test = test_data(testing_dataset)

# Each split fits its own LabelEncoder, so the one-hot columns only line up
# when both splits contain exactly the same class folders. Fail early instead
# of training and evaluating against misaligned labels.
if list(le_train.classes_) != list(le_test.classes_):
    raise ValueError(
        "training and testing datasets contain different class folders; "
        "their one-hot label columns would not match")

## BUILD AND TRAIN MODEL

In [None]:
# Build the network with one output per class folder
model = resnet50(output_class=label_size)

# Print the layer-by-layer summary
model.summary()

In [None]:
# COMPILE MODEL
adam = Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam, loss='categorical_crossentropy',
              metrics=['accuracy'])

# Initiate Callbacks
my_callbacks = my_callbacks(MODEL_NAME, label_size,
                            image_per_label, EPOCHS, BS)

In [None]:
# Initiate start time
start_time = time.time()

# TRAIN MODEL
Historia = model.fit(X_train, y_train, validation_data=(
    X_test, y_test), callbacks=my_callbacks, epochs=EPOCHS, batch_size=BS)

# end time
print("--- %s seconds ---" % (time.time() - start_time))

## EVALUATE MODEL

In [None]:
# Reload the model file saved under the val_accuracy checkpoint name,
# then score it on the test split
model = load_my_model(
    MODEL_NAME=MODEL_NAME,
    label_size=label_size,
    image_per_label=image_per_label,
    EPOCHS=EPOCHS,
    BS=BS,
)

results = model.evaluate(X_test, y_test, batch_size=BS)
print(results)

In [None]:
# Turn score rows / one-hot rows into class indices (arg-max along each row)
predictions = list(np.argmax(model.predict(X_test, batch_size=BS), axis=1))
tes = list(np.argmax(y_test, axis=1))

In [None]:
# Save and print the per-class classification report
print_class_reports(
    tes=tes,
    predictions=predictions,
    target_names=le_test.classes_,
    MODEL_NAME=MODEL_NAME,
    label_size=label_size,
    image_per_label=image_per_label,
    EPOCHS=EPOCHS,
    BS=BS,
)

In [None]:
# Plot, save and print the confusion matrix
print_conf_matrix(
    tes=tes,
    predictions=predictions,
    target_names=le_test.classes_,
    MODEL_NAME=MODEL_NAME,
    label_size=label_size,
    image_per_label=image_per_label,
    EPOCHS=EPOCHS,
    BS=BS,
)

## OBSERVE MODEL

In [None]:
# Accuracy curves of the training run
visualize_accuracy(
    Historia=Historia,
    MODEL_NAME=MODEL_NAME,
    label_size=label_size,
    image_per_label=image_per_label,
    EPOCHS=EPOCHS,
    BS=BS,
)

In [None]:
# Loss curves of the training run
visualize_loss(
    Historia=Historia,
    MODEL_NAME=MODEL_NAME,
    label_size=label_size,
    image_per_label=image_per_label,
    EPOCHS=EPOCHS,
    BS=BS,
)