# Hoax Detection Using CNN
## Dataset from Satria Data 2020 - Big Data Challenge

## Convolutional Neural Network

In [1]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

## Helper Functions

In [2]:
# Read Training image from dataframe
def optimize_aug_train_data(train_dataframe, BS):
    """Build the training and validation image generators from a dataframe.

    One ``ImageDataGenerator`` rescales pixel values by 1/255 and holds out
    20% of the rows as the validation subset. Images are resized to 224x224
    and labels are produced in binary mode.

    Args:
        train_dataframe: DataFrame with a ``"nama file gambar"`` column (file
            names resolved against ``../data/training/image/``) and a
            ``"label"`` column.
        BS: Batch size used by both generators.

    Returns:
        Tuple ``(train_dataset, validation_dataset)``.
    """
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=0.20,
    )

    # Options shared by the two subsets; only `subset` and `shuffle` differ.
    shared_options = dict(
        dataframe=train_dataframe,
        directory="../data/training/image/",
        x_col="nama file gambar",
        y_col="label",
        target_size=(224, 224),
        class_mode="binary",
        batch_size=BS,
    )

    train_dataset = train_datagen.flow_from_dataframe(
        subset="training",
        shuffle=True,
        **shared_options,
    )

    # The validation subset is NOT shuffled: the evaluation cells compare
    # model.predict(validation_dataset) against validation_dataset.classes,
    # which only lines up when batches are served in dataframe order.
    validation_dataset = train_datagen.flow_from_dataframe(
        subset="validation",
        shuffle=False,
        **shared_options,
    )
    return train_dataset, validation_dataset


# Read testing image from dataframe
def optimize_aug_test_data(train_dataframe, BS, directory="../data/training/image/"):
    """Build a label-free, unshuffled image generator for inference.

    Args:
        train_dataframe: DataFrame with a ``"nama file gambar"`` column of
            image file names. (The parameter keeps its historical name; it
            is the dataframe to predict on.)
        BS: Batch size of the generator.
        directory: Folder the file names are resolved against. The default is
            the path that was previously hard-coded.
            NOTE(review): that default points at the *training* images;
            callers predicting on the test set most likely need to pass the
            test image folder - confirm.

    Returns:
        The generator yielding batches of rescaled 224x224 images only.
    """
    test_datagen = ImageDataGenerator(
        rescale=1./255
    )
    test_dataset = test_datagen.flow_from_dataframe(
        dataframe=train_dataframe,
        directory=directory,
        x_col="nama file gambar",
        target_size=(224, 224),
        # No label column is required for inference data.
        class_mode=None,
        batch_size=BS,
        # Keep dataframe order so predictions can be matched back to rows.
        shuffle=False,
    )
    # Previously returned the undefined name `train_dataset`.
    return test_dataset

In [3]:
# DEFINE MODEL CALLBACKS
def my_callbacks(EPOCHS, BS):
    """Create the Keras callbacks used while training.

    The epoch count and batch size are only used to tag the output files.

    Returns:
        A list with, in order: a CSV logger of the training history, a
        checkpoint keeping the model with the best ``val_loss``, and a
        checkpoint keeping the model with the best ``val_accuracy``.
    """
    history_logger = CSVLogger(
        filename=f"../log/history_cnn_e{EPOCHS}_bs{BS}.csv",
        separator=",",
        append=False,
    )
    best_loss_checkpoint = ModelCheckpoint(
        filepath=f"../model/model_cnn_e{EPOCHS}_bs{BS}_val_loss.h5",
        monitor="val_loss",
        save_best_only=True,
    )
    best_accuracy_checkpoint = ModelCheckpoint(
        filepath=f"../model/model_cnn_e{EPOCHS}_bs{BS}_val_accuracy.h5",
        monitor="val_accuracy",
        save_best_only=True,
    )
    return [history_logger, best_loss_checkpoint, best_accuracy_checkpoint]

# DEFINE LOAD MODEL
def load_my_model(EPOCHS, BS):
    """Load the ``val_accuracy`` checkpoint saved for this epoch/batch-size run."""
    checkpoint_path = f"../model/model_cnn_e{EPOCHS}_bs{BS}_val_accuracy.h5"
    return load_model(checkpoint_path)

In [4]:
# Text styling shared by every plot helper in this notebook
title_font = dict(weight='medium', size='medium')
axis_font = dict(size='small')


# Visualize accuracy from training model
def visualize_accuracy(Historia, EPOCHS, BS):
    """Plot training and validation accuracy per epoch.

    Args:
        Historia: Object whose ``history`` mapping holds the per-epoch
            ``"accuracy"`` and ``"val_accuracy"`` values (the return value of
            ``model.fit``).
        EPOCHS: Configured number of epochs; used in the x-axis caption.
        BS: Batch size; used in the x-axis caption.
    """
    train_accuracy = Historia.history["accuracy"]
    val_accuracy = Historia.history["val_accuracy"]

    # Derive the x positions from what was actually recorded instead of from
    # EPOCHS, so a run that ended before EPOCHS epochs can still be plotted.
    epoch_axis = np.arange(1, len(train_accuracy) + 1)

    # plot accuracy
    plt.figure()
    plt.plot(epoch_axis, train_accuracy, label="train_acc")
    plt.plot(epoch_axis, val_accuracy, label="val_acc")
    plt.title("Training and Validation Accuracy on Dataset", **title_font)
    plt.xlabel(
        f"Epoch {EPOCHS} Batch Size {BS}", **axis_font)
    plt.ylabel("Accuracy", **axis_font)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.grid()
    plt.show()
#     plt.savefig(
#         f"../Model/{MODEL_NAME}/figure_accuracy_{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}.png", dpi=600)


# Visualize loss from training model
def visualize_loss(Historia, EPOCHS, BS):
    """Plot training and validation loss per epoch.

    Args:
        Historia: Object whose ``history`` mapping holds the per-epoch
            ``"loss"`` and ``"val_loss"`` values (the return value of
            ``model.fit``).
        EPOCHS: Configured number of epochs; used in the x-axis caption.
        BS: Batch size; used in the x-axis caption.
    """
    train_loss = Historia.history["loss"]
    val_loss = Historia.history["val_loss"]

    # Derive the x positions from what was actually recorded instead of from
    # EPOCHS, so a run that ended before EPOCHS epochs can still be plotted.
    epoch_axis = np.arange(1, len(train_loss) + 1)

    # plot loss
    plt.figure()
    plt.plot(epoch_axis, train_loss, label="train_loss")
    plt.plot(epoch_axis, val_loss, label="val_loss")
    plt.title("Training and Validation Loss on Dataset", **title_font)
    plt.xlabel(
        f"Epoch {EPOCHS} Batch Size {BS}", **axis_font)
    plt.ylabel("Loss", **axis_font)
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.grid()
    plt.show()
#     plt.savefig(
#         f"../Model/{MODEL_NAME}/figure_loss_{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}.png", dpi=600)


# Print classification report to csv
def print_class_reports(tes, predictions, target_names, EPOCHS, BS):
    """Write the classification report to CSV and print it.

    Args:
        tes: True labels.
        predictions: Predicted labels.
        target_names: Display names of the classes, forwarded to
            ``classification_report``.
        EPOCHS: Epoch count, used only in the CSV file name.
        BS: Batch size, used only in the CSV file name.
    """
    # Dictionary form -> one row per class/average in the CSV file.
    report_as_dict = classification_report(
        tes, predictions, target_names=target_names, output_dict=True)
    report_table = pd.DataFrame(report_as_dict).T
    report_table.to_csv(
        f"../log/class_reports_model_cnn_e{EPOCHS}_bs{BS}.csv")

    # Text form for the notebook output.
    report_as_text = classification_report(
        tes, predictions, target_names=target_names)
    print(report_as_text)


# print confusion matrix and visualize it
def print_conf_matrix(tes, predictions, target_names, EPOCHS, BS):
    """Show the confusion matrix of the predictions as a heatmap.

    Args:
        tes: True labels.
        predictions: Predicted labels.
        target_names: Class names used as tick labels on both axes.
        EPOCHS: Not used by the plot; kept for a signature parallel to
            ``print_class_reports``.
        BS: Not used by the plot; kept for the same reason.
    """
    matrix = confusion_matrix(tes, predictions)

    plt.figure()
    heatmap_options = {
        "xticklabels": target_names,
        "yticklabels": target_names,
        "fmt": 'g',
        "cbar_kws": {'label': 'Individual Image'},
    }
    sns.heatmap(matrix, **heatmap_options)

    plt.title("Confusion Matrix on Prediction", **title_font)
    # Columns are predictions, rows are the true labels.
    plt.xlabel('Prediction', **axis_font)
    plt.xticks(fontsize=4)
    plt.yticks(fontsize=4)
    plt.ylabel('Label', **axis_font)
    plt.tight_layout()
    plt.show()
#     plt.savefig(
#         f"../Model/{MODEL_NAME}/confusion_matrix_{MODEL_NAME}_label{label_size}_data{image_per_label}_e{EPOCHS}_bs{BS}.png", dpi=600)
#     print(confusion_mtx)

## PREPROCESS

In [5]:
# Training hyperparameters shared by the cells below
BS = 10                 # batch size
EPOCHS = 25             # number of training epochs
LEARNING_RATE = 0.001   # Adam learning rate
SPLIT_SIZE = ""

print("[INFO] Hyperparameter:")
print(f"Epoch: {EPOCHS}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Batch Size: {BS}")

[INFO] Hyperparameter:
Epoch: 25
Learning rate: 0.001
Batch Size: 10


In [6]:
# Load the training sheet; every column is read as text (dtype=str)
df_train = pd.read_excel(
    "../data/training/DataLatih.xlsx",
    dtype=str,
    engine="openpyxl",
)
df_train.head()

Unnamed: 0,ID,label,tanggal,judul,narasi,nama file gambar,judul_translate,narasi_translate
0,71,1,2020-08-17 00:00:00,Pemakaian Masker Menyebabkan Penyakit Legionna...,A caller to a radio talk show recently shared ...,71.jpg,Pemakaian Masker Menyebabkan Penyakit Legionna...,Seorang penelepon ke talk show radio baru-baru...
1,461,1,2020-07-17 00:00:00,Instruksi Gubernur Jateng tentang penilangan ...,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,461.png,Instruksi Gubernur Jateng TENTANG penilangan B...,Yth.Seluruh Anggota Anggota Grup Sesuai Instru...
2,495,1,2020-07-13 00:00:00,Foto Jim Rohn: Jokowi adalah presiden terbaik ...,Jokowi adalah presiden terbaik dlm sejarah ban...,495.png,Foto Jim Rohn: Jokowi Adalah Presiden Terbaik ...,Jokowi Adalah Presiden Terbaik dlm Sejarah ban...
3,550,1,2020-07-08 00:00:00,"ini bukan politik, tapi kenyataan Pak Jokowi b...","Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",550.png,"Suami Bukan politik, TAPI Kenyataan Pak Jokowi...","Maaf Mas2 Dan Mbak2, Penyanyi Bukan politik, T..."
4,681,1,2020-06-24 00:00:00,Foto Kadrun kalo lihat foto ini panas dingin,Kadrun kalo lihat foto ini panas dingin . .,681.jpg,Foto Kadrun kalo lihat foto Penyanyi Panas Dingin,Kadrun kalo lihat foto Penyanyi Panas Dingin. .


In [47]:
# Oversample label "0": stack two extra copies of its rows under the original
# frame. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): the generator cell below is fed `df_train`, not this
# oversampled frame - confirm which one training is meant to use.
df_train_0 = df_train[df_train["label"] == "0"]
df_train_dummy = pd.concat(
    [df_train, df_train_0, df_train_0],
    ignore_index=True,
)
df_train_dummy["label"].value_counts()

1    3465
0    2298
Name: label, dtype: int64

In [7]:
# Load the test sheet; every column is read as text (dtype=str)
df_test = pd.read_excel(
    "../data/testing/DataUji.xlsx",
    dtype=str,
    engine="openpyxl",
)
df_test.head()

Unnamed: 0,ID,tanggal,judul,narasi,nama file gambar,judul_translate,narasi_translate
0,238057,2020-07-13 00:00:00,Narasi Tito Karnavian Berideologi Komunis Kare...,TITO KARNIVAN ITU BERIDIOLOGI KOMUNIS DIA BISA...,238057.jpg,Narasi Tito Karnavian Berideologi Komunis KARE...,TITO KARNIVAN ITU beridiologi Komunis DIA BISA...
1,238158,2020-07-06 00:00:00,Anies: Seberat beratnya Pekerjaan Akan terasa ...,Seberat beratnya Pekerjaan Akan terasa ringan ...,238158.jpg,Anies: seberat beratnya Pekerjaan Akan terasa ...,Seberat beratnya Pekerjaan Akan terasa Anda Ri...
2,238865,2020-04-22 00:00:00,Hindu di india Melemparkan Patung Buatan Merek...,Hindu di india melemparkan patung buatan merek...,238865.jpg,Hindu di india Melemparkan Patung Buatan Merek...,Hindu di india melemparkan patung Buatan merek...
3,248298,2019-10-22 00:00:00,RSCM Praktekkan Penyedotan Plug Vena/Saluran ...,Mulai Hari ini di RSCM mulai diPraktekkan Peny...,248298.jpg,RSCM praktekkan penyedotan Plug Vena / Saluran...,Mulai Hari Penyanyi di RSCM Mulai diPraktekkan...
4,255176,2020-05-01 00:00:00,Permohonan Kelonggaran Angsuran ke OJK,"Untuk sekedar info, Bagi anda yg punya ansuran...",255176.jpg,Permohonan Kelonggaran Angsuran Ke OJK,"Untuk Sekedar info, Bagi Andari yg Punya ansur..."


In [8]:
# Build the training and validation image generators from the labelled frame
train_dataset, validation_dataset = optimize_aug_train_data(
    train_dataframe=df_train,
    BS=BS,
)

Found 3377 validated image filenames belonging to 2 classes.
Found 844 validated image filenames belonging to 2 classes.


  .format(n_invalid, x_col)
  .format(n_invalid, x_col)


## BUILD AND TRAIN MODEL

In [9]:
# Reset Keras' global state before the model is (re)built, so re-running the
# cells below in the same kernel starts from a clean session.
from tensorflow.keras.backend import clear_session
clear_session()

In [10]:
# DEFINE MODEL
# One padded 3x3 convolution block followed by a small dense head.
model = Sequential([
    # feature extractor
    ZeroPadding2D((1, 1), input_shape=(224, 224, 3)),
    Convolution2D(16, (3, 3), activation='relu'),
    MaxPooling2D((2, 2), strides=(2, 2)),
    Dropout(0.50),
    # classifier head
    Flatten(),
    Dense(8, activation="relu"),
    # output layer: a single sigmoid unit for the binary label
    Dense(1, activation="sigmoid"),
])

# SUMMARY MODEL
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d (ZeroPadding2 (None, 226, 226, 3)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 224, 224, 16)      448       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 112, 112, 16)      0         
_________________________________________________________________
dropout (Dropout)            (None, 112, 112, 16)      0         
_________________________________________________________________
flatten (Flatten)            (None, 200704)            0         
_________________________________________________________________
dense (Dense)                (None, 8)                 1605640   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9

In [11]:
# COMPILE MODEL
adam = Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam, loss='binary_crossentropy',
              metrics=['accuracy'])

# Initiate Callbacks
my_callbacks = my_callbacks(EPOCHS, BS)

In [None]:
# Initiate start time
start_time = time.time()

# TRAIN MODEL
Historia = model.fit(train_dataset, validation_data=validation_dataset,callbacks=my_callbacks, epochs=EPOCHS, batch_size=BS)

# end time
print("--- %s seconds ---" % (time.time() - start_time))

## EVALUATE MODEL

In [None]:
# Swap the in-memory model for the checkpoint saved under the val_accuracy name
model = load_my_model(EPOCHS=EPOCHS, BS=BS)

# Loss and accuracy on the training subset
loss0, acc0 = model.evaluate(x=train_dataset)
print(loss0, acc0)

# Loss and accuracy on the validation subset
loss1, acc1 = model.evaluate(x=validation_dataset)
print(loss1, acc1)

In [None]:
# Predict batch by batch and keep the labels the generator served with each
# batch. Comparing model.predict(validation_dataset) with
# validation_dataset.classes is only valid for an unshuffled generator; pairing
# per batch is correct whether or not the generator shuffles.
batch_scores = []
batch_labels = []
for batch_index in range(len(validation_dataset)):
    batch_images, batch_targets = validation_dataset[batch_index]
    batch_scores.append(np.asarray(model.predict_on_batch(batch_images)))
    batch_labels.append(np.asarray(batch_targets))

# Sigmoid scores rounded to 0/1, shape (n_samples, 1) as before.
predictions = np.round(np.concatenate(batch_scores))
# True labels in the same order as `predictions`.
tes = np.concatenate(batch_labels).astype(int)

In [None]:
# Classification report for the validation predictions (printed and saved to CSV)
print_class_reports(
    tes=tes,
    predictions=predictions,
    target_names=validation_dataset.class_indices,
    EPOCHS=EPOCHS,
    BS=BS,
)

In [None]:
# Confusion-matrix heatmap for the validation predictions
print_conf_matrix(
    tes=tes,
    predictions=predictions,
    target_names=validation_dataset.class_indices,
    EPOCHS=EPOCHS,
    BS=BS,
)

## OBSERVE MODEL

In [None]:
# Training vs. validation accuracy curves
visualize_accuracy(Historia=Historia, EPOCHS=EPOCHS, BS=BS)

In [None]:
# Training vs. validation loss curves
visualize_loss(Historia=Historia, EPOCHS=EPOCHS, BS=BS)

## SAVE RESULT TO CSV

In [None]:
# `test_dataset` was never created in the cells above, so build it here:
# no labels (class_mode=None) and no shuffling, so predictions come back in
# the row order of df_test.
# NOTE(review): assumes the test images live in ../data/testing/image/
# (mirroring the training layout) - confirm the folder.
test_dataset = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    dataframe=df_test,
    directory="../data/testing/image/",
    x_col="nama file gambar",
    target_size=(224, 224),
    class_mode=None,
    batch_size=BS,
    shuffle=False,
)

# Round the sigmoid scores to 0/1 and flatten (n, 1) -> plain list of ints.
predictions_cnn = np.round(model.predict(test_dataset)).ravel().astype(int).tolist()

# The generator skips rows whose image file is missing or invalid. Pairing a
# shorter prediction list with df_test["ID"] would silently attach predictions
# to the wrong IDs, so fail loudly instead.
if len(predictions_cnn) != len(df_test):
    raise ValueError(
        f"Got {len(predictions_cnn)} predictions for {len(df_test)} test rows; "
        "some test images were not found or are invalid."
    )

results_cnn = pd.DataFrame(
    {"ID": df_test["ID"].tolist(), "Prediksi": predictions_cnn},
    columns=["ID", "Prediksi"],
)
results_cnn.to_csv("../result/neural_network/results_cnn.csv", index=False)