# Hoax Detection Using Traditional Machine Learning
## Dataset from Satria Data 2020 - Big Data Challenge

In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
from pandarallel import pandarallel
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from string import punctuation
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prinanda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# multiprocessing Initialization
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Read Dataset
train_data = pd.read_excel("../data/training/DataLatih.xlsx", engine="openpyxl")
test_data = pd.read_excel("../data/testing/DataUji.xlsx", engine="openpyxl")

In [5]:
train_data.head()

Unnamed: 0,ID,label,tanggal,judul,narasi,nama file gambar,judul_translate,narasi_translate
0,71,1,2020-08-17 00:00:00,Pemakaian Masker Menyebabkan Penyakit Legionna...,A caller to a radio talk show recently shared ...,71.jpg,Pemakaian Masker Menyebabkan Penyakit Legionna...,Seorang penelepon ke talk show radio baru-baru...
1,461,1,2020-07-17 00:00:00,Instruksi Gubernur Jateng tentang penilangan ...,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,461.png,Instruksi Gubernur Jateng TENTANG penilangan B...,Yth.Seluruh Anggota Anggota Grup Sesuai Instru...
2,495,1,2020-07-13 00:00:00,Foto Jim Rohn: Jokowi adalah presiden terbaik ...,Jokowi adalah presiden terbaik dlm sejarah ban...,495.png,Foto Jim Rohn: Jokowi Adalah Presiden Terbaik ...,Jokowi Adalah Presiden Terbaik dlm Sejarah ban...
3,550,1,2020-07-08 00:00:00,"ini bukan politik, tapi kenyataan Pak Jokowi b...","Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",550.png,"Suami Bukan politik, TAPI Kenyataan Pak Jokowi...","Maaf Mas2 Dan Mbak2, Penyanyi Bukan politik, T..."
4,681,1,2020-06-24 00:00:00,Foto Kadrun kalo lihat foto ini panas dingin,Kadrun kalo lihat foto ini panas dingin . .,681.jpg,Foto Kadrun kalo lihat foto Penyanyi Panas Dingin,Kadrun kalo lihat foto Penyanyi Panas Dingin. .


In [6]:
test_data.head()

Unnamed: 0,ID,tanggal,judul,narasi,nama file gambar,Unnamed: 5,judul_translate,narasi_translate
0,238057,2020-07-13 00:00:00,Narasi Tito Karnavian Berideologi Komunis Kare...,TITO KARNIVAN ITU BERIDIOLOGI KOMUNIS DIA BISA...,238057.jpg,,Narasi Tito Karnavian Berideologi Komunis KARE...,TITO KARNIVAN ITU beridiologi Komunis DIA BISA...
1,238158,2020-07-06 00:00:00,Anies: Seberat beratnya Pekerjaan Akan terasa ...,Seberat beratnya Pekerjaan Akan terasa ringan ...,238158.jpg,,Anies: seberat beratnya Pekerjaan Akan terasa ...,Seberat beratnya Pekerjaan Akan terasa Anda Ri...
2,238865,2020-04-22 00:00:00,Hindu di india Melemparkan Patung Buatan Merek...,Hindu di india melemparkan patung buatan merek...,238865.jpg,,Hindu di india Melemparkan Patung Buatan Merek...,Hindu di india melemparkan patung Buatan merek...
3,248298,2019-10-22 00:00:00,RSCM Praktekkan Penyedotan Plug Vena/Saluran ...,Mulai Hari ini di RSCM mulai diPraktekkan Peny...,248298.jpg,,RSCM praktekkan penyedotan Plug Vena / Saluran...,Mulai Hari Penyanyi di RSCM Mulai diPraktekkan...
4,255176,2020-05-01 00:00:00,Permohonan Kelonggaran Angsuran ke OJK,"Untuk sekedar info, Bagi anda yg punya ansuran...",255176.jpg,,Permohonan Kelonggaran Angsuran Ke OJK,"Untuk Sekedar info, Bagi Andari yg Punya ansur..."


## Pre-Processing

In [7]:
# Reconstruct train dataframe
train_df = pd.DataFrame()
train_df["konten"] = train_data["judul_translate"] + " " + train_data["narasi_translate"]
train_df["Class"] = train_data["label"]

# Reconstruct test dataframe
test_df = pd.DataFrame()
test_df["ID"] = test_data["ID"]
test_df["konten"] = test_data["judul_translate"] + " " + test_data["narasi_translate"]

In [8]:
# define stopword list, indonesia
STOPWORDS = set(StopWordRemoverFactory().get_stop_words() + stopwords.words('english'))

# define list kata singkat
KATASINGKAT = {"dlm":"dalam", "gw":"saya", "yg":"yang", "lu":"kamu", "dkt":"dekat", "kalo":"kalau", "n":"and"}

# define stemmer sastrawi for Indonesia
stemmer_ind = StemmerFactory().create_stemmer()
stemmer_eng = PorterStemmer()

In [9]:
# function of preprocessing

def remove_kata_singkat(word):
    if word in list(KATASINGKAT.keys()):
        return KATASINGKAT.get(word)
    else:
        return word
    
def normalize_word(row):
    # remove punctuation
    konten = re.sub(r'[^a-zA-Z\s]', '', row.konten, re.I|re.A)
    
    # case folding and remove kata singkat
    konten = " ".join([remove_kata_singkat(word.lower()).strip() for word in nltk.word_tokenize(konten)])
    
    # remove stopword and number
    konten = " ".join([word for word in nltk.word_tokenize(konten) if word not in punctuation and word.isalpha() and word not in STOPWORDS])
    
    # stemming
    konten = stemmer_ind.stem(konten)
    konten = stemmer_eng.stem(konten)
    
    # final assignment
    row.konten = konten
    
    return row

In [None]:
# # apply preprocess to dataframe
# train_df = train_df.apply(normalize_word, axis=1)
# test_df = test_df.apply(normalize_word, axis=1)

In [10]:
# Parallel preprocess to dataframe with progressbar
train_df = train_df.parallel_apply(normalize_word, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1058), Label(value='0 / 1058'))), …

In [11]:
test_df = test_df.parallel_apply(normalize_word, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=118), Label(value='0 / 118'))), HB…

In [12]:
train_df.head()

Unnamed: 0,konten,Class
0,pakai masker sebab sakit legionnaires orang te...,1
1,instruksi gubernur jateng tilang masker muka u...,1
2,foto jim rohn jokowi presiden baik sejarah ban...,1
3,suami bukan politik nyata pak jokowi hasil pul...,1
4,foto kadrun kalau lihat foto nyanyi panas ding...,1


In [13]:
test_df.head()

Unnamed: 0,ID,konten
0,238057,narasi tito karnavian ideologi komunis pernah ...
1,238158,anies berat berat kerja asa ringan bila kerja ...
2,238865,hindu india lempar patung buat laut tolong cor...
3,248298,rscm praktek sedot plug vena salur darah mulai...
4,255176,mohon longgar angsur ojk dar info andar punya ...


## Feature Extraction Using TF-IDF

In [14]:
# konten_all = train_df.loc[:,"konten"].append(test_df.loc[:,"konten"], ignore_index=True)
konten_train = train_df.loc[:,"konten"]

In [15]:
# konten_all
konten_train

0       pakai masker sebab sakit legionnaires orang te...
1       instruksi gubernur jateng tilang masker muka u...
2       foto jim rohn jokowi presiden baik sejarah ban...
3       suami bukan politik nyata pak jokowi hasil pul...
4       foto kadrun kalau lihat foto nyanyi panas ding...
                              ...                        
4226    kpk larang bawa brimob senjata masuk gedung dp...
4227    foto jabat uang bawah palu arit jangan mau ali...
4228    gambar denny siregar musuh warga tasikmalaya b...
4229    kaesang bapak kesederhaan nipu rakyat indonesi...
4230    laser termometer gun rusak struktur otak nolak...
Name: konten, Length: 4231, dtype: object

In [16]:
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=0.0)

# tokenize and build vocab0
tfidf_matrix = tfidf_vectorizer.fit(konten_train)

# encode training entry
tfidf_matrix = tfidf_vectorizer.transform(train_df.loc[:,"konten"])
tfidf_matrix = tfidf_matrix.toarray()

# get all unique words in the corpus
vocab = tfidf_vectorizer.get_feature_names()

# show document feature vectors
training_df = pd.DataFrame(tfidf_matrix, columns=vocab)
training_df["Class"] = train_df.loc[:,"Class"]
training_df

Unnamed: 0,aa,aaamiin,aaamiinn,aac,aaj,aalaamiin,aalamiin,aamiiin,aamiin,aamiinkan,...,zuhur,zulhasan,zulkarnaen,zulkarnain,zulkieflimansyah,zulkifli,zulkifliemansyah,zumi,zurina,Class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [17]:
# encode testing entry
tfidf_matrix_test = tfidf_vectorizer.transform(test_df.loc[:,"konten"])
tfidf_matrix_test = tfidf_matrix_test.toarray()

# show document feature vectors
testing_df = pd.DataFrame(tfidf_matrix_test, columns=vocab)
testing_df

Unnamed: 0,aa,aaamiin,aaamiinn,aac,aaj,aalaamiin,aalamiin,aamiiin,aamiin,aamiinkan,...,zuey,zuhur,zulhasan,zulkarnaen,zulkarnain,zulkieflimansyah,zulkifli,zulkifliemansyah,zumi,zurina
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Split data
X =  training_df.drop("Class", axis=1)
y =  training_df["Class"]

X_test = testing_df.copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [19]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((3384, 14035), (847, 14035), (3384,), (847,))

## Build and Train Model

### 1. KNN

In [20]:
knn_model = KNeighborsClassifier(n_neighbors= 1)
knn_model.fit(X_train, y_train)

# Save the trained model as a pickle string.
filename = '../model/tfidf_knn_model.pkl'
pickle.dump(knn_model,open(filename, 'wb'))
  
# Load the pickled model
knn_from_pickle = pickle.load(open(filename, 'rb'))
  
# Use the loaded pickled model to make predictions
knn_from_pickle.predict(X_test)

val_pred_knn = knn_from_pickle.predict(X_val)
print(classification_report(y_val, val_pred_knn))
print(confusion_matrix(y_val, val_pred_knn, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.25      0.25      0.25       152
           1       0.84      0.84      0.84       695

    accuracy                           0.73       847
   macro avg       0.54      0.54      0.54       847
weighted avg       0.73      0.73      0.73       847

[[ 38 114]
 [112 583]]


In [21]:
# save results to csv
predictions_knn = knn_from_pickle.predict(X_test)
result_knn = pd.DataFrame(zip(test_df["ID"], predictions_knn), columns=["ID", "Prediksi"])
result_knn.to_csv("../result/tfidf/result_knn.csv", index=False)

### 2. Naive Bayes

In [22]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Save the trained model as a pickle string.
filename = '../model/tfidf_nb_model.pkl'
pickle.dump(nb_model,open(filename, 'wb'))
  
# Load the pickled model
nb_from_pickle = pickle.load(open(filename, 'rb'))
  
# Use the loaded pickled model to make predictions
nb_from_pickle.predict(X_test)

val_pred_nb = nb_from_pickle.predict(X_val)
print(classification_report(y_val, val_pred_nb))
print(confusion_matrix(y_val, val_pred_nb, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.26      0.28      0.27       152
           1       0.84      0.82      0.83       695

    accuracy                           0.73       847
   macro avg       0.55      0.55      0.55       847
weighted avg       0.74      0.73      0.73       847

[[ 43 109]
 [123 572]]


In [23]:
# save results to csv
predictions_nb = nb_from_pickle.predict(X_test)
result_nb = pd.DataFrame(zip(test_df["ID"], predictions_nb), columns=["ID", "Prediksi"])
result_nb.to_csv("../result/tfidf/result_nb.csv", index=False)

### 3. SVM

In [24]:
svm_model = SVC(kernel="rbf")
svm_model.fit(X_train, y_train)

# Save the trained model as a pickle string.
filename = '../model/tfidf_svm_model.pkl'
pickle.dump(svm_model,open(filename, 'wb'))
  
# Load the pickled model
svm_from_pickle = pickle.load(open(filename, 'rb'))
  
# Use the loaded pickled model to make predictions
svm_from_pickle.predict(X_test)

val_pred_svm = svm_from_pickle.predict(X_val)
print(classification_report(y_val, val_pred_svm))
print(confusion_matrix(y_val, val_pred_svm, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.61      0.07      0.13       152
           1       0.83      0.99      0.90       695

    accuracy                           0.83       847
   macro avg       0.72      0.53      0.52       847
weighted avg       0.79      0.83      0.76       847

[[ 11 141]
 [  7 688]]


In [25]:
# save results to csv
predictions_svm = svm_from_pickle.predict(X_test)
results_svm = pd.DataFrame(zip(test_df["ID"], predictions_svm), columns=["ID", "Prediksi"])
results_svm.to_csv("../result/tfidf/result_svm.csv", index=False)