# Hoax Detection Using Traditional Machine Learning
## Dataset from Satria Data 2020 - Big Data Challenge

In [142]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from string import punctuation
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prinanda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# nltk.download('stopwords')

In [4]:
# Read Dataset
train_data = pd.read_excel("Dataset/training/train.xlsx")
test_data = pd.read_excel("Dataset/testing/test.xlsx")

In [191]:
train_data.head()

Unnamed: 0,ID,label,tanggal,judul,narasi,nama file gambar
0,71,1,2020-08-17 00:00:00,Pemakaian Masker Menyebabkan Penyakit Legionna...,A caller to a radio talk show recently shared ...,71.jpg
1,461,1,2020-07-17 00:00:00,Instruksi Gubernur Jateng tentang penilangan ...,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,461.png
2,495,1,2020-07-13 00:00:00,Foto Jim Rohn: Jokowi adalah presiden terbaik ...,Jokowi adalah presiden terbaik dlm sejarah ban...,495.png
3,550,1,2020-07-08 00:00:00,"ini bukan politik, tapi kenyataan Pak Jokowi b...","Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",550.png
4,681,1,2020-06-24 00:00:00,Foto Kadrun kalo lihat foto ini panas dingin,Kadrun kalo lihat foto ini panas dingin . .,681.jpg


In [6]:
test_data.head()

Unnamed: 0,ID,tanggal,judul,narasi,nama file gambar
0,238057,2020-07-13 00:00:00,Narasi Tito Karnavian Berideologi Komunis Kare...,TITO KARNIVAN ITU BERIDIOLOGI KOMUNIS DIA BISA...,238057.jpg
1,238158,2020-07-06 00:00:00,Anies: Seberat beratnya Pekerjaan Akan terasa ...,Seberat beratnya Pekerjaan Akan terasa ringan ...,238158.jpg
2,238865,2020-04-22 00:00:00,Hindu di india Melemparkan Patung Buatan Merek...,Hindu di india melemparkan patung buatan merek...,238865.jpg
3,248298,2019-10-22 00:00:00,RSCM Praktekkan Penyedotan Plug Vena/Saluran ...,Mulai Hari ini di RSCM mulai diPraktekkan Peny...,248298.jpg
4,255176,2020-05-01 00:00:00,Permohonan Kelonggaran Angsuran ke OJK,"Untuk sekedar info, Bagi anda yg punya ansuran...",255176.jpg


## Pre-Processing

### Tokenization

In [162]:
df_sample = train_data[["label","judul","narasi"]].head()

In [192]:
# define stopword list, indonesia
STOPWORDS = set(StopWordRemoverFactory().get_stop_words() + stopwords.words('english'))

# define list kata singkat
KATASINGKAT = {"dlm":"dalam", "gw":"saya", "yg":"yang", "lu":"kamu", "dkt":"dekat", "kalo":"kalau", "n":"and"}

In [193]:
def remove_kata_singkat(word):
    if word in list(KATASINGKAT.keys()):
        return KATASINGKAT.get(word)
    else:
        return word

In [194]:
# function of preprocessing
def normalize_word(row):
    # remove punctuation
    judul = re.sub(r'[^a-zA-Z\s]', '', row.judul, re.I|re.A)
    narasi = re.sub(r'[^a-zA-Z\s]', '', row.narasi, re.I|re.A)
    
    # case folding and remove kata singkat
    judul = " ".join([remove_kata_singkat(word.lower()).strip() for word in nltk.word_tokenize(row.judul)])
    narasi = " ".join([remove_kata_singkat(word.lower()).strip() for word in nltk.word_tokenize(row.narasi)])
    
    # remove stopword and number
    judul = " ".join([word for word in nltk.word_tokenize(judul) if word not in punctuation and word.isalpha() and word not in STOPWORDS])
    narasi = " ".join([word for word in nltk.word_tokenize(narasi) if word not in punctuation and word.isalpha() and word not in STOPWORDS])
    
    # final assignment
    row.judul = judul
    row.narasi = narasi
    
    return row

In [195]:
df_sample = df_sample.apply(normalize_word, axis=1)

In [196]:
df_sample

Unnamed: 0,label,judul,narasi
0,1,pemakaian masker menyebabkan penyakit legionna...,caller radio talk show recently shared wife ho...
1,1,instruksi gubernur jateng penilangan bermasker...,anggota grup sesuai instruksi gubernur jawa te...
2,1,foto jim rohn jokowi presiden terbaik sejarah ...,jokowi presiden terbaik sejarah bangsa indones...
3,1,bukan politik kenyataan pak jokowi berhasil me...,maaf bukan politik kenyataan pak jokowi berhas...
4,1,foto kadrun kalau lihat foto panas dingin,kadrun kalau lihat foto panas dingin


## Build and Train Model

### 1. Bag of Words Using Count Vectorizer

In [202]:
judul = df_sample.loc[:,"judul"]
narasi = df_sample.loc[:, "narasi"]
label = df_sample.loc[:,"label"]
judul_narasi = df_sample.judul + " " + df_sample.narasi

In [203]:
label

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [198]:
judul_narasi[0]

'pemakaian masker menyebabkan penyakit legionnaires caller radio talk show recently shared wife hospitalized told covid couple days left live doctor friend suggested tested legionnaires disease wore mask every day day long turns legionnaires disease moisture bacteria mask given antibiotics within two days better spikes covid really something else due mask induced infections'

In [204]:

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(judul_narasi)
cv_matrix = cv_matrix.toarray()

# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
training_df = pd.DataFrame(cv_matrix, columns=vocab)
training_df["Class"] = label
training_df

Unnamed: 0,agama,anggota,antibiotics,apps,assistance,baca,bacteria,balik,bangsa,bawa,...,tugas,turns,two,uang,umum,via,wife,within,wore,Class
0,0,0,1,0,0,0,1,0,0,0,...,0,1,1,0,0,0,1,1,1,1
1,0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1
2,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,1,1,0,1,0,1,...,0,0,0,3,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [205]:
# Split data
X_train =  training_df.drop("Class", axis=1)
y_train =  training_df["Class"]

In [207]:
y_train

0    1
1    1
2    1
3    1
4    1
Name: Class, dtype: int64