In [3]:
# import
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

# For random forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# For MLP
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

import joblib
# For parallel processing
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

In [4]:
df = pd.read_csv('WELFake_Dataset.csv')

# for testing with 1000 rows
# df = pd.read_csv('WELFake_Dataset.csv', nrows=1000)

df.head()
df.shape

(72134, 4)

In [5]:
# Download NLTK
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Preprocessing

In [6]:
df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [7]:
df = df.dropna()
df = df.drop(columns=['Unnamed: 0'])
df.isna().sum()

title    0
text     0
label    0
dtype: int64

In [8]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [9]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    tokens = [word for word in tokens if word.lower() not in stop_words]
    # POS tagging
    tagged_tokens = pos_tag(tokens)
    # Lemmatize
    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if tag.startswith('N'): 
            lemmatized_tokens.append(lemmatizer.lemmatize(word, pos='n'))
        elif tag.startswith('V'):  
            lemmatized_tokens.append(lemmatizer.lemmatize(word, pos='v'))
        elif tag.startswith('R'): 
            lemmatized_tokens.append(lemmatizer.lemmatize(word, pos='r'))
        elif tag.startswith('J'): 
            lemmatized_tokens.append(lemmatizer.lemmatize(word, pos='a'))
        else:
            lemmatized_tokens.append(word)
    
    # Extract words and their POS tags
    return " ".join([f"{word}/{tag}" for word, tag in tagged_tokens])

In [11]:
# Pre-processing (parallel)
num_cores = multiprocessing.cpu_count()

def parallel_preprocess(text_series):
    return text_series.apply(preprocess_text)

tqdm.pandas()

# Chunk size. sesuain sama hardware. 200 makan kurang lebih 4 GB RAM. semakin besar semakin berat.
chunk_size = 200
# Process dataframe in chunks (so python don't eat your 12 gigs of RAM and brick your machine XDXD)
print("Pre-processing title column...")
for chunk_start in tqdm(range(0, len(df), chunk_size)):
    chunk_end = min(chunk_start + chunk_size, len(df))
    chunk = df['title'].iloc[chunk_start:chunk_end]
    processed_chunk = Parallel(n_jobs=num_cores)(delayed(preprocess_text)(text) for text in chunk)
    df.loc[chunk.index, 'preprocessed_title'] = processed_chunk
print("Pre-processing title column complete.")

print("Pre-processing text column...")
for chunk_start in tqdm(range(0, len(df), chunk_size)):
    chunk_end = min(chunk_start + chunk_size, len(df))
    chunk = df['text'].iloc[chunk_start:chunk_end]
    processed_chunk = Parallel(n_jobs=num_cores)(delayed(preprocess_text)(text) for text in chunk)
    df.loc[chunk.index, 'preprocessed_text'] = processed_chunk
print("Pre-processing text column complete.")


Pre-processing title column...


100%|██████████| 358/358 [00:35<00:00, 10.21it/s]


Pre-processing title column complete.
Pre-processing text column...


100%|██████████| 358/358 [03:16<00:00,  1.83it/s]

Pre-processing text column complete.





In [12]:
# Pre-processing (single)
#tqdm.pandas()
#df['processed_text'] = df['text'].progress_apply(preprocess_text)

In [13]:
df.head()

Unnamed: 0,title,text,label,preprocessed_title,preprocessed_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,law/NN enforcement/NN high/JJ alert/NN followi...,comment/NN expected/VBN barack/RB obama/JJ mem...
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,unbelievable/JJ !/. obama/NN ’/NNP attorney/NN...,",/, demonstrators/NNS gathered/VBD last/JJ nig..."
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"bobby/NN jindal/NN ,/, raised/VBD hindu/NN ,/,...",dozen/NN politically/RB active/JJ pastors/NNS ...
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,satan/JJ 2/CD :/: russia/NN unvelis/JJ image/N...,"rs-28/JJ sarmat/NN missile/NN ,/, dubbed/VBN s..."
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,time/NN !/. christian/JJ group/NN sues/NNS ama...,say/VB one/CD time/NN someone/NN sued/VBD sout...


In [14]:
df['combined'] = df['preprocessed_text'] + " " + df['preprocessed_title']
x_train, x_test, y_train, y_test = train_test_split(df['combined'], df['label'], test_size=0.2, random_state=42)
df['combined'].head()

0    comment/NN expected/VBN barack/RB obama/JJ mem...
2    ,/, demonstrators/NNS gathered/VBD last/JJ nig...
3    dozen/NN politically/RB active/JJ pastors/NNS ...
4    rs-28/JJ sarmat/NN missile/NN ,/, dubbed/VBN s...
5    say/VB one/CD time/NN someone/NN sued/VBD sout...
Name: combined, dtype: object

In [15]:
#TF IDF
vectorizer = TfidfVectorizer(max_df=0.7)
tfidf_train = vectorizer.fit_transform(x_train)
tfidf_test = vectorizer.transform(x_test)


# RandomForest Model

In [16]:
rf_model = RandomForestClassifier(n_jobs = -1, verbose = 2)
rf_model.fit(tfidf_train, y_train)

y_pred = rf_model.predict(tfidf_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.0s


building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s


Accuracy: 0.9484204640760414


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.4s finished


In [17]:
joblib.dump(rf_model, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [22]:
# GPU Check
from tensorflow.python.client import device_lib
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]
print(get_available_devices())

2.16.1
Num GPUs Available:  0
['/device:CPU:0']


# MLP

In [19]:
# Reset the variable (just in case)
x_train, x_test, y_train, y_test = train_test_split(df['combined'], df['label'], test_size=0.2, random_state=42)

# TF-IDF
vectorizer = TfidfVectorizer(max_df=0.7)
tfidf_train = vectorizer.fit_transform(x_train)
tfidf_test = vectorizer.transform(x_test)

In [20]:
num_classes = len(np.unique(y_train))
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

input_shape = tfidf_train.shape[1]

In [21]:
model = Sequential()

# Input layer
model.add(Input(shape=(input_shape,)))

# Dense layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(tfidf_train, y_train, epochs=3, batch_size=32, validation_data=(tfidf_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(tfidf_test, y_test)
print(f'Test Accuracy: {accuracy}')

Epoch 1/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 345ms/step - accuracy: 0.9104 - loss: 0.2181 - val_accuracy: 0.9708 - val_loss: 0.0787
Epoch 2/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 345ms/step - accuracy: 0.9912 - loss: 0.0282 - val_accuracy: 0.9676 - val_loss: 0.1118
Epoch 3/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 344ms/step - accuracy: 0.9971 - loss: 0.0093 - val_accuracy: 0.9728 - val_loss: 0.1404
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9735 - loss: 0.1341
Test Accuracy: 0.9728124141693115


In [23]:
joblib.dump(model, 'MLP_model.pkl')


['MLP_model.pkl']