In [1]:
# LSTM Fake News Classification Model
import numpy as np
import pandas as pd
import tensorflow as tf
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold

In [2]:
# If we want to fix the random seed for reproducibility of our results, uncomment the lines below
# seed = 0
# np.random.seed(seed)
# tf.random.set_seed(0)

In [3]:
# Fetch the tab-separated training data from GitHub. The file has no header row,
# and quoting=3 (csv.QUOTE_NONE) keeps quote characters inside tweets as literal text.
dataset_location = 'https://raw.githubusercontent.com/SeunAjao/FakeNewsClassifier/main/charlie.tsv'
data_set = pd.read_csv(
    dataset_location,
    header=None,
    sep='\t',
    quoting=3,
)
# Column 0 holds the tweet text, column 1 the 0/1 label.
X, Y = data_set.iloc[:, 0], data_set.iloc[:, 1]


In [4]:
# Check that the expected number of rows was imported. There should be exactly two columns (text, label).
print(data_set.shape)

(2067, 2)


In [5]:
# Preview the loaded frame; pandas abbreviates the printout to the first and last rows.
print(data_set)

                                                      0  1
0     Breaking: At least 10 dead, 5 injured after tO...  1
1     France: 10 people dead after shooting at HQ of...  1
2     Ten killed in shooting at headquarters of Fren...  1
3     BREAKING: 10 dead in shooting at headquarters ...  1
4     Reuters: 10 people shot dead at headquarters o...  1
...                                                 ... ..
2062  Some hostages seen leaving Paris kosher market...  0
2063  Gunman holding hostages at kosher supermarket ...  0
2064  Respect to the French Police #JeSuisCharliehtt...  0
2065  MORE: Police official: Suspects in Charlie Heb...  0
2066  #BREAKING - Both #CharlieHebdo suspects killed...  0

[2067 rows x 2 columns]


In [6]:
# Tokenize the training texts and turn each one into a sequence of integer word ids.
# top_words is the vocabulary limit handed to the Tokenizer as num_words; the same
# value is reused later as the Embedding layer's input dimension.
# NOTE: X is still the raw text column at this point. A later cell rebinds X to the
# padded array, so this cell must not be re-run after that without reloading the data.
top_words = 50
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(X)
sequences_train = tokenizer.texts_to_sequences(X)

In [7]:
# Convert the evaluation texts to integer sequences with the tokenizer that was
# already fitted above. The tokenizer must not be re-fitted here: fitting again
# on the same texts only double-counts every word, and fitting on genuinely
# held-out text would leak test vocabulary into the preprocessing.
# NOTE: this notebook has no separate test corpus, so X (the full dataset) is
# reused and sequences_test matches sequences_train.
sequences_test = tokenizer.texts_to_sequences(X)

In [8]:
#word_index_train = tokenizer.word_index_train
#print('Found %s unique tokens.' % len(word_index_train))
# Y = Y.values.reshape(2600,)
# print(Y.shape)

In [9]:
# Pad or truncate every tokenized tweet to a fixed length of 300 tokens.
# X is rebound here from the raw text column to the padded integer array.
max_tweet_length = 300
X = pad_sequences(sequences_train, maxlen=max_tweet_length)

In [10]:
# Accumulates one accuracy score (in percent) per evaluated fold
cvscores = list()
# 10-fold stratified cross-validation test harness. Passing random_state=seed
# (see the commented-out seed cell) would make the shuffled folds reproducible.
kfold = StratifiedKFold(shuffle=True, n_splits=10)

In [11]:
# Materialise the stratified splits once. The previous loop only built (and
# immediately discarded) an Embedding model for every fold, so its sole lasting
# effect was leaving the indices of the final fold in `train` / `test`.
folds = list(kfold.split(X, Y))
# The cells below build, fit and evaluate a single model, so pick that final
# fold explicitly instead of relying on leaked loop variables.
# NOTE(review): for a genuine 10-fold estimate, the model construction, fit and
# evaluate steps must run once per (train, test) pair in `folds`, appending each
# accuracy to `cvscores`.
train, test = folds[-1]

In [12]:
	# Start from a fresh, empty Sequential model; each token will be embedded as a 32-dimensional vector
	model = Sequential()
	embedding_vector_length = 32

In [13]:
# First layer: map each token id (vocabulary capped at top_words) of a padded
# tweet to a trainable vector of size embedding_vector_length.
model.add(
    Embedding(
        input_dim=top_words,
        output_dim=embedding_vector_length,
        input_length=max_tweet_length,
    )
)


In [14]:
# Let's create our evaluation metrics for our model within Keras

def precision(y_true, y_pred):
    """Model precision: TP / (TP + FP).

    Predictions are clipped to [0, 1] and rounded, so a score counts as a
    positive prediction once it rounds to 1. The backend epsilon in the
    denominator avoids a division by zero when nothing is predicted positive.

    NOTE(review): when passed as a plain function in ``metrics=[...]`` Keras
    appears to evaluate this per batch and average the batch values, so the
    reported figure can differ from the dataset-wide precision - confirm.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Model recall: TP / (TP + FN).

    A sample counts as a true positive when the clipped product of label and
    prediction rounds to 1. The backend epsilon in the denominator avoids a
    division by zero when the labels contain no positives.

    NOTE(review): when passed as a plain function in ``metrics=[...]`` Keras
    appears to evaluate this per batch and average the batch values, so the
    reported figure can differ from the dataset-wide recall - confirm.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def fmeasure(y_true, y_pred):
    """Model F-measure: 2 TP / (2 TP + FP + FN).

    Computed as the harmonic mean of ``precision`` and ``recall``; the backend
    epsilon keeps the denominator non-zero when both are 0.
    """
    prec, rec = precision(y_true, y_pred), recall(y_true, y_pred)
    harmonic_numerator = 2 * (prec * rec)
    return harmonic_numerator / (prec + rec + K.epsilon())



In [15]:
# Finish the architecture: a 100-unit LSTM over the embedded tweet, followed by a
# single sigmoid unit that outputs the probability of the positive class (label 1).
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
# Compile the model. Binary cross-entropy matches the single sigmoid output; the
# metrics are reported after the loss, in the order listed here.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, fmeasure])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 300, 32)           1600      
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 54901 (214.46 KB)
Trainable params: 54901 (214.46 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [16]:
# Fit the model for 20 epochs on the current `train` indices. You may train for
# longer if it keeps improving, but remember to watch for overfitting.
# Keras holds out no validation data by default (validation_split=0.0); here the
# last 30% of the training samples are set aside for per-epoch validation (a 70:30 split).

model.fit(
    X[train],
    Y[train],
    epochs=20,
    batch_size=64,
    validation_split=0.3,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x79db364d1ea0>

In [17]:

# Split the training data into 80% and 20% portions with random state for reproducibility
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) # 80% training and 20% test

In [18]:
# Checking the dimension of your output from above
#print(X_train.shape)
#print(X_test.shape)
#print(Y_train.shape)
#print(Y_test.shape)

In [19]:
# You can check model performance and see what your results are
# Train the model using the 80:20 train-split
#clf = model.fit(X_train, Y_train, epochs=0, batch_size=64)

In [20]:
# Performance here will vary from the CV option we are adopting from the lab...possibly lower
# Predict the response for test dataset
#Y_pred = model.predict(X_test)


In [21]:
# Final evaluation of the model on the held-out `test` indices.
# `scores` comes back as [loss, accuracy, precision, recall, fmeasure]:
# the loss first, then the metrics in the order they were passed to compile().
scores = model.evaluate(X[test], Y[test])



In [22]:
# Shape of the padded evaluation inputs: (samples in the test fold, max_tweet_length)
print(X[test].shape)


(206, 300)


In [23]:
# Shape of the padded training inputs: (samples in the training fold, max_tweet_length)
print(X[train].shape)

(1861, 300)


In [24]:
# View the evaluation scores of your model.
# scores[0] is the loss, so the reported metrics start at index 1.

metric_formats = ("Accuracy: %.2f%%", "Precision: %.2f%%", "Recall: %.2f%%", "Fmeasure: %.2f%%")
for score_position, metric_format in enumerate(metric_formats, start=1):
    print(metric_format % (scores[score_position] * 100))
print(scores)

# Your more reliable cross validation accuracy: mean (+/- std) over every recorded run.
# NOTE: each execution of this cell appends another entry to cvscores.
cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))




Accuracy: 81.55%
Precision: 26.19%
Recall: 19.02%
Fmeasure: 21.71%
[0.5537721514701843, 0.8155339956283569, 0.2619047462940216, 0.19024726748466492, 0.21714285016059875]
81.55% (+/- 0.00%)
