In [1]:
import pandas as pd
# Read the raw news dataset from the notebook's working directory
df = pd.read_csv("./news.csv")
# Preview the first five rows to check that the columns were parsed as expected
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [2]:
# Build the modelling frame in a single expression so that `df` is only rebound
# once every new column has been computed. If a source column is missing (for
# example because this cell is executed a second time), the expression raises
# before `df` is touched, instead of first overwriting the labels.
df = df.assign(
    # Encode the label as an integer: 1 for FAKE, 0 for anything else (REAL)
    label=(df['label'] == 'FAKE').astype('int'),
    # Join title and body text into one document per row
    alltext=df['title'] + ". " + df['text'],
).reindex(columns=['alltext', 'label'])  # Keep only the text and label columns
df.head(5)  # Show first 5 rows in dataset

Unnamed: 0,alltext,label
0,You Can Smell Hillary’s Fear. Daniel Greenfiel...,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,1
2,Kerry to go to Paris in gesture of sympathy. U...,0
3,Bernie supporters on Twitter erupt in anger ag...,1
4,The Battle of New York: Why This Primary Matte...,0


In [3]:
# Discard documents shorter than 50 characters, then renumber the rows from 0
is_too_short = df['alltext'].str.len() < 50
df = df.loc[~is_too_short].reset_index(drop=True)


def convert_to_lowercase(text):
    """Return ``text`` with all of its cased characters lowercased.

    Args:
        text: The string to normalise.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered


# Normalise every document to lowercase
lowercased_text = df['alltext'].apply(convert_to_lowercase)
df['alltext'] = lowercased_text
# Report how many documents and labels remain after cleaning
n_samples = len(df['alltext'])
n_labels = len(df['label'])
print("Samples:", n_samples)
print("Labels:", n_labels, "\n")
# Show the opening 1000 characters of the first document as an example
first_document = df['alltext'].iloc[0]
print(first_document[:1000])
# Persist the processed dataset to CSV, leaving out the row index
df.to_csv('fakenews_processed.csv', index=False)

Samples: 6327
Labels: 6327 

you can smell hillary’s fear. daniel greenfield, a shillman journalism fellow at the freedom center, is a new york writer focusing on radical islam. 
in the final stretch of the election, hillary rodham clinton has gone to war with the fbi. 
the word “unprecedented” has been thrown around so often this election that it ought to be retired. but it’s still unprecedented for the nominee of a major political party to go war with the fbi. 
but that’s exactly what hillary and her people have done. coma patients just waking up now and watching an hour of cnn from their hospital beds would assume that fbi director james comey is hillary’s opponent in this election. 
the fbi is under attack by everyone from obama to cnn. hillary’s people have circulated a letter attacking comey. there are currently more media hit pieces lambasting him than targeting trump. it wouldn’t be too surprising if the clintons or their allies were to start running attack ads against the fbi.

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split the same on every run
train, test = train_test_split(df, random_state=42, test_size=0.30)
# Documents and labels for training
X_train, Y_train = train["alltext"].values, train["label"].values
# Documents and labels for testing
X_test, Y_test = test["alltext"].values, test["label"].values

In [5]:
import os
# Cut down the log messages shown by TensorFlow
# NOTE(review): presumably this has to run before tensorflow is imported - confirm
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [6]:
import tensorflow as tf
from keras.layers import TextVectorization
# Vocabulary cap: only the most frequent words are kept
MAX_VOCABULARY_WORDS = 5000
# Every document is padded or truncated to this many tokens
MAX_SEQUENCE_LENGTH = 200
# Dimensionality of the learned word embedding
EMBEDDING_DIM = 10
# Preprocessing layer that turns raw text into fixed-length integer sequences
vectorize_layer = TextVectorization(
    output_mode='int',  # One integer id per word in the vocabulary
    max_tokens=MAX_VOCABULARY_WORDS,  # Upper bound on the vocabulary size
    output_sequence_length=MAX_SEQUENCE_LENGTH,  # Fixed output length
)
# Learn the vocabulary from the training documents only
vectorize_layer.adapt(X_train)
vocabulary = vectorize_layer.get_vocabulary()  # Learned vocabulary terms
vocabulary_size = len(vocabulary)
print("Vocabulary size: " + str(vocabulary_size) + " words")




Vocabulary size: 5000 words


In [7]:
from keras import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
# Binary text classifier: raw string -> token ids -> embeddings -> 2 x BiLSTM -> sigmoid
model = Sequential(name="MyLSTM")
# Each sample is a single raw string; tokenisation happens inside the model
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# The sequence length is inferred from the vectoriser output, so the deprecated
# `input_length` argument is not passed.
model.add(Embedding(input_dim=MAX_VOCABULARY_WORDS,  # Size of the vocabulary
                    output_dim=EMBEDDING_DIM))  # Size of the word embedding
# return_sequences=True is required because another recurrent layer follows
model.add(Bidirectional(LSTM(16, return_sequences=True)))
# Bidirectional already builds the reversed copy of the wrapped layer itself, so
# `go_backwards` is left at its default; setting it only swaps the two directions.
model.add(Bidirectional(LSTM(16, dropout=0.2)))
# Single sigmoid unit: one probability per document (label 1 = FAKE)
model.add(Dense(1, activation='sigmoid'))
model.summary()  # Print model summary

Model: "MyLSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 200)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 200, 10)           50000     
                                                                 
 bidirectional (Bidirection  (None, 200, 32)           3456      
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 32)                6272      
 onal)                                                           
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                            

In [8]:
from keras.optimizers import Adam
# Training hyper-parameters
EPOCHS = 50  # Maximum number of passes over the training data
BATCH_SIZE = 64  # Samples per gradient update
LEARNING_RATE = 0.01  # Step size used by the optimiser
# Initialise the Adam optimiser with the learning rate above
opt = Adam(learning_rate=LEARNING_RATE)
# Compile the model: binary cross-entropy loss, accuracy reported as a metric
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [9]:
from keras.callbacks import EarlyStopping
# Stop training once validation accuracy has failed to improve for 4 epochs in a
# row, and restore the weights from the best epoch
early = EarlyStopping(
    monitor="val_accuracy",
    mode="auto",
    patience=4,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,  # Use 10% of training data for validation
    callbacks=[early],
)

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
