# 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 2. Importing the Dataset

In [2]:
data = pd.read_csv("news.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# 3. Preprocessing Dataset

In [3]:
data = data.drop(["Unnamed: 0"], axis=1)
data.head(5)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# 4. Data Encoding

In [4]:
le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])

# 5. Variables Setup

In [5]:
embedding_dim = 50
max_length = 54
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
training_size = 3000
test_portion = 0.1

# 6. Tokenization 

In [6]:
title = []
text = []
labels = []
for x in range(training_size):
    title.append(data['title'][x])
    text.append(data['text'][x])
    labels.append(data['label'][x])

tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index
vocab_size1 = len(word_index1)
sequences1 = tokenizer1.texts_to_sequences(title)
padded1 = pad_sequences(sequences1, padding=padding_type, truncating=trunc_type)

# 7. Splitting Data for Training and Testing

In [7]:
split = int(test_portion * training_size)
training_sequences1 = padded1[split:training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

# 8. Reshaping Data for LSTM


In [8]:
training_sequences1 = np.array(training_sequences1)
test_sequences1 = np.array(test_sequences1)

# 9. Generating Word Embedding

In [10]:
import urllib.request
import zipfile

# Download the file
url = "https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip"
urllib.request.urlretrieve(url, "glove.6B.zip")

# Unzip the file
with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
    zip_ref.extractall(".")


In [11]:
embedding_index = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
        
embedding_matrix = np.zeros((vocab_size1 + 1, embedding_dim))

for word, i in word_index1.items():
    if i < vocab_size1:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# 10. Model Architecture

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1 + 1, embedding_dim, input_length=max_length, 
                              weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



# 11. Training the Model

In [13]:
history = model.fit(
    training_sequences1, 
    np.array(training_labels), 
    epochs=50, 
    validation_data=(test_sequences1, np.array(test_labels)), 
    verbose=2
)

Epoch 1/50
85/85 - 9s - 101ms/step - accuracy: 0.6137 - loss: 0.6484 - val_accuracy: 0.6467 - val_loss: 0.5967
Epoch 2/50
85/85 - 1s - 13ms/step - accuracy: 0.7074 - loss: 0.5737 - val_accuracy: 0.6900 - val_loss: 0.5453
Epoch 3/50
85/85 - 1s - 12ms/step - accuracy: 0.7244 - loss: 0.5355 - val_accuracy: 0.7167 - val_loss: 0.5355
Epoch 4/50
85/85 - 1s - 12ms/step - accuracy: 0.7611 - loss: 0.4898 - val_accuracy: 0.7467 - val_loss: 0.4934
Epoch 5/50
85/85 - 1s - 12ms/step - accuracy: 0.7804 - loss: 0.4554 - val_accuracy: 0.7300 - val_loss: 0.5163
Epoch 6/50
85/85 - 1s - 12ms/step - accuracy: 0.8207 - loss: 0.4034 - val_accuracy: 0.7233 - val_loss: 0.5516
Epoch 7/50
85/85 - 1s - 14ms/step - accuracy: 0.8367 - loss: 0.3604 - val_accuracy: 0.7100 - val_loss: 0.5535
Epoch 8/50
85/85 - 1s - 15ms/step - accuracy: 0.8756 - loss: 0.3021 - val_accuracy: 0.7000 - val_loss: 0.6886
Epoch 9/50
85/85 - 1s - 14ms/step - accuracy: 0.8700 - loss: 0.3111 - val_accuracy: 0.7167 - val_loss: 0.5598
Epoch 10/

# 12. Sample Prediction

In [14]:
X = "Karry to go to France in gesture of sympathy"

sequences = tokenizer1.texts_to_sequences([X])
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")

This news is False
