# Fake News Detection - Sequence Vectorization (Content)

## Data Preparation

In [1]:
import time

import tensorflow as tf
import tensorflow.keras.preprocessing.text as kpt
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import json
import re

import nltk
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

### Import News Content as Input and News Label as Output

In [2]:
start_time = time.time()

npz_content = np.load('Content_Data.npz',allow_pickle=True)

content = npz_content['inputs']
output = npz_content['targets']

content.shape[0], output.shape[0]

(9805, 9805)

### Text Preprocessing
1. Remove Special Characters using Regular Expressions
2. Tokenize text
3. Word Normalization (Lemmatization)
4. Remove Stopwords

In [3]:
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

cleaned_content = []

for j in range(content.shape[0]):
    content_sentence = re.sub('[^A-Za-z0-9 ]+', '', content[j])
    content_tokens = nltk.word_tokenize(content_sentence)
    content_lemmatized_word = [lemmatizer.lemmatize(word) for word in content_tokens]
    content_removed_stopwords = [word for word in content_lemmatized_word if word not in stopword]
    cleaned_content.append(" ".join(word.lower() for word in content_removed_stopwords))

cleaned_content = np.asarray(cleaned_content,dtype=object)

len(cleaned_content), len(output)

(9805, 9805)

### Split Train and Test Data

In [4]:
X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(cleaned_content, output, test_size = 0.2, random_state = 1)

### Feed News Content into Keras Tokenizer

In [5]:
max_vocab_content = 30000

tokenizer_content = Tokenizer(num_words = max_vocab_content)
tokenizer_content.fit_on_texts(X_train_content)

dictionary_content = tokenizer_content.word_index

### Convert Text to Word Sequence based on Dictionary (And Make All Texts The Same Length) 

In [6]:
def convert_text_to_index_array_content(text):
    return [dictionary_content[word] for word in kpt.text_to_word_sequence(text)]

allWordIndices_content = []

for text_content in X_train_content:
    wordIndices_content = convert_text_to_index_array_content(text_content)
    allWordIndices_content.append(wordIndices_content)

### Cast the Word Indices into Numpy Arrays (For Any Future Usage)

In [7]:
allWordIndices_content = np.asarray(allWordIndices_content)

### Create One-Hot Matrices From the Word Indices and Make the Output Categorical

In [8]:
train_content_x = tokenizer_content.sequences_to_matrix(allWordIndices_content, mode='binary')
train_content_y = tf.keras.utils.to_categorical(y_train_content, 2)

### Create and Train a Simple Neural Network Model with News Content

In [9]:
model_content = tf.keras.Sequential()

model_content.add(tf.keras.layers.Dense(128, input_shape=(max_vocab_content,), activation = 'relu'))
model_content.add(tf.keras.layers.Dropout(0.2))
model_content.add(tf.keras.layers.Dense(16,activation='relu'))
model_content.add(tf.keras.layers.Dropout(0.2))
model_content.add(tf.keras.layers.Dense(2,activation='relu'))
model_content.add(tf.keras.layers.Activation('sigmoid'))

model_content.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy', 'Recall'])

batch_size = 500
max_epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model_content.fit(train_content_x, train_content_y, batch_size = batch_size, epochs = max_epochs, callbacks=[early_stopping], validation_split = 0.2 , shuffle=True, verbose=1)

Train on 6275 samples, validate on 1569 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x1ead1782da0>

### Convert Test Data into Similar Format to Feed into Neural Network Model Trained for News Content

In [10]:
# create another Tokenizer for model testing 
tokenizer_test_content = Tokenizer(num_words = max_vocab_content)

# this utility makes sure that all the words in your input
# are registered in the dictionary
# before trying to turn them into a matrix.
def convert_text_to_index_array_content_test(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary_content:
            wordIndices.append(dictionary_content[word])
    return wordIndices

allWordIndices_test_content = []

for text_test in X_test_content:
    wordIndices_test = convert_text_to_index_array_content_test(text_test)
    allWordIndices_test_content.append(wordIndices_test)

# create one-hot matrices out of the indexed train_content_input
test_content_x = tokenizer_test_content.sequences_to_matrix(allWordIndices_test_content, mode='binary')

# treat the labels as categories
test_content_y = tf.keras.utils.to_categorical(y_test_content, 2)

### Test Performance on Neural Network Model (Accuracy)

In [11]:
model_content_loss, model_content_accuracy, model_content_recall = model_content.evaluate(test_content_x, test_content_y)



In [12]:
model_content_loss, model_content_accuracy, model_content_recall

(0.490866194812088, 0.8995411, 0.9403366)

In [13]:
y_pred_content = model_content.predict_classes(test_content_x)

con_mat_content = tf.math.confusion_matrix(labels=y_test_content, predictions=y_pred_content).numpy()

In [14]:
con_mat_content

array([[875,  92],
       [ 87, 907]])

In [15]:
end_time = time.time()

time_elapsed = end_time - start_time
time_elapsed

67.07493495941162

In [16]:
sequence_model_performance = {
                                "Content_Accuracy": str(model_content_accuracy),
                                "Content_Recall": str(model_content_recall),
                                "Time": str(time_elapsed)
                             }

with open('Model_Sequence_Content.json', 'w') as Model_Sequence_File:
    json.dump(sequence_model_performance, Model_Sequence_File)