# Fake News Detection - N-gram Vectorization (Content)

## Data Preparation

In [1]:
import time

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import re
import nltk
import json
from nltk.stem import WordNetLemmatizer

### Import News Title and Content as Input and News Label as Output

In [2]:
start_time = time.time()

npz_title = np.load('Title_Data.npz',allow_pickle=True)
npz_content = np.load('Content_Data.npz',allow_pickle=True)

title = npz_title['inputs']
content = npz_content['inputs']
output = npz_title['targets']

title.shape, content.shape, output.shape

((9805,), (9805,), (9805,))

### Text Preprocessing
1. Remove Special Characters using Regular Expressions
2. Tokenize text
3. Word Normalization (Lemmatization)

In [3]:
lemmatizer = WordNetLemmatizer()

cleaned_title = []
cleaned_content = []

for i in range(title.shape[0]):
    title_sentence = re.sub('[^A-Za-z0-9 ]+', '', title[i])
    title_tokens = nltk.word_tokenize(title_sentence)
    title_lemmatized_word = [lemmatizer.lemmatize(word) for word in title_tokens]
    cleaned_title.append(" ".join(word.lower() for word in title_lemmatized_word))

for j in range(content.shape[0]):
    content_sentence = re.sub('[^A-Za-z0-9 ]+', '', content[j])
    content_tokens = nltk.word_tokenize(content_sentence)
    content_lemmatized_word = [lemmatizer.lemmatize(word) for word in content_tokens]
    cleaned_content.append(" ".join(word.lower() for word in content_lemmatized_word))

cleaned_title = np.asarray(cleaned_title, dtype=object)
cleaned_content = np.asarray(cleaned_content, dtype=object)

### Split Train and Test Data

In [4]:
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(cleaned_title, output, test_size = 0.2, random_state = 1)
X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(cleaned_content, output, test_size = 0.2, random_state = 1)

### Learn Vocabulary from Training Text and Vectorize the Training Texts

In [5]:
max_vocab_title = 15000
max_vocab_content = 30000

# Create TF-IDF of title and content
tfidf_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(1,3), max_features=max_vocab_title)
tfidf_content = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(1,3), max_features=max_vocab_content)

sparse_tfidf_train_title = tfidf_title.fit_transform(X_train_title)
sparse_tfidf_train_content = tfidf_content.fit_transform(X_train_content)

X_tfidf_train_title = sparse_tfidf_train_title.toarray()
X_tfidf_train_content = sparse_tfidf_train_content.toarray()

### Create and Train a Simple Neural Network Model with News Title

In [6]:
model_title = tf.keras.Sequential()
model_title.add(tf.keras.layers.Dense(64,input_shape=(max_vocab_title,),activation='relu'))
model_title.add(tf.keras.layers.Dropout(0.2))
model_title.add(tf.keras.layers.Dense(16,activation='relu'))
model_title.add(tf.keras.layers.Dropout(0.2))
model_title.add(tf.keras.layers.Dense(1,activation='relu'))
model_title.add(tf.keras.layers.Activation('sigmoid'))

model_title.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy', 'Recall'])

batch_size = 200
max_epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model_title.fit(X_tfidf_train_title, y_train_title, batch_size = batch_size, epochs = max_epochs, callbacks=[early_stopping], validation_split = 0.2 , shuffle=True, verbose=1)

Train on 6275 samples, validate on 1569 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x21a43d106d8>

### Create and Train a Simple Neural Network Model with News Content

In [7]:
model_content = tf.keras.Sequential()
model_content.add(tf.keras.layers.Dense(64,input_shape=(max_vocab_content,),activation='relu'))
model_content.add(tf.keras.layers.Dropout(0.2))
model_content.add(tf.keras.layers.Dense(16,activation='relu'))
model_content.add(tf.keras.layers.Dropout(0.2))
model_content.add(tf.keras.layers.Dense(1,activation='relu'))
model_content.add(tf.keras.layers.Activation('sigmoid'))

model_content.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy', 'Recall'])

batch_size = 200
max_epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model_content.fit(X_tfidf_train_content, y_train_content, batch_size = batch_size, epochs = max_epochs, callbacks=[early_stopping], validation_split = 0.2 , shuffle=True, verbose=1)

Train on 6275 samples, validate on 1569 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x21a2eead550>

### Convert Test Data into Similar Format to Feed into Neural Network Model Trained

In [8]:
sparse_tfidf_test_title = tfidf_title.transform(X_test_title)
sparse_tfidf_test_content = tfidf_content.transform(X_test_content)

X_tfidf_test_title = sparse_tfidf_test_title.toarray()
X_tfidf_test_content = sparse_tfidf_test_content.toarray()

### Test Performance on Neural Network Model (Accuracy)

In [9]:
model_title_loss, model_title_accuracy, model_title_recall = model_title.evaluate(X_tfidf_test_title, y_test_title)
model_content_loss, model_content_accuracy, model_content_recall = model_content.evaluate(X_tfidf_test_content, y_test_content)





In [10]:
model_title_loss, model_title_accuracy, model_title_recall

(0.620833043478266, 0.7786843, 0.8755102)

In [11]:
model_content_loss, model_content_accuracy, model_content_recall

(0.4646333352240417, 0.8969913, 0.98163265)

In [12]:
y_pred_title = model_title.predict_classes(X_tfidf_test_title)

con_mat_title = tf.math.confusion_matrix(labels=y_test_title, predictions=y_pred_title).numpy()

y_pred_content = model_content.predict_classes(X_tfidf_test_content)

con_mat_content = tf.math.confusion_matrix(labels=y_test_content, predictions=y_pred_content).numpy()

In [13]:
con_mat_title

array([[669, 312],
       [122, 858]])

In [14]:
con_mat_content

array([[797, 184],
       [ 18, 962]])

In [15]:
end_time = time.time()

time_elapsed = end_time - start_time
time_elapsed

76.78582525253296

In [17]:
tfidf_model_performance = {
                                "Title_Accuracy": str(model_title_accuracy),
                                "Title_Recall": str(model_title_recall),
                                "Content_Accuracy": str(model_content_accuracy),
                                "Content_Recall": str(model_content_recall),
                                "Time": str(time_elapsed)
                          }

with open('Model_TFIDF.json', 'w') as Model_TFIDF_file:
    json.dump(tfidf_model_performance, Model_TFIDF_file)