# Fake News Detection - N-gram Vectorization (Title)

## Data Preparation

In [1]:
import time

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import re
import nltk
import json
from nltk.stem import WordNetLemmatizer

### Import News Title as Input and News Label as Output

In [2]:
start_time = time.time()

npz_title = np.load('Title_Data.npz',allow_pickle=True)

title = npz_title['inputs']
output = npz_title['targets']

title.shape, output.shape

((9805,), (9805,))

### Text Preprocessing
1. Remove Special Characters using Regular Expressions
2. Tokenize text
3. Word Normalization (Lemmatization)

In [3]:
lemmatizer = WordNetLemmatizer()

cleaned_title = []

for i in range(title.shape[0]):
    title_sentence = re.sub('[^A-Za-z0-9 ]+', '', title[i])
    title_tokens = nltk.word_tokenize(title_sentence)
    title_lemmatized_word = [lemmatizer.lemmatize(word) for word in title_tokens]
    cleaned_title.append(" ".join(word.lower() for word in title_lemmatized_word))

cleaned_title = np.asarray(cleaned_title, dtype=object)

### Split Train and Test Data

In [4]:
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(cleaned_title, output, test_size = 0.2, random_state = 1)

### Learn Vocabulary from Training Text and Vectorize the Training Texts

In [5]:
max_vocab_title = 15000

# Create TF-IDF of title
tfidf_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(1,3), max_features=max_vocab_title)

sparse_tfidf_train_title = tfidf_title.fit_transform(X_train_title)

X_tfidf_train_title = sparse_tfidf_train_title.toarray()

### Create and Train a Simple Neural Network Model with News Title

In [6]:
model_title = tf.keras.Sequential()
model_title.add(tf.keras.layers.Dense(64,input_shape=(max_vocab_title,),activation='relu'))
model_title.add(tf.keras.layers.Dropout(0.2))
model_title.add(tf.keras.layers.Dense(16,activation='relu'))
model_title.add(tf.keras.layers.Dropout(0.2))
model_title.add(tf.keras.layers.Dense(1,activation='relu'))
model_title.add(tf.keras.layers.Activation('sigmoid'))

model_title.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy', 'Recall'])

batch_size = 200
max_epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model_title.fit(X_tfidf_train_title, y_train_title, batch_size = batch_size, epochs = max_epochs, callbacks=[early_stopping], validation_split = 0.2 , shuffle=True, verbose=1)

Train on 6275 samples, validate on 1569 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x2a707933240>

### Convert Test Data into Similar Format to Feed into Neural Network Model Trained

In [7]:
sparse_tfidf_test_title = tfidf_title.transform(X_test_title)

X_tfidf_test_title = sparse_tfidf_test_title.toarray()

### Test Performance on Neural Network Model (Accuracy)

In [8]:
model_title_loss, model_title_accuracy, model_title_recall = model_title.evaluate(X_tfidf_test_title, y_test_title)



In [9]:
model_title_loss, model_title_accuracy, model_title_recall

(0.6057588501830055, 0.773075, 0.8973843)

In [10]:
y_pred_title = model_title.predict_classes(X_tfidf_test_title)

con_mat_title = tf.math.confusion_matrix(labels=y_test_title, predictions=y_pred_title).numpy()

In [11]:
con_mat_title

array([[624, 343],
       [102, 892]])

In [12]:
end_time = time.time()

time_elapsed = end_time - start_time
time_elapsed

7.476693868637085

In [13]:
tfidf_model_performance = {
                                "Title_Accuracy": str(model_title_accuracy),
                                "Title_Recall": str(model_title_recall),
                                "Time": str(time_elapsed)
                          }

with open('Model_TFIDF_Title.json', 'w') as Model_TFIDF_file:
    json.dump(tfidf_model_performance, Model_TFIDF_file)