In [1]:
import numpy as np
import pandas as pd
import json
import csv
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

import pprint
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
tf.disable_eager_execution()





The dataset used is an online  News dataset that has two classes which are labelled 'FAKE' or 'REAL'. This dataset has different text that are mainly related to candidates for elections.

In [2]:
# Reading the data 
data = pd.read_csv("news.csv") 
data.head() 

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


After the dataset visualization, we have decided to drop the second column because the infromation in there wasn't relevant for our data preprocessing.

In [3]:
data = data.drop(["Unnamed: 0"], axis=1) 
data.head(5) 

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# encoding the labels 
le = preprocessing.LabelEncoder() 
le.fit(data['label']) 
data['label'] = le.transform(data['label']) 

In [5]:
embedding_dim = 50
max_length = 54
trunc_type = 'post'
padding_type = 'post'
#This is a token used to represent out-of-vocabulary (OOV) words. These are words that are not found in the vocabulary during tokenization. 
#Using <OOV> as the token allows the model to handle words that it hasn't seen before.
oov_tok = "<OOV>"
training_size = 3000
test_portion = .1

In [6]:
title = [] 
text = [] 
labels = [] 
for x in range(training_size): 
	title.append(data['title'][x]) 
	text.append(data['text'][x]) 
	labels.append(data['label'][x]) 

These parameters are commonly used in natural language processing (NLP) tasks, especially when working with text data for tasks like text classification or sentiment analysis.

In [29]:
#Building the embeddings
tokenizer1 = Tokenizer() 
tokenizer1.fit_on_texts(title) 
word_index1 = tokenizer1.word_index 
vocab_size1 = len(word_index1) 
sequences1 = tokenizer1.texts_to_sequences(title) 
padded1 = pad_sequences( 
	sequences1, padding=padding_type, truncating=trunc_type) 
split = int(test_portion * training_size) 
training_sequences1 = padded1[split:training_size] 
test_sequences1 = padded1[0:split] 
test_labels = labels[0:split] 
training_labels = labels[split:training_size] 

In [8]:
# Open the original GloVe vectors file
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Insert header line with vocabulary size and vector size
lines.insert(0, f'{len(lines)} 50\n')

# Save the modified file
with open('corrected_glove.6B.50d.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

In [9]:
from gensim.models import KeyedVectors

# Load GloVe vectors with explicit encoding
embeddings_index = {} 
with open('corrected_glove.6B.50d.txt', encoding='utf-8') as f: 
    for line in f: 
        values = line.split() 
        word = values[0] 
        coefs = np.asarray(values[1:], dtype='float32') 
        embeddings_index[word] = coefs
        
# Generating embeddings 
embeddings_matrix = np.zeros((vocab_size1+1, embedding_dim)) 
for word, i in word_index1.items(): 
	embedding_vector = embeddings_index.get(word) 
	if embedding_vector is not None: 
		embeddings_matrix[i] = embedding_vector 

In [10]:
#Buildidng architecture 
model = tf.keras.Sequential([ 
	tf.keras.layers.Embedding(vocab_size1+1, embedding_dim, 
							input_length=max_length, weights=[ 
								embeddings_matrix], 
							trainable=False), 
	tf.keras.layers.Dropout(0.2), 
	tf.keras.layers.Conv1D(64, 5, activation='relu'), 
	tf.keras.layers.MaxPooling1D(pool_size=4), 
	tf.keras.layers.LSTM(64), 
	tf.keras.layers.Dense(1, activation='sigmoid') 
]) 
model.compile(loss='binary_crossentropy', 
			optimizer='adam', metrics=['accuracy']) 
model.summary() 




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 50)            377600    
                                                                 
 dropout (Dropout)           (None, 54, 50)            0         
                                                                 
 conv1d (Conv1D)             (None, 50, 64)            16064     
                                                                 
 max_pooling1d (MaxPooling1  (None, 12, 64)            0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                     

In [11]:
num_epochs = 10

training_padded = np.array(training_sequences1) 
training_labels = np.array(training_labels) 
testing_padded = np.array(test_sequences1) 
testing_labels = np.array(test_labels) 

history = model.fit(training_padded, training_labels, 
					epochs=num_epochs, 
					validation_data=(testing_padded, 
									testing_labels), 
					verbose=2) 




Train on 2700 samples, validate on 300 samples
Epoch 1/10


  updates = self.state_updates


2700/2700 - 19s - loss: 0.6497 - accuracy: 0.6093 - val_loss: 0.5789 - val_accuracy: 0.6833 - 19s/epoch - 7ms/sample
Epoch 2/10
2700/2700 - 9s - loss: 0.5761 - accuracy: 0.6893 - val_loss: 0.5379 - val_accuracy: 0.6933 - 9s/epoch - 3ms/sample
Epoch 3/10
2700/2700 - 7s - loss: 0.5366 - accuracy: 0.7344 - val_loss: 0.5213 - val_accuracy: 0.7233 - 7s/epoch - 3ms/sample
Epoch 4/10
2700/2700 - 6s - loss: 0.4953 - accuracy: 0.7619 - val_loss: 0.4872 - val_accuracy: 0.7233 - 6s/epoch - 2ms/sample
Epoch 5/10
2700/2700 - 14s - loss: 0.4268 - accuracy: 0.8030 - val_loss: 0.4721 - val_accuracy: 0.7533 - 14s/epoch - 5ms/sample
Epoch 6/10
2700/2700 - 2s - loss: 0.3964 - accuracy: 0.8181 - val_loss: 0.5109 - val_accuracy: 0.7467 - 2s/epoch - 894us/sample
Epoch 7/10
2700/2700 - 2s - loss: 0.3276 - accuracy: 0.8585 - val_loss: 0.4921 - val_accuracy: 0.7633 - 2s/epoch - 659us/sample
Epoch 8/10
2700/2700 - 2s - loss: 0.2981 - accuracy: 0.8678 - val_loss: 0.4772 - val_accuracy: 0.7700 - 2s/epoch - 692us/

In [23]:
# sample text to check if fake or not 
X = "Trump is black"

# Predicting of the text
def predict_fake_news(text):
    # Tokenize and pad the input sequence
    sequences = tokenizer1.texts_to_sequences([X])[0]
    sequences = pad_sequences([sequences], maxlen=54, padding=padding_type, truncating=trunc_type)
    
    # Make prediction
    prediction = model.predict(sequences, verbose=0)[0][0]
    
    # Determine if the news is true or false based on prediction
    if prediction >= 0.5:
        return "This news is True"
    else:
        return "This news is false"

In [13]:
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
from tqdm.notebook import tqdm as notebook_tqdm

In [28]:
text_input = gr.Interface(fn=predict_fake_news, inputs="textbox", outputs="text", title="Fake News Detection")

text_input.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




  updates=self.state_updates,
Traceback (most recent call last):
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-packages\gradio\queueing.py", line 501, in call_prediction
    output = await route_utils.call_process_api(
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-packages\gradio\route_utils.py", line 253, in call_process_api
    output = await app.get_blocks().process_api(
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-packages\gradio\blocks.py", line 1695, in process_api
    result = await self.call_function(
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-packages\gradio\blocks.py", line 1235, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-packages\anyio\to_thread.py", line 28, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,
  File "C:\Users\Harb\anaconda3\envs\ComputerVision\lib\site-pack

In [25]:
demo = gr.Interface(fn=predict_fake_news, inputs=text_input, outputs=text, title="Fake News Detection").launch()
demo.launch(share=True)

TypeError: inputs must be a string, list, or Component, not Gradio Interface for: predict_fake_news
---------------------------------------
inputs:
|-textbox
outputs:
|-textbox

In [19]:
demo.launch(share=True)

NameError: name 'demo' is not defined