# Fake news detection using stance

### Import Libraries

In [21]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from keybert import KeyBERT
from newsapi import NewsApiClient
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout, Input, concatenate
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model

In [22]:
# Load the two raw training CSVs into DataFrames.
# NOTE: the name `stance` is rebound to a NumPy label array in a later cell,
# so this DataFrame is only available until that cell runs.
body = pd.read_csv("train_bodies.csv")
stance = pd.read_csv("train_stances.csv")

In [7]:
# One-off preprocessing step, kept commented out: a nested loop over every
# (stance row, body row) pair that copies the matching 'articleBody' onto the
# stance row and then writes the joined table to data_combined.csv.
# NOTE(review): the data_combined.csv loaded below contains an 'Unnamed: 0'
# column, which `index=False` would not produce -- the file on disk may have
# been written by a different script; confirm before regenerating it.
# TODO: `stance.merge(body, on='Body ID', how='left')` should be an equivalent
# vectorized join (assuming 'Body ID' is unique in `body`) -- verify.
# from tqdm.notebook import tqdm
# count=0
# for i in tqdm(range(stance.shape[0])):
#     for j in range(body.shape[0]):
#         if body.loc[j,'Body ID']==stance.loc[i,'Body ID']:
#             stance.loc[i,'articleBody'] = body.loc[j,'articleBody']

# stance.to_csv('data_combined.csv',index=False)

In [8]:
# Load the pre-joined dataset (headline, body id, stance and article body per
# row) and preview the first rows.
data = pd.read_csv('data_combined.csv')
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [9]:
# Integer class ids used as training labels (same ordering as the label
# dictionary inside `predict`).
stance_to_id = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
data['stance_cat'] = data['Stance'].map(stance_to_id).astype(int)

# Show how many rows fall into each stance class.
data['Stance'].value_counts()

unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: Stance, dtype: int64

In [10]:
# Preview again to confirm the new 'stance_cat' column is present.
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,0
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,3
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,3
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",1


In [28]:
# Raw text as plain Python lists, ready for the tokenizers.
headlines = data['Headline'].tolist()
bodies = data['articleBody'].tolist()

# Integer class ids as an int32 array.
# NOTE: this rebinds `stance`, which earlier held the raw stances DataFrame.
stance = data['stance_cat'].to_numpy().astype('int32')

### Prepare Dataset for Training
Next, we standardize, tokenize, and vectorize the data using the `tf.keras.preprocessing.text.Tokenizer` class (followed by `pad_sequences` to obtain fixed-length inputs).

**Standardization** refers to preprocessing the text, typically to remove punctuation or HTML elements to simplify the dataset. **Tokenization** refers to splitting strings into tokens (for example, splitting a sentence into individual words by splitting on whitespace). **Vectorization** refers to converting tokens into numbers so they can be fed into a neural network. All of these tasks can be accomplished with the `Tokenizer`.

In [23]:
# Hyperparameters shared by the preprocessing and model cells.
# Passed as `num_words` to both tokenizers.
max_features = 10000
# Width of each embedding vector; matches the 100-d GloVe file loaded later.
EMBEDDING_DIM = 100
# Headlines are padded/truncated to this many tokens.
max_seq_length_head = 15
# Article bodies are padded/truncated to this many tokens.
max_seq_length_body = 40

In [24]:
# Fit one tokenizer per input stream so headlines and bodies get separate
# vocabularies.
headline_tokenizer = Tokenizer(num_words=max_features)
headline_tokenizer.fit_on_texts(headlines)
# Vocabulary size is taken from the full word_index, not from max_features.
# The +1 presumably accounts for index 0 being reserved for padding -- confirm.
head_vocab_size = len(headline_tokenizer.word_index) + 1

body_tokenizer = Tokenizer(num_words=max_features)
body_tokenizer.fit_on_texts(bodies)
body_vocab_size = len(body_tokenizer.word_index) + 1

### Save Vocabulary

In [50]:
# Persist both fitted tokenizers so inference code can reuse the exact same
# word -> index mapping. `with` guarantees each file is closed even if
# pickling raises part-way through.
# NOTE: only unpickle these files from a trusted location -- loading a pickle
# executes arbitrary code.
with open("headline_tokenizer.pickle", "wb") as file_to_store:
    pickle.dump(headline_tokenizer, file_to_store)

with open("body_tokenizer.pickle", "wb") as file_to_store:
    pickle.dump(body_tokenizer, file_to_store)

### Creating Sequences for both headline and body

In [25]:
# Headlines: words -> integer ids -> fixed-length rows. Padding and truncation
# both happen at the end of the sequence ('post').
encoded_docs_headline = headline_tokenizer.texts_to_sequences(headlines)
padded_docs_headline = pad_sequences(
    encoded_docs_headline,
    maxlen=max_seq_length_head,
    padding='post',
    truncating='post',
)

# Article bodies: same treatment with the body tokenizer and body length.
encoded_docs_body = body_tokenizer.texts_to_sequences(bodies)
padded_docs_body = pad_sequences(
    encoded_docs_body,
    maxlen=max_seq_length_body,
    padding='post',
    truncating='post',
)

Using Stanford's 100d GloVe to represent tokens

In [31]:
# Path to the GloVe vectors file (despite the name, this is a file, not a directory).
GLOVE_DIR = "glove.6B.100d.txt"

def setup_embedding_index(path=None):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated float coefficients.

    Args:
        path: File to read. Defaults to the module-level ``GLOVE_DIR``.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array.
    """
    if path is None:
        path = GLOVE_DIR

    embedding_index = {}
    # The context manager closes the file even if a malformed line raises.
    with open(path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embedding_index

# Load every GloVe vector into memory once; the lookup is reused when building
# both the headline and the body embedding matrices.
embeddings_index = setup_embedding_index()

In [32]:
# One row per headline-vocabulary index. Rows stay all-zero for words that
# have no entry in the GloVe lookup.
embedding_matrix_headline = np.zeros(shape=(head_vocab_size, EMBEDDING_DIM))

for word, i in headline_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix_headline[i] = embedding_vector

# Sanity check: the row width should equal EMBEDDING_DIM.
dims = embedding_matrix_headline.shape[1]
print(dims)

100


In [33]:
# One row per body-vocabulary index. Rows stay all-zero for words that have
# no entry in the GloVe lookup.
embedding_matrix_body = np.zeros(shape=(body_vocab_size, EMBEDDING_DIM))

for word, i in body_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix_body[i] = embedding_vector

# Sanity check: the row width should equal EMBEDDING_DIM.
dims = embedding_matrix_body.shape[1]
print(dims)

100


### Train Test Split

In [34]:
# Sequential 90/10 split: the first 90% of rows train the model, the rest are
# held out. Rows are taken in file order (no shuffling happens here).
split_size = int(0.9 * len(padded_docs_body))

headline_train, headline_test = padded_docs_headline[:split_size], padded_docs_headline[split_size:]
body_train, body_test = padded_docs_body[:split_size], padded_docs_body[split_size:]

# Labels as a column vector, one class id per row.
stance = stance.reshape(-1, 1)
labels = stance

train_labels, test_labels = labels[:split_size], labels[split_size:]

In [36]:
# Two-branch stance classifier: headline and body are embedded and encoded
# separately, then merged for a 4-way softmax.
# Sequence lengths and embedding width come from the config constants so the
# model cannot drift out of sync with the padding done during preprocessing.

# Input layer for headline (padded sequences of token ids)
input_headline = Input(shape=(max_seq_length_head,), name='input_headline')
embedding_headline = Embedding(input_dim=head_vocab_size, output_dim=EMBEDDING_DIM,
                               weights=[embedding_matrix_headline],
                               input_length=max_seq_length_head,
                               trainable=True)(input_headline)

# Input layer for body (padded sequences of token ids)
input_body = Input(shape=(max_seq_length_body,), name='input_body')
embedding_body = Embedding(input_dim=body_vocab_size, output_dim=EMBEDDING_DIM,
                           weights=[embedding_matrix_body],
                           input_length=max_seq_length_body,
                           trainable=True)(input_body)

# Create two parallel Bidirectional LSTM layers for the headline and body
lstm_head = Bidirectional(LSTM(64))(embedding_headline)
lstm_body = Bidirectional(LSTM(64))(embedding_body)
# Despite the variable name, the two encodings are concatenated, not added.
addition_layer = concatenate([lstm_head, lstm_body], axis=1)
dense = Dense(64, activation='relu')(addition_layer)

# Output layer with softmax activation: one probability per stance class
output = Dense(4, activation='softmax')(dense)

# Create the model
model_combined = Model(inputs=[input_headline, input_body], outputs=output)

# Compile the model; the sparse loss matches the integer (not one-hot) labels
model_combined.compile(optimizer='adam',
                       loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                       metrics=['accuracy'])

In [37]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model_combined.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_headline (InputLayer)    [(None, 15)]         0           []                               
                                                                                                  
 input_body (InputLayer)        [(None, 40)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 15, 100)      388000      ['input_headline[0][0]']         
                                                                                                  
 embedding_3 (Embedding)        (None, 40, 100)      2742800     ['input_body[0][0]']             
                                                                                              

In [56]:
# Render the architecture diagram to model.png (needs pydot and graphviz).
# Every option is spelled out so the figure does not depend on library defaults.
plot_model(
    model_combined, to_file='model.png',
    show_shapes=False, show_dtype=False, show_layer_names=True,
    rankdir='TB', expand_nested=False, dpi=96,
    layer_range=None, show_layer_activations=False,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [40]:
# Train on the first 90% of rows; the held-out 10% is used as validation data
# and reported after every epoch.
training_inputs = [headline_train, body_train]
validation_set = ([headline_test, body_test], test_labels)

model_combined.fit(
    x=training_inputs,
    y=train_labels,
    epochs=15,
    verbose=1,
    validation_data=validation_set,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x294740bdf10>

### Save and Load Model


In [49]:
# Persist the trained model, then reload it under the name used by the
# inference cells (`predict(..., perfect_model)` and `search`). Without this
# cell running, `perfect_model` is undefined on a fresh kernel.
save_model(model_combined, 'perfect_model')

perfect_model = load_model('perfect_model')



INFO:tensorflow:Assets written to: perfect_model\assets


INFO:tensorflow:Assets written to: perfect_model\assets


### Preprocess new input text

In [54]:
def predict(head, body, model):
    """Classify the stance of ``body`` towards ``head``.

    Both texts are tokenized with the module-level tokenizers and
    padded/truncated to the training sequence lengths before being fed to
    ``model``.

    Args:
        head: Headline or claim text. ``None`` is treated as an empty string.
        body: Article text to compare against. ``None`` is treated as an
            empty string.
        model: Two-input model (headline ids, body ids) returning one
            probability per stance class.

    Returns:
        One of ``"Agree"``, ``"Disagree"``, ``"Discuss"`` or ``"Unrelated"``.
    """
    # External sources may supply None for missing text; the tokenizer expects
    # strings, so normalise to "" instead of crashing.
    head_text = head or ""
    body_text = body or ""

    head_ids = pad_sequences(headline_tokenizer.texts_to_sequences([head_text]),
                             maxlen=max_seq_length_head,
                             padding='post', truncating='post')
    body_ids = pad_sequences(body_tokenizer.texts_to_sequences([body_text]),
                             maxlen=max_seq_length_body,
                             padding='post', truncating='post')

    probabilities = model.predict([head_ids, body_ids])

    # Same id -> label ordering as the 'stance_cat' mapping used for training.
    labels = {0: "Agree",
              1: "Disagree",
              2: "Discuss",
              3: "Unrelated"}

    # Exactly one sample was submitted, so read the argmax of its row.
    return labels[int(np.argmax(probabilities[0]))]

In [55]:
# Smoke test with a made-up headline/body pair.
# NOTE: requires `perfect_model` to already exist in the kernel
# (e.g. via load_model('perfect_model')).
predict("elon musk buys twitter", "hey", perfect_model)

'Unrelated'

In [227]:
# Keyword extractor with default settings; `search` uses it to turn a claim
# into a query phrase. The first instantiation downloads model files.
key_model = KeyBERT()

Downloading: 100%|██████████| 90.9M/90.9M [00:29<00:00, 3.10MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 28.0kB/s]
Downloading: 100%|██████████| 466k/466k [00:11<00:00, 39.7kB/s] 
Downloading: 100%|██████████| 350/350 [00:00<00:00, 87.2kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 4.39MB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 216kB/s]  


In [2]:
def search(claim=None, api_key=None, from_date='2022-04-28'):
    """Look up news coverage of ``claim`` and vote on whether it looks real.

    One key phrase is extracted from the claim with the module-level
    ``key_model``, NewsAPI is queried for matching English articles, and each
    article description is compared with the claim via ``predict`` (using the
    module-level ``perfect_model``). "Agree" and "Discuss" stances count as
    support; "Disagree" counts against.

    Args:
        claim: Statement to check. ``None``, non-string or blank values yield
            the "no evidence" result without any network call.
        api_key: NewsAPI key. Defaults to the ``NEWSAPI_KEY`` environment
            variable so that no credential is stored in the notebook.
        from_date: Oldest publication date, forwarded to NewsAPI as
            ``from_param``.

    Returns:
        Tuple ``(isReal, isFake, data)``. ``isReal`` is True only when
        supporting articles strictly outnumber disagreeing ones; ``isFake`` is
        its negation (so ties and "no evidence" are reported as fake).
        ``data`` is a list with one dict per article (keys: Title, Source,
        Decription, Link, Content, Stance).

    Raises:
        ValueError: if no API key is passed and ``NEWSAPI_KEY`` is not set.
    """
    import os  # local import keeps this cell self-contained

    support_votes = 0
    against_votes = 0
    evidence = []

    # Nothing to search for: same result the empty-string case always produced.
    if not isinstance(claim, str) or not claim.strip():
        return False, True, evidence

    if api_key is None:
        api_key = os.environ.get('NEWSAPI_KEY')
    if not api_key:
        raise ValueError(
            "No NewsAPI key: pass api_key=... or set the NEWSAPI_KEY environment variable"
        )

    try:
        key_words = key_model.extract_keywords(
            claim, top_n=1, keyphrase_ngram_range=(1, 5), stop_words='english'
        )
        # Fall back to the raw claim when no key phrase could be extracted.
        query = key_words[0][0] if key_words else claim

        newsapi = NewsApiClient(api_key=api_key)
        result = newsapi.get_everything(
            q=query, page_size=50, language='en', from_param=from_date
        )

        for article in result.get('articles') or []:
            # Descriptions may be missing; score those against an empty body.
            stance = predict(claim, article['description'] or "", perfect_model)

            # NOTE: the "Decription" key is misspelled but kept as-is, because
            # consumers of this function may already rely on it.
            evidence.append({"Title": article['title'],
                             "Source": article['source']["name"],
                             "Decription": article['description'],
                             "Link": article['url'],
                             "Content": article['content'],
                             "Stance": stance})

            if stance == "Agree" or stance == "Discuss":
                support_votes += 1
            elif stance == 'Disagree':
                against_votes += 1

    except OSError as e:
        # OSError covers the builtin ConnectionError and the IOError-derived
        # exceptions raised by HTTP client libraries on network failure.
        # Articles gathered before the failure are still returned.
        print(e)

    is_real = support_votes > against_votes
    return is_real, not is_real, evidence