In [None]:
# Dependencies

!pip install spacy==2.2.3
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4==4.9.1
!pip install textblob==0.15.3
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

In [48]:
import pandas as pd 
import numpy as np 
import nltk
import re

import gensim

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dropout, Dense, Embedding
from keras import Sequential
import preprocess_kgptalkie as ps
from preprocess import preprocess_data


In [26]:
# Using the 1st dataset

df_real = pd.read_csv('data/True.csv')
df_fake = pd.read_csv('data/Fake.csv')
df_real['real'] = 1
df_fake['real'] = 0
df_1 = pd.concat([df_fake, df_real], axis=0)
df_1.head()

Unnamed: 0,title,text,subject,date,real
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [27]:
# Using the 2nd dataset

df_2 = pd.read_csv('data/news_articles.csv')
df_2['real'] = df_2['label'].replace({'Real':1,'Fake':0})
df_2 = df_2.dropna(axis=0)
df_2 = pd.DataFrame(df_2[['title','text', 'real']])
df_2.head()

Unnamed: 0,title,text,real
0,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,1.0
1,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,1.0
2,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,1.0
3,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,1.0
4,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,1.0


In [24]:
# Creating a tokenizer with the preprocessed dataset

df = pd.read_csv("data/preprocessed_data.csv")
x = [row.split() for row in df.text.tolist()]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

In [30]:
# Defining Features after preprocessing

X_1 = preprocess_data(df_1, tokenizer=tokenizer)
X_2 = preprocess_data(df_2, tokenizer=tokenizer)
X = np.concatenate((X_1, X_2))
print(X_1.shape, X_2.shape, X.shape)

(44898, 1000) (2045, 1000) (46943, 1000)


In [32]:
# Defining output label for the dataset

Y_1 = df_1.real.values
Y_2 = df_2.real.values
Y = np.concatenate((Y_1, Y_2))
print(Y_1.shape, Y_2.shape, Y.shape)

(44898,) (2045,) (46943,)


In [35]:
# Word Embedding 
word2vec = gensim.models.Word2Vec(sentences=x, window=5, min_count = 1)
def get_weights(model):
    weights = np.zeros(((len(tokenizer.word_index) + 1), 100))

    for word, i in tokenizer.word_index.items():
        weights[i] = word2vec.wv[word]
    return weights
embedding_vectors = get_weights(word2vec)

In [36]:
model = Sequential([
    Embedding((len(tokenizer.word_index) + 1), output_dim = 100, weights=[embedding_vectors], input_length = 1000, trainable=False),
    LSTM(units = 128),
    Dropout(0.2),
    Dense(256),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         37537400  
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 256)               33024     
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 37,687,929
Trainable params: 150,529
Non-trainable params: 37,537,400
_________________________________________________________________


In [37]:
# Training the validation set 

x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [38]:
# Training the model 

model.fit(x_train, y_train, validation_split=0.3, epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x21de59a6af0>

In [39]:
predictions = (model.predict(x_test) >= 0.5).astype(int)
print('score:', accuracy_score(y_test, predictions))

score: 0.9816802999318337


In [46]:
model.save('weights.h5')

In [49]:
# saving an image of the model in the form of a plot

tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=False,
    show_trainable=False
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
