In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk. stem import WordNetLemmatizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
# from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

import pickle

%matplotlib inline

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [2]:

with open("clean_fake_real_data.pkl", 'rb') as picklefile: 
    df = pickle.load(picklefile)

In [3]:
df.head()

Unnamed: 0,title,content,publication,type
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


In [4]:
ps = WordNetLemmatizer()

# text preprocessing
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', str(df['content'][i])) # removing sepcial characters and numbers
    review = review.lower() # lowering the text
    review = review.split() 
    # removing stopwords and lemmatization
    review = [ps.lemmatize(word) for word in review if not word in set(stopwords.words('english'))] 
    review = ' '.join(review)
    df['content'][i] = review
    corpus.append(review)

all_words = []
for text in corpus:
    all_words.append(text.split())
    
words = [j for i in all_words for j in i]

from nltk.probability import FreqDist
fdist = FreqDist(words)

In [5]:
df.head()

Unnamed: 0,title,content,publication,type
0,Muslims BUSTED: They Stole Millions In Gov’t B...,print pay back money plus interest entire fami...,100percentfedup,fake
1,Re: Why Did Attorney General Loretta Lynch Ple...,attorney general loretta lynch plead fifth bar...,100percentfedup,fake
2,BREAKING: Weiner Cooperating With FBI On Hilla...,red state fox news sunday reported morning ant...,100percentfedup,fake
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,email kayla mueller prisoner tortured isi chan...,100percentfedup,fake
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,email healthcare reform make america great sin...,100percentfedup,fake


In [6]:
y = pd.get_dummies(df['type']) # fake, real:two column one-hot encoding
y = np.array(y)

In [7]:
y

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [8]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each news.
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
# tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['content'].values)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Found 155186 unique tokens.


In [9]:
X = tokenizer.texts_to_sequences(df['content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of X:', X.shape)

Shape of X: (28665, 250)


In [10]:
Y = pd.get_dummies(df['type']).values
print('Shape of Y:', Y.shape)

Shape of Y: (28665, 2)


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(22932, 250) (22932, 2)
(5733, 250) (5733, 2)


In [12]:
from keras.layers.normalization import BatchNormalization

In [24]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
# model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 5,117,506
Trainable params: 5,117,506
Non-trainable params: 0
_________________________________________________________________


In [25]:
es = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3, verbose=0, mode='auto')

model.fit(X_train, Y_train, batch_size=64, epochs=100, validation_data=(X_test, Y_test), callbacks=[es])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22932 samples, validate on 5733 samples
Epoch 1/100
Epoch 2/100
Epoch 4/100
Epoch 5/100


<keras.callbacks.callbacks.History at 0x7fc2ee510e48>

In [26]:
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print("train-acc = " + str(accuracy_score(np.argmax(Y_train, axis=1), np.argmax(train_pred, axis=1))))
print("test-acc = " + str(accuracy_score(np.argmax(Y_test, axis=1), np.argmax(test_pred, axis=1))))

train-acc = 0.9954212454212454
test-acc = 0.9017966160823304
