In [1]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D, LSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

In [2]:
import pandas as pd
df = pd.read_csv("./data.csv")

In [3]:
df.dropna(subset=['Body'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
df['Headline'] = df['Headline'].str.lower()
df['Body'] = df['Body'].str.lower()
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald trump,image copyright getty images\non sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,linklater's war veteran comedy speaks to moder...,"london (reuters) - “last flag flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,trump’s fight with corker jeopardizes his legi...,the feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,egypt's cheiron wins tie-up with pemex for mex...,mexico city (reuters) - egypt’s cheiron holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens 'snl' with vegas tribute,"country singer jason aldean, who was performin...",1


In [5]:
import string
def remove_punctuations(text):
    punctuations = set(string.punctuation)
    text = str(text)
    # return text.translate(str.maketrans('', '', punctuations))
    return " ".join([word for word in text.split() if word not in punctuations])

df['Headline'] = df['Headline'].apply(lambda x: remove_punctuations(x))
df['Body'] = df['Body'].apply(lambda x: remove_punctuations(x))

In [6]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

df['Headline'] = df['Headline'].apply(lambda x: remove_stopwords(x))
df['Body'] = df['Body'].apply(lambda x: remove_stopwords(x))

In [7]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

df['Headline'] = df['Headline'].apply(lambda x: remove_spl_chars(x))
df['Body'] = df['Body'].apply(lambda x: remove_spl_chars(x))

In [8]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')
def stem(text):
    stemmed_sentence = " ".join(stemmer.stem(word) for word in text.split())
    return stemmed_sentence

df['Headline'] = df['Headline'].apply(lambda x: stem(x))
df['Body'] = df['Body'].apply(lambda x: stem(x))

In [9]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

df['Headline'] = df['Headline'].apply(lambda x: remove_url(x))
df['Body'] = df['Body'].apply(lambda x: remove_url(x))

In [10]:
texts = []
labels = []
for i in range(3988):
    texts.append(df['Body'][i])
    labels.append(df['Label'][i])

print(len(texts))
len(labels)

3988


3988

In [11]:
max_words = 10000
max_seq_length = 1000
# the percentage of train test split to be applied
validation_split = 0.2
# the dimension of vectors to be used
embedding_dim = 100
# filter sizes of the different conv layers 
filter_sizes = [3,4,5]
num_filters = 512
embedding_dim = 100
# dropout probability
drop = 0.5
batch_size = 30
epochs = 2

In [12]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [13]:
word_index = tokenizer.word_index
len(word_index)

32035

In [14]:
word_index

{'s': 1,
 'said': 2,
 'one': 3,
 'year': 4,
 'time': 5,
 't': 6,
 'new': 7,
 'would': 8,
 'game': 9,
 'like': 10,
 'state': 11,
 'it': 12,
 'also': 13,
 'two': 14,
 'week': 15,
 'first': 16,
 'trump': 17,
 'say': 18,
 'go': 19,
 'peopl': 20,
 'i': 21,
 'get': 22,
 'play': 23,
 '1': 24,
 'us': 25,
 'report': 26,
 'make': 27,
 'news': 28,
 'last': 29,
 'use': 30,
 'mr': 31,
 'presid': 32,
 'back': 33,
 'world': 34,
 'could': 35,
 '2017': 36,
 'team': 37,
 'nation': 38,
 'take': 39,
 'com': 40,
 '2': 41,
 'work': 42,
 'photo': 43,
 'govern': 44,
 'day': 45,
 'run': 46,
 'stori': 47,
 'u': 48,
 'season': 49,
 'come': 50,
 'look': 51,
 'may': 52,
 'includ': 53,
 'we': 54,
 '3': 55,
 'right': 56,
 'even': 57,
 'that': 58,
 'mani': 59,
 'call': 60,
 'imag': 61,
 'countri': 62,
 'unit': 63,
 'content': 64,
 '5': 65,
 'start': 66,
 '4': 67,
 'the': 68,
 'sourc': 69,
 'compani': 70,
 'he': 71,
 'continu': 72,
 'player': 73,
 'reuter': 74,
 '10': 75,
 'show': 76,
 'three': 77,
 'way': 78,
 'need'

In [15]:
data = pad_sequences(sequences, maxlen=max_seq_length)
labels = to_categorical(np.asarray(labels))

In [37]:
a = "this is a car"
seq = tokenizer.texts_to_sequences([a])
d = pad_sequences(seq, maxlen=max_seq_length)

In [39]:
d.shape

(1, 1000)

In [42]:
d[:1]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [16]:
label_index = {'Fake': 1, 'Not_Fake': 0}

In [17]:
print("Shape of data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)

Shape of data tensor:  (3988, 1000)
Shape of label tensor:  (3988, 2)


In [18]:
type(data)

numpy.ndarray

In [19]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [20]:
print(x_train.shape)
x_val.shape

(3191, 1000)


(797, 1000)

In [21]:
x_train[0].shape

(1000,)

In [22]:
embeddings_idx = {}
f = open("./glove.6B.100d.txt", encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_idx[word] = coefs
f.close()

In [23]:
len(embeddings_idx)

400000

In [24]:
len(word_index)

32035

In [25]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_idx.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [26]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.13739   ,  0.77890998,  0.80053997, ..., -0.61676002,
         0.44703001, -0.27967   ],
       [-0.13128   , -0.45199999,  0.043399  , ..., -0.30526   ,
        -0.045495  ,  0.56509   ],
       ...,
       [-0.12560999, -0.020344  , -0.31240001, ...,  0.38492   ,
        -0.55392998, -0.29003999],
       [-0.017809  ,  0.74079001,  0.37839001, ...,  0.23705   ,
        -0.41218999, -0.090674  ],
       [ 0.17927   ,  0.23128   , -0.55498999, ...,  0.43808001,
         0.15075   , -0.59816998]])

In [27]:
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=len(word_index)+1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_seq_length,
                            trainable=False)

In [28]:
inputs = Input(shape=(max_seq_length, ), dtype='int32')
embedding = embedding_layer(inputs)
embedding.shape

TensorShape([None, 1000, 100])

In [29]:
reshaped_inp = Reshape((max_seq_length, embedding_dim, 1))(embedding)
reshaped_inp.shape

TensorShape([None, 1000, 100, 1])

In [30]:
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding="valid", kernel_initializer='normal',
                activation='relu')(reshaped_inp)
maxpool_1 = MaxPool2D(pool_size=(max_seq_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_1.shape


TensorShape([None, 1, 1, 512])

In [31]:
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding="valid", kernel_initializer='normal',
                activation='relu')(reshaped_inp)
maxpool_2 = MaxPool2D(pool_size=(max_seq_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_2)
maxpool_2.shape

TensorShape([None, 1, 1, 512])

In [32]:
conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding="valid", kernel_initializer='normal',
                activation='relu')(reshaped_inp)
maxpool_3 = MaxPool2D(pool_size=(max_seq_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_3)
maxpool_3.shape

TensorShape([None, 1, 1, 512])

In [33]:
concatenated_tensor = Concatenate(axis=1)([maxpool_1, maxpool_2, maxpool_3])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='sigmoid')(dropout)

In [34]:
output.shape

TensorShape([None, 2])

In [35]:
model = Model(inputs=inputs, outputs=output)
adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1000)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1000, 100)            3203600   ['input_1[0][0]']             
                                                                                                  
 reshape (Reshape)           (None, 1000, 100, 1)         0         ['embedding[0][0]']           
                                                                                                  
 conv2d (Conv2D)             (None, 998, 1, 512)          154112    ['reshape[0][0]']             
                                                                                              

In [43]:
print("Traning Model...")
model.fit(x_train, y_train, batch_size=batch_size, epochs=5, verbose=1, validation_data=(x_val, y_val))

Traning Model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x24188e4ca30>

In [None]:
x_train.shape

(3191, 1000)

In [48]:
def RNN():
    inputs = Input(name='inputs',shape=[max_seq_length])
    layer = Embedding(max_words,100,input_length=max_seq_length)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(2,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [49]:
from keras.optimizers import RMSprop

model_lstm = RNN()
model_lstm.summary()
model_lstm.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 1000)]            0         
                                                                 
 embedding_2 (Embedding)     (None, 1000, 100)         1000000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 2)                 514 

In [50]:
from keras.callbacks import EarlyStopping

model_lstm.fit(x_train,y_train,batch_size=128,epochs=3,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2418b07b7f0>

In [None]:
x_train

array([[   0,    0,    0, ...,  457, 1503,   36],
       [   0,    0,    0, ...,  569, 2133, 3556],
       [   0,    0,    0, ...,  187,    9,  408],
       ...,
       [   0,    0,    0, ...,  768,  487,  888],
       [   0,    0,    0, ...,  119,   87,  410],
       [   0,    0,    0, ...,   51,   10,   56]])

In [51]:
model_lstm_json = model_lstm.to_json()
with open("model_lstm_json.json", "w") as json_file:
    json_file.write(model_lstm_json)
# serialize weights to HDF5
model.save_weights("model_lstm.h5")
print("Saved model to disk")

Saved model to disk


In [55]:
model_lstm.predict(x_train)



array([[0.01096675, 0.9890069 ],
       [0.00440007, 0.99562013],
       [0.00657555, 0.9935213 ],
       ...,
       [0.9220456 , 0.07908832],
       [0.9452604 , 0.05582776],
       [0.9892207 , 0.01163793]], dtype=float32)