In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/fake-and-real-news-dataset/Fake.csv
/kaggle/input/fake-and-real-news-dataset/True.csv


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Flatten
from keras.models import Model
from keras.initializers import Constant
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


In [3]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
df_real = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
df_fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

In [5]:
df_real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [7]:
df_real.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [8]:
df_fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [9]:
df_real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [10]:
df_real['text'][0]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [11]:
df_real.shape

(21417, 4)

In [12]:
df_fake.shape

(23481, 4)

In [13]:
df_real['sentiment'] = 1
df_fake['sentiment'] = 0

In [14]:
df_real.shape

(21417, 5)

In [15]:
import re

In [16]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [17]:
def get_cleaned_data(input_data, mode='df'):
    stop = stopwords.words('english')
    
    input_df = ''
    
    if mode != 'df':
        input_df = pd.DataFrame([input_data], columns=['text'])
    else:
        input_df = input_data
        
    #lowercase the text
    input_df['text'] = input_df['text'].str.lower()
    
    input_df['text'] = input_df['text'].apply(lambda elem: decontracted(elem))
    
    #remove special characters
    input_df['text'] = input_df['text'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    
    # remove numbers
    input_df['text'] = input_df['text'].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    #remove stopwords
    input_df['text'] = input_df['text'].apply(lambda x: ' '.join([word.strip() for word in x.split() if word not in (stop)]))
    
   
    input_df['text'] = input_df['text'].apply(lambda words: (wordnet_lemmatizer.lemmatize(words)))

    
    return input_df

In [18]:
df_real = get_cleaned_data(df_real)
df_fake = get_cleaned_data(df_fake)

In [19]:
df_real.head()

Unnamed: 0,title,text,subject,date,sentiment
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters head conservative republica...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,washington reuters transgender people allowed ...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters special counsel investigati...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,seattlewashington reuters president donald tru...,politicsNews,"December 29, 2017",1


In [20]:
df_real['text'][0]

'washington reuters head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection trump administration already willing say going increase nondefense discretionary spending percent meadows chairman small influential house freedom caucus said program democrats saying thats e

In [21]:
data=pd.concat([df_real,df_fake],axis=0,ignore_index=True)

In [22]:
data.tail(10)

Unnamed: 0,title,text,subject,date,sentiment
44888,Seven Iranians freed in the prisoner swap have...,st century wire says week historic internation...,Middle-east,"January 20, 2016",0
44889,#Hashtag Hell & The Fake Left,dady chery gilbert mercierall writers desire r...,Middle-east,"January 19, 2016",0
44890,Astroturfing: Journalist Reveals Brainwashing ...,vic bishop waking timesour reality carefully c...,Middle-east,"January 19, 2016",0
44891,The New American Century: An Era of Fraud,paul craig robertsin last years th century fra...,Middle-east,"January 19, 2016",0
44892,Hillary Clinton: ‘Israel First’ (and no peace ...,robert fantina counterpunchalthough united sta...,Middle-east,"January 18, 2016",0
44893,McPain: John McCain Furious That Iran Treated ...,st century wire says wire reported earlier wee...,Middle-east,"January 16, 2016",0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,st century wire says familiar theme whenever d...,Middle-east,"January 16, 2016",0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,patrick henningsen st century wireremember oba...,Middle-east,"January 15, 2016",0
44896,How to Blow $700 Million: Al Jazeera America F...,st century wire says al jazeera america go his...,Middle-east,"January 14, 2016",0
44897,10 U.S. Navy Sailors Held by Iranian Military ...,st century wire says wire predicted new year l...,Middle-east,"January 12, 2016",0


In [23]:
data.head()

Unnamed: 0,title,text,subject,date,sentiment
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters head conservative republica...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,washington reuters transgender people allowed ...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters special counsel investigati...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,seattlewashington reuters president donald tru...,politicsNews,"December 29, 2017",1


In [24]:
data.isnull().sum()

title        0
text         0
subject      0
date         0
sentiment    0
dtype: int64

In [25]:
data.shape

(44898, 5)

In [26]:
g=[]

In [27]:
for i in data['text']:
    g.append(i)

In [28]:
maxl = max([len(s) for s in g])
print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 38698


In [29]:
X=data['text']
Y=data['sentiment']

In [30]:
tokenizer=Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 202165 unique tokens.


# Padding Sequences

As the maxlength is very big we will be selecting 2000 as our maxlength..

In [31]:
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, maxlen=2000)

In [32]:
Y = pd.get_dummies(data['sentiment'],columns=data["sentiment"]).values
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [33]:
Y.shape

(44898, 2)

In [34]:
X.shape

(44898, 2000)

# Splitting the Dataset..

In [35]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [36]:
X_train

array([[   0,    0,    0, ..., 1384,   80, 6798],
       [   0,    0,    0, ...,  537,  174,  326],
       [   0,    0,    0, ...,  165,  966,   80],
       ...,
       [   0,    0,    0, ..., 1743,  533, 7526],
       [   0,    0,    0, ..., 1739,   94, 2156],
       [   0,    0,    0, ..., 1401, 8084,  182]], dtype=int32)

# Now importing the pretrained embedding index from Google index..

In [37]:
path='/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

In [38]:
 wv_from_bin = KeyedVectors.load_word2vec_format(path, binary=True, limit=500000) 
  #extracting word vectors from google news vector
 embeddings_index = {}
 for word, vector in zip(wv_from_bin.vocab, wv_from_bin.vectors):
      coefs = np.asarray(vector, dtype='float32')
      embeddings_index[word] = coefs

In [39]:
print('Found %s word vectors.' % len(embeddings_index))

Found 500000 word vectors.


In [40]:
vocab_size = len(tokenizer.word_index) + 1

In [41]:
print(vocab_size)

202166


In [42]:
# embedding_matrix = np.zeros((vocab_size, 300))
# for word, i in word_index.items():
#     try:
#         embedding_vector = embeddings_index[word]
#         embedding_matrix[i] = embedding_vector
#     except KeyError:
#         embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),300)

In [43]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [44]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00909424, -0.04418945,  0.09960938, ...,  0.14453125,
         0.18066406, -0.08691406],
       [-0.07910156,  0.12158203, -0.00842285, ..., -0.39257812,
         0.07763672,  0.27148438],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# Model-->

As word2vec has 300 dimensions do we are choosing output dimension as 300 units..

In [45]:
 model = Sequential()

#Non-trainable embeddidng layer
model.add(Embedding(vocab_size, output_dim=300, weights=[embedding_matrix], input_length=2000, trainable=False))
    
model.add(LSTM(units=128 , return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(units=64))
model.add(Dropout(0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(2, activation='sigmoid'))

In [46]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 300)         60649800  
_________________________________________________________________
lstm_1 (LSTM)                (None, 2000, 128)         219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 2000, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                

In [47]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [48]:
model.fit(X_train,Y_train,batch_size = 256 , validation_data = (X_test,Y_test) , epochs = 5)

Train on 35918 samples, validate on 8980 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f9f20077e10>

In [53]:
# prediction = model.predict_classes(X_test)
# cf_matrix = confusion_matrix(Y_test,prediction)
# sns.heatmap(cf_matrix, annot=True, fmt='g', xticklabels = ['Fake','Real'] , yticklabels = ['Fake','Real'])

In [67]:
def get_pred_output(text_to_check):
    sequences = tokenizer.texts_to_sequences([text_to_check])
    data = pad_sequences(sequences, maxlen=2000)
    predicted_val = model.predict_classes(data)
#     predicted_val = model.predict(data)    
#     if predicted_val.max() > 0.7:
#         output = 1
#     else:
#          output = 0
    
    return predicted_val

In [84]:
unseen_real_data = """
Twenty-three more people have tested positive for COVID-19 in Tripura, taking the total number of cases in the state to 232.

The number of active cases stands at 65 while 165 people have recovered and have been discharged and two have migrated to other states.

Chief Minister Biplab Kumar Deb said, among the new cases, 18 people have come from Maharashtra by train.
"""

In [73]:
unseen_fake_data = """
Americans to fund killing babies in abortion that she has been caught trying to add taxpayer financing of abortions to the bill to combat the Coronavirus and provide economic stimulus to the nation as it deals with the COVD-19 outbreak.
Nancy Pelosi has a long history of promoting abortion and her first act after becoming Speaker in 2019 was pushing legislation to use tax money for abortions. So it’s no surprise she is trying to exploit the Coronavirus pandemic to push abortion funding again.
As The Daily Caller reports: House Speaker Nancy Pelosi sought to include a potential way to guarantee federal funding for abortion into the coronavirus economic stimulus plan, according to multiple senior White House officials.
Speaking to the Daily Caller, those officials alleged that while negotiating the stimulus with U.S. Treasury Secretary Steve Mnuchin, Pelosi tried to lobby for “several” provisions that stalled bipartisan commitment to the effort. One was a mandate for up to $1 billion to reimburse laboratory claims, which White House officials say would set a precedent of health spending without protections outlined in the Hyde Amendment.
LifeNews depends on the support of readers like you to combat the pro-abortion media. Please donate now.
“A New mandatory funding stream that does not have Hyde protections would be unprecedented,” one White House official explained. “Under the guise of protecting people, Speaker Pelosi is working to make sure taxpayer dollars are spent covering abortion—which is not only backwards, but goes against historical norms.”
A second White House official referred to the provision as a “slush fund” and yet another questioned “what the Hyde Amendment and abortion have to do with protecting Americans from coronavirus?”
Americans should insist to their members of Congress that we need a clean bill that provides aggressive action to help patients and spur the economy. Killing babies with our tax dollars is not the answer to the coronavirus and the situation should not be exploited for political gain.
"""

In [85]:
text_to_check = unseen_real_data
pred = get_pred_output(text_to_check)
print('Unseen real data prediction {} '.format(pred[0]))

text_to_check = unseen_fake_data
pred = get_pred_output(text_to_check)
print('Unseen fake data prediction {} '.format(pred[0]))

Unseen real data prediction 1 
Unseen fake data prediction 0 


In [77]:
data.iloc[1000:1500]

Unnamed: 0,title,text,subject,date,sentiment
1000,No role for Assad in Syria's future: Tillerson,geneva reuters president bashar alassad family...,politicsNews,"October 26, 2017",1
1001,U.S. veterans to Trump: Save bank customers' r...,washington reuters largest us veterans service...,politicsNews,"October 26, 2017",1
1002,House narrowly passes measure paving way for T...,washington reuters us house representatives he...,politicsNews,"October 26, 2017",1
1003,U.S. appoints new top official at Havana embas...,havana reuters united states said thursday des...,politicsNews,"October 26, 2017",1
1004,Fatal Niger operation sparks calls for public ...,washington reuters democratic us lawmakers cal...,politicsNews,"October 26, 2017",1
...,...,...,...,...,...
1495,"Typical U.S. family earning $100,000 to get $1...",washington reuters typical us family two child...,politicsNews,"September 28, 2017",1
1496,Senators close to bipartisan deal on health ex...,washington reuters two us senators parties clo...,politicsNews,"September 28, 2017",1
1497,U.S. votes to advance FCC chairman nomination,washington reuters us senate voted thursday ad...,politicsNews,"September 28, 2017",1
1498,Connecticut governor vetoes budget as spending...,new york reuters connecticut governor dannel m...,politicsNews,"September 28, 2017",1


In [78]:
data.iloc[31000:31500]

Unnamed: 0,title,text,subject,date,sentiment
31000,WATCH: NANCY PELOSI COMES UNGLUED! Calls A Pie...,representative nancy pelosi dca member congres...,politics,"Oct 27, 2017",0
31001,HOUSTON TEXAN PLAYERS THREATEN Owner For Telli...,nyp reported players bob mcnair employs happy ...,politics,"Oct 27, 2017",0
31002,FBI INFORMANT Blows The Whistle on Obama DOJ’s...,former fbi informant blew whistle highprofile ...,politics,"Oct 27, 2017",0
31003,UNHINGED DEM HIJACKS TUCKER…Refuses to Leave S...,democratic california rep brad sherman appeare...,politics,"Oct 27, 2017",0
31004,INVESTIGATION LAUNCHED: SECOND TRESPASSER May ...,news second man able sneak press gaggle inside...,politics,"Oct 27, 2017",0
...,...,...,...,...,...
31495,PATRIOT ARTIST’S LATEST TRIBUTE TO TRUMP SUPPO...,patriot artist john mcnaughton revealed latest...,politics,"Aug 21, 2017",0
31496,CHELSEA CLINTON Uses “Lucifer” To Support Argu...,chelsea clinton thought quite clever cited sto...,politics,"Aug 21, 2017",0
31497,MUSLIM PROFESSOR A NO-SHOW AT LECTURE: Afraid ...,recall muslim professor caught video see expla...,politics,"Aug 21, 2017",0
31498,WOW! SECRET SERVICE DIRECTOR Sets Record Strai...,usa today published article today egregiously ...,politics,"Aug 21, 2017",0


In [81]:
text_to_check = data.text[1500]
pred = get_pred_output(text_to_check)
print('Seen Real data prediction {} '.format(pred[0]))

text_to_check = data.text[31500]
pred = get_pred_output(text_to_check)
print('Seen Fake data prediction {} '.format(pred[0]))

Seen Real data prediction 1 
Seen Fake data prediction 0 


# So our model is predicting quite well..

# Now saving our model as a h5 model..-->

In [86]:
model.save('final_lstm_model(word2vec).h5')