In [1]:
# importing necessary libraries
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense,LSTM,Embedding,Bidirectional,Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [2]:
data_t=pd.read_csv("True.csv")        # reading true dataset
data_f=pd.read_csv("Fake.csv")        # reading fake dataset

In [3]:
data_t['class']=1
data_f['class']=0

In [4]:
print(data_t.isna().sum(),data_f.isna().sum())

title      0
text       0
subject    0
date       0
class      0
dtype: int64 title      0
text       0
subject    0
date       0
class      0
dtype: int64


In [5]:
      # dropping unwanted columns
data_t.drop(['subject'],axis=1,inplace=True)
data_t.drop(['date'],axis=1,inplace=True)
data_t.drop(['title'],axis=1,inplace=True)
data_f.drop(['title'],axis=1,inplace=True)         
data_f.drop(['subject'],axis=1,inplace=True)
data_f.drop(['date'],axis=1,inplace=True)

In [6]:
    # combining true and fake news datasets
data=pd.concat([data_t,data_f],axis=0)
data.columns

Index(['text', 'class'], dtype='object')

In [7]:
  # suffle the data
data=data.sample(frac=1)

In [8]:
data.head()

Unnamed: 0,text,class
609,WASHINGTON (Reuters) - President Donald Trump ...,1
11784,Muslims take time away from protest to pray at...,0
15269,HAMBURG (Reuters) - Many Syrian refugees in Ge...,1
5852,BAGHDAD/CAIRO (Reuters) - A global backlash ag...,1
9342,"Earlier today, President Trump tweeted: We sh...",0


In [9]:
data.reset_index(inplace=True)

In [10]:
data.drop(['index'],axis=1,inplace=True)
data.columns

Index(['text', 'class'], dtype='object')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   class   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


In [12]:
data=data.head(20000)

In [13]:
   # spliting the data into train test split
x_train,x_test,y_train,y_test=train_test_split(data['text'],data['class'],test_size=0.2,random_state=0)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(16000,)
(16000,)
(4000,)
(4000,)


In [14]:
#    removing unwanted characters ,urls,special characters,blank spaces etc

def clean(data):
    normalized=[]
    for i in data:
        i=i.lower()
        i=re.sub('https?://\S+|www\.\S+','',i)
        i=re.sub('\\W',' ',i)
        i=re.sub('\n','',i)
        i=re.sub(' +',' ',i)
        i=re.sub('^ ','',i)
        i=re.sub(' $','',i)
        normalized.append(i)
    return normalized

In [15]:
x_train=clean(x_train)
x_test=clean(x_test)

In [16]:
vocabs=10000
token=Tokenizer(vocabs)
token.fit_on_texts(x_train)

In [17]:
# tokenize the text into vectors
x_train=token.texts_to_sequences(x_train)
x_test=token.texts_to_sequences(x_test)

In [18]:
x_train=pad_sequences(x_train,padding='post',maxlen=256)
x_test=pad_sequences(x_test,padding='post',maxlen=256)

In [19]:
# training model

model=Sequential()
model.add(Embedding(vocabs,100))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(512))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Dense(1,activation="sigmoid"))

In [20]:
model.compile(loss="binary_crossentropy",optimizer="adam",metrics='accuracy')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 dropout (Dropout)           (None, None, 100)         0         
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 512)               66048     
                                                                 
 dropout_2 (Dropout)         (None, 512)               0

In [21]:
result=model.fit(x_train,y_train,epochs=15,batch_size=2000,validation_data=(x_test,y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [24]:
accuracy=pd.DataFrame(model.history.history)
accuracy

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.671908,0.548187,0.597056,0.72525
1,0.481856,0.751562,0.39173,0.7565
2,0.341353,0.806375,0.331968,0.82625
3,0.277347,0.843562,0.297994,0.84575
4,0.249641,0.862125,0.272929,0.864
5,0.226403,0.887563,0.258844,0.8845
6,0.238651,0.867938,0.291843,0.8455
7,0.213238,0.873937,0.297431,0.82625
8,0.244148,0.868187,0.273908,0.8585
9,0.207803,0.879438,0.266925,0.86775


In [35]:
accuracy['accuracy'].tail()

10    0.928125
11    0.989125
12    0.995687
13    0.998625
14    0.999375
Name: accuracy, dtype: float64

In [26]:
model.save("fakenews_detection.h5")

  saving_api.save_model(


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
mod=tf.keras.models.load_model("fakenews_detection.h5")
news=input("Enter the News:")
vocabs=10000
token=Tokenizer(vocabs)
token.fit_on_texts([news])
news_input=token.texts_to_sequences([news])
news_input=pad_sequences(news_input,maxlen=256)

prediction=mod.predict(news_input)
prediction

Enter the News:"Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest


array([[0.00527592]], dtype=float32)

In [3]:
# 0.5 is a default threshold value for binary classification and the threshold value may differ based on our datasets
if prediction[0]>0.06:
    print("it is real news")
else:
    print("it is fake news")

it is fake news
