Fake News Prediction Using Bidirectional LSTM RNN

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding , Bidirectional , LSTM , Dropout , Dense

Loading the Dataset

In [2]:
df = pd.read_csv('fake_news_dataset.csv')
df = df[['title' ,'text' ,'label']].dropna()    ##We will not consider unnecessary features like 'date' ,'source','author' ,'category'

df['content'] = df['title'] + " " + df['text']

In [3]:
df['content'].head()

Unnamed: 0,content
0,Foreign Democrat final. more tax development b...
1,To offer down resource great point. probably g...
2,Himself church myself carry. them identify for...
3,You unit its should. phone which item yard Rep...
4,Billion believe employee summer how. wonder my...


Operation on labels

In [4]:
print(df['label'].unique())
print(df['label'].value_counts())

['real' 'fake']
label
fake    10056
real     9944
Name: count, dtype: int64


In [5]:
df['label'] = df['label'].map({'fake' : 1 , 'real' : 0})
##In classification, 1 usually means “alert”, “suspicious”, or “positive” for detection.

In [6]:
df['label']

Unnamed: 0,label
0,0
1,1
2,1
3,1
4,1
...,...
19995,1
19996,0
19997,0
19998,1


Text Preprocessing

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet') ## Download wordnet
stop_words = set(stopwords.words('english'))
le = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
import re

def clean_text(text):
    text = re.sub('^[a-zA-Z]' , ' ' , text)        ##Remove all numbers and symbols
    text = text.lower().split()                    ##Convert it to lowercase and split it
    text = [le.lemmatize(words) for words in text if words not in stop_words]         ##Remove stop words and lemmatize the remaining words
    return " ".join(text)

In [12]:
df['cleaned'] = df['content'].apply(clean_text)

In [13]:
df['cleaned']

Unnamed: 0,cleaned
0,oreign democrat final. tax development store a...
1,offer resource great point. probably guess wes...
2,imself church carry. identify forward present ...
3,ou unit should. phone item yard republican saf...
4,illion believe employee summer how. wonder fac...
...,...
19995,ouse party born. hit television change happy d...
19996,hough nation people maybe price box. fear meet...
19997,et exist experience unit. activity loss provid...
19998,chool wide item. term point general common tra...


Tokenization and Padding

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
MAX_VOCAB = 10000    ##let us choose 10000 unique words from our dataset
MAX_LEN = 300        ##let us make maximum length of the sentence as 300 after padding

##Tokenization
tokenizer = Tokenizer(num_words= MAX_VOCAB)
tokenizer.fit_on_texts(df['cleaned'])

X = tokenizer.texts_to_sequences(df['cleaned'])
X = pad_sequences(X , maxlen=MAX_LEN)

In [16]:
X

array([[  0,   0,   0, ..., 147, 255, 338],
       [  0,   0,   0, ..., 762, 720, 277],
       [  0,   0,   0, ..., 122, 565, 280],
       ...,
       [  0,   0,   0, ..., 148, 525, 563],
       [  0,   0,   0, ..., 353, 434, 186],
       [  0,   0,   0, ..., 476, 379, 740]], dtype=int32)

Operations on output

In [17]:
y = df['label'].values

In [18]:
y

array([0, 1, 1, ..., 0, 1, 1])

In [19]:
##Checking shape
print(f"Shape of X : {X.shape}")
print(f"Shape od y : {y.shape}")

Shape of X : (20000, 300)
Shape od y : (20000,)


Train-Test Split

In [20]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)

In [21]:
##Checking shape
print(f"Shape of training data : {X_train.shape} and {y_train.shape}")
print(f"Shape of test data : {X_test.shape} and {y_test.shape}")


Shape of training data : (16000, 300) and (16000,)
Shape of test data : (4000, 300) and (4000,)


Model Buildinng

In [22]:
model = Sequential([
    Embedding(input_dim= MAX_VOCAB , output_dim=128 , input_length = MAX_LEN),
    Bidirectional(LSTM(128 , return_sequences= False)),
    Dropout(0.3),
    Dense(64 , activation='relu'),
    Dropout(0.3),
    Dense(1 , activation='sigmoid')    ##As it is a 'binary classification' so we use 'sigmoid'
])



In [23]:
# The model isn't "built" yet if no input was passed
model.build(input_shape=(None, 300))  # Batch size can be None

In [24]:
model.summary()

Compile the Model

In [25]:
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

Training the model

In [28]:
model.fit(X_train , y_train ,
          epochs = 25 ,batch_size = 64, validation_data = (X_test , y_test))

Epoch 1/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.5023 - loss: 0.0000e+00 - val_accuracy: 0.4927 - val_loss: 0.0000e+00
Epoch 2/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.4940 - loss: 0.0000e+00 - val_accuracy: 0.4927 - val_loss: 0.0000e+00
Epoch 3/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.4970 - loss: 0.0000e+00 - val_accuracy: 0.4927 - val_loss: 0.0000e+00
Epoch 4/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step - accuracy: 0.4974 - loss: 0.0000e+00 - val_accuracy: 0.4927 - val_loss: 0.0000e+00
Epoch 5/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 35ms/step - accuracy: 0.4930 - loss: 0.0000e+00 - val_accuracy: 0.4927 - val_loss: 0.0000e+00
Epoch 6/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.5031 - loss: 0.0000e+00 - val_accuracy: 0

<keras.src.callbacks.history.History at 0x79bf865c5190>

Evaluating the model

In [29]:
y_pred = model.predict(X_test)      ##It will provide results like 0.85 ,023,0.97 , but we want results between 0 to 1
y_pred_labels = (y_pred > 0.5).astype(int)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


Performance Metrics

In [30]:
from sklearn.metrics import classification_report , accuracy_score
print(f"Accuracy : {accuracy_score(y_test , y_pred_labels)}")
print(f"{classification_report(y_test , y_pred_labels)}")

Accuracy : 0.49275
              precision    recall  f1-score   support

           0       0.49      1.00      0.66      1971
           1       0.00      0.00      0.00      2029

    accuracy                           0.49      4000
   macro avg       0.25      0.50      0.33      4000
weighted avg       0.24      0.49      0.33      4000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction on random samples

In [31]:
def predict_news(news_text):
    text = clean_text(news_text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    pred = model.predict(padded)[0][0]
    return "FAKE" if pred > 0.5 else "REAL"


print(predict_news("Breaking: Government launches nationwide AI initiative."))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step
REAL


In [32]:
print(predict_news("Breaking : ISIS camps destroyed by US Marines"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
REAL


In [33]:
print(predict_news("Breaking : Covid on a rise again"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
REAL
