# Fake News Classification Using Bidirectional LSTM RNN

#### dataset :- https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification

In [1]:
import pandas as pd

In [3]:
df=pd.read_csv('Dataset.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [6]:
df.shape

(72134, 4)

In [7]:
df=df.dropna()

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [None]:
## independent features
X=df.drop('label',axis=1)

In [10]:
y=df['label']

In [11]:
## Check whether dataset is balanced or not
y.value_counts()

label
1    36509
0    35028
Name: count, dtype: int64

In [12]:
import tensorflow as tf




In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional


In [14]:
## vocabulary size
voc_size=5000

In [15]:
messages=X.copy()

In [16]:
messages.head(10)

Unnamed: 0.1,Unnamed: 0,title,text
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...
6,6,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...
7,7,HOUSE INTEL CHAIR On Trump-Russia Fake Story: ...,
8,8,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o..."
9,9,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ..."
10,10,GOP Senator Just Smacked Down The Most Puncha...,The most punchable Alt-Right Nazi on the inter...


In [17]:
import nltk
import re
from nltk.corpus import stopwords

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
messages.reset_index(inplace=True)

### Dataset Preprocessing

In [21]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [22]:
corpus

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video',
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video',
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid',
 'satan russia unv imag terrifi new supernuk western world take notic',
 'time christian group sue amazon splc design hate group',
 'dr ben carson target ir never audit spoke nation prayer breakfast',
 'hous intel chair trump russia fake stori evid anyth video',
 'sport bar owner ban nfl game show true american sport like speak rural america video',
 'latest pipelin leak underscor danger dakota access pipelin',
 'gop senat smack punchabl alt right nazi internet',
 'may brexit offer would hurt cost eu citizen eu parliament',
 'schumer call trump appoint offici overse puerto rico relief',
 'watch hilari ad call question health age clinton crime famili boss',
 'chang expect espn polit agenda despit huge subscrib declin breitbart'

## One Hot Encoding

In [23]:
onehot_repr=[one_hot(words,voc_size) for words in corpus]
onehot_repr

[[3843, 1033, 2667, 2121, 2945, 211, 238, 2978, 4218, 869, 175, 1462],
 [1228,
  2274,
  3268,
  1968,
  2939,
  1206,
  893,
  4125,
  3803,
  4236,
  195,
  984,
  4582,
  1462],
 [3679, 2993, 1917, 2211, 3456, 358, 3738, 3232, 2501, 1437, 92, 3838],
 [471, 2586, 3847, 766, 3570, 4739, 4675, 2937, 2192, 3115, 451],
 [730, 3738, 4002, 566, 3982, 3951, 1083, 2791, 4002],
 [1311, 3904, 4523, 4758, 1386, 1206, 4095, 785, 3547, 4657, 4336],
 [915, 3119, 44, 866, 2586, 1671, 358, 2868, 1416, 1462],
 [2545,
  2233,
  2536,
  530,
  4018,
  4496,
  346,
  3413,
  4961,
  2545,
  225,
  4775,
  2684,
  4089,
  1462],
 [1986, 3613, 2841, 4650, 249, 3336, 3946, 3613],
 [1419, 2641, 4054, 4510, 1017, 4827, 2480, 3810],
 [350, 2497, 372, 3550, 4568, 1535, 1914, 305, 1914, 3871],
 [2272, 1371, 866, 1083, 1471, 4456, 4717, 4135, 4555],
 [2316, 3927, 4801, 1371, 195, 3357, 3436, 508, 3500, 2951, 2881],
 [2306, 4498, 4188, 601, 562, 3530, 3536, 194, 213, 767],
 [1245, 3281, 943, 723, 182, 915, 131],
