## Fake News Classifier Using Bidirectional LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('data/task_3a_sample_data.csv')

In [3]:
df.head()

Unnamed: 0,public_id,title,text,our rating
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false


In [38]:
df['class'] = df['our rating'].apply(lambda x: 1 if 'true' in x.lower() else ( 0 if 'partially false' in x.lower() else -1))
df.head()

Unnamed: 0,public_id,title,text,our rating,class
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE,-1
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE,1
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE,-1
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE,-1
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false,0


In [39]:
###Drop Nan Values
df=df.dropna()


In [40]:
## Get the Independent Features

X=df.drop('class',axis=1)

In [41]:
## Get the Dependent features
y=df['class']

In [42]:
y.value_counts()

-1    20
 0    20
 1    10
Name: class, dtype: int64

In [43]:
X.shape

(50, 4)

In [44]:
y.shape

(50,)

In [45]:
import tensorflow as tf

In [46]:
tf.__version__

'2.4.1'

In [47]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [48]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [49]:
messages=X.copy()

In [50]:
messages['title'][1]

'Paul Ryan’s Worst Ally - The New York Times'

In [51]:
messages.reset_index(inplace=True)

In [52]:
import nltk
import re
from nltk.corpus import stopwords

In [53]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smsoh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [55]:
corpus

['huge attorney sidney powel confirm alleg dominion server germani confisc video',
 'paul ryan worst alli new york time',
 'get jail time fine wear face mask canada start today canada eh',
 'antifa gear fals flag violenc disguis trump support',
 'remark presid biden administr covid vaccin effort',
 'infowar articl',
 'bombshel covid infect rate may higher among children receiv flu shot health author madli push shot come flu season',
 'marin corp rebuk pelosi work',
 'fine ban drive smoke drive canada start today',
 'scott walker still owe million presidenti campaign',
 'blaylock face mask pose seriou risk healthi',
 'kamala harri say support second amend right record prove otherwis',
 'wisconsin elect commiss direct allow clerk fix ballot updat',
 'trump worst kind socialist',
 'wisdem farmer crush trump fail covid respons drag entir presid',
 'former presid barack obama arrest espionag conserv beaver',
 'merck scrap covid vaccin say effect get viru recov',
 'u senat tammi baldwin wisc

In [56]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[1747, 3406, 4564, 4916, 447, 3135, 4584, 792, 2907, 693, 3448],
 [1952, 2222, 434, 4405, 382, 222, 1079],
 [277, 1367, 1079, 1030, 1135, 434, 3446, 819, 3370, 3937, 819, 2773],
 [23, 3998, 4779, 3317, 3266, 2752, 3085, 3906],
 [3458, 3476, 4118, 2801, 1675, 1413, 3090],
 [3847, 3933],
 [2075,
  1675,
  2990,
  4571,
  3348,
  164,
  4315,
  2520,
  1155,
  4562,
  436,
  4213,
  4511,
  3680,
  3369,
  436,
  4949,
  4562,
  2037],
 [1506, 3702, 1318, 2280, 2685],
 [1030, 2001, 1173, 4076, 1173, 819, 3370, 3937],
 [773, 3472, 3883, 1783, 117, 4415, 3234],
 [4035, 434, 3446, 1942, 2393, 490, 4409],
 [2946, 3142, 2596, 3906, 199, 2724, 2699, 2208, 165, 2235],
 [473, 1865, 4798, 4463, 3403, 4278, 1516, 1529, 4505],
 [3085, 434, 4950, 2536],
 [518, 4354, 3596, 3085, 516, 1675, 328, 532, 3575, 3476],
 [984, 3476, 998, 3632, 2373, 2798, 4155, 3497],
 [688, 3594, 1675, 1413, 2596, 2042, 277, 1636, 4468],
 [3965, 4872, 4637, 3056, 473],
 [2380, 224, 1079, 1548, 3245, 1960, 2995, 3635, 1786],

### Embedding Representation

In [57]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0    0    0    0    0    0 1747 3406 4564 4916  447
  3135 4584  792 2907  693 3448]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 1952
  2222  434 4405  382  222 1079]
 [   0    0    0    0    0    0    0    0  277 1367 1079 1030 1135  434
  3446  819 3370 3937  819 2773]
 [   0    0    0    0    0    0    0    0    0    0    0    0   23 3998
  4779 3317 3266 2752 3085 3906]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 3458
  3476 4118 2801 1675 1413 3090]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 3847 3933]
 [   0 2075 1675 2990 4571 3348  164 4315 2520 1155 4562  436 4213 4511
  3680 3369  436 4949 4562 2037]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0 1506 3702 1318 2280 2685]
 [   0    0    0    0    0    0    0    0    0    0    0    0 1030 2001
  1173 4076 1173  819 3370 3937]
 [   0    0    0    0    0    0    0    0    0    0    

In [58]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0, 1747, 3406,
       4564, 4916,  447, 3135, 4584,  792, 2907,  693, 3448])

In [59]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [60]:
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               112800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [64]:
len(embedded_docs),y.shape

(50, (50,))

In [65]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [66]:
X_final.shape,y_final.shape

((50, 20), (50,))

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=8)

In [69]:
y_train

array([-1,  0, -1, -1,  0,  0, -1,  0, -1,  0,  0,  1, -1,  1, -1,  0,  0,
        0, -1, -1,  0,  1,  1,  0,  0,  1, -1, -1, -1, -1, -1, -1, -1],
      dtype=int64)

### Model Training

In [76]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x17ba663fca0>

### Performance Metrics And Accuracy

In [71]:

y_pred1=model1.predict_classes(X_test)



In [72]:
from sklearn.metrics import confusion_matrix

In [73]:
confusion_matrix(y_test,y_pred1)

array([[0, 4, 0],
       [0, 8, 0],
       [0, 5, 0]], dtype=int64)

In [74]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)

0.47058823529411764

In [75]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         4
           0       0.47      1.00      0.64         8
           1       0.00      0.00      0.00         5

    accuracy                           0.47        17
   macro avg       0.16      0.33      0.21        17
weighted avg       0.22      0.47      0.30        17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
