## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [4]:
import pandas as pd

In [5]:
df=pd.read_csv('data/task_3a_sample_data.csv')

In [6]:
df.head()

Unnamed: 0,public_id,title,text,our rating
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false


In [8]:
df['label'] = df['our rating'].apply(lambda x: 1 if 'true' in x.lower() else ( 0 if 'partially false' in x.lower() else -1))
df.head()

Unnamed: 0,public_id,title,text,our rating,label
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE,-1
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE,1
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE,-1
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE,-1
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false,0


In [9]:
###Drop Nan Values
df=df.dropna()


In [10]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [11]:
## Get the Dependent features
y=df['label']

In [12]:
X.shape

(50, 4)

In [13]:
y.shape

(50,)

In [14]:
import tensorflow as tf

In [15]:
tf.__version__

'2.4.1'

In [16]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [17]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [18]:
messages=X.copy()

In [19]:
messages['title'][1]

'Paul Ryan’s Worst Ally - The New York Times'

In [20]:
messages.reset_index(inplace=True)

In [23]:
import nltk
import re
from nltk.corpus import stopwords

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smsoh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [26]:
corpus

['huge attorney sidney powel confirm alleg dominion server germani confisc video',
 'paul ryan worst alli new york time',
 'get jail time fine wear face mask canada start today canada eh',
 'antifa gear fals flag violenc disguis trump support',
 'remark presid biden administr covid vaccin effort',
 'infowar articl',
 'bombshel covid infect rate may higher among children receiv flu shot health author madli push shot come flu season',
 'marin corp rebuk pelosi work',
 'fine ban drive smoke drive canada start today',
 'scott walker still owe million presidenti campaign',
 'blaylock face mask pose seriou risk healthi',
 'kamala harri say support second amend right record prove otherwis',
 'wisconsin elect commiss direct allow clerk fix ballot updat',
 'trump worst kind socialist',
 'wisdem farmer crush trump fail covid respons drag entir presid',
 'former presid barack obama arrest espionag conserv beaver',
 'merck scrap covid vaccin say effect get viru recov',
 'u senat tammi baldwin wisc

In [28]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[2733, 3010, 1245, 4982, 781, 1153, 4997, 3913, 3985, 2722, 250],
 [4605, 2516, 2490, 4585, 4000, 4216, 617],
 [2525, 2656, 617, 1093, 482, 3261, 4536, 4980, 4905, 54, 4980, 808],
 [2893, 1218, 2166, 4708, 594, 2914, 3482, 3056],
 [2538, 2712, 374, 1615, 1501, 2232, 3593],
 [2356, 589],
 [401,
  1501,
  4753,
  3745,
  4280,
  1695,
  2997,
  10,
  982,
  4266,
  747,
  2242,
  4869,
  2831,
  4156,
  747,
  45,
  4266,
  4377],
 [2967, 3894, 1130, 3482, 129],
 [1093, 3351, 3759, 4267, 3759, 4980, 4905, 54],
 [1613, 4877, 3453, 1798, 4892, 2476, 2850],
 [4303, 3261, 4536, 1147, 4432, 2614, 3617],
 [1428, 355, 4257, 3056, 2427, 712, 1316, 3650, 2181, 696],
 [3702, 3745, 3340, 2895, 611, 2727, 478, 3197, 380],
 [3482, 2490, 2837, 225],
 [1385, 1093, 860, 3482, 2488, 1501, 3895, 1490, 2870, 2712],
 [3748, 2712, 3288, 1279, 4650, 170, 1129, 1449],
 [266, 2939, 1501, 2232, 4257, 3448, 2525, 4109, 3591],
 [106, 4486, 707, 2414, 3702],
 [4378, 1681, 617, 1090, 2981, 4247, 2737, 2738, 2188],


### Embedding Representation

In [29]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0    0    0    0    0    0 2733 3010 1245 4982  781
  1153 4997 3913 3985 2722  250]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 4605
  2516 2490 4585 4000 4216  617]
 [   0    0    0    0    0    0    0    0 2525 2656  617 1093  482 3261
  4536 4980 4905   54 4980  808]
 [   0    0    0    0    0    0    0    0    0    0    0    0 2893 1218
  2166 4708  594 2914 3482 3056]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 2538
  2712  374 1615 1501 2232 3593]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 2356  589]
 [   0  401 1501 4753 3745 4280 1695 2997   10  982 4266  747 2242 4869
  2831 4156  747   45 4266 4377]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0 2967 3894 1130 3482  129]
 [   0    0    0    0    0    0    0    0    0    0    0    0 1093 3351
  3759 4267 3759 4980 4905   54]
 [   0    0    0    0    0    0    0    0    0    0    

In [30]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0, 2733, 3010,
       1245, 4982,  781, 1153, 4997, 3913, 3985, 2722,  250])

In [31]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
len(embedded_docs),y.shape

(50, (50,))

In [33]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [34]:
X_final.shape,y_final.shape

((50, 20), (50,))

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [37]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=50,batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1dad79f3190>

### Adding Dropout 

In [38]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

### Performance Metrics And Accuracy

In [39]:
y_pred=model.predict_classes(X_test)



In [40]:
from sklearn.metrics import confusion_matrix

In [41]:
confusion_matrix(y_test,y_pred)

array([[0, 6, 1],
       [0, 4, 1],
       [0, 4, 1]], dtype=int64)

In [42]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.29411764705882354