## Fake News Classifier Using LSTM
#### dataset :- https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('Dataset.csv')
df.shape

(72134, 4)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [5]:
df=df.dropna()

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [7]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [8]:
## Get the Dependent features
y=df['label']

In [9]:
X.shape

(71537, 3)

In [10]:
y.shape

(71537,)

In [11]:
import tensorflow as tf




In [12]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# One Hot Encoding

In [13]:
### Vocabulary size
voc_size=5000

In [14]:
messages=X.copy()

In [15]:
messages['title'].iloc[1]

'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'

In [16]:
messages.reset_index(inplace=True)

In [17]:
import nltk
import re
from nltk.corpus import stopwords

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [20]:
corpus

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video',
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video',
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid',
 'satan russia unv imag terrifi new supernuk western world take notic',
 'time christian group sue amazon splc design hate group',
 'dr ben carson target ir never audit spoke nation prayer breakfast',
 'hous intel chair trump russia fake stori evid anyth video',
 'sport bar owner ban nfl game show true american sport like speak rural america video',
 'latest pipelin leak underscor danger dakota access pipelin',
 'gop senat smack punchabl alt right nazi internet',
 'may brexit offer would hurt cost eu citizen eu parliament',
 'schumer call trump appoint offici overse puerto rico relief',
 'watch hilari ad call question health age clinton crime famili boss',
 'chang expect espn polit agenda despit huge subscrib declin breitbart'

In [21]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[1166, 3919, 635, 2171, 856, 831, 3750, 76, 2344, 1779, 2918, 710],
 [936,
  2799,
  2774,
  2111,
  439,
  2738,
  1710,
  3378,
  2501,
  3046,
  1169,
  1112,
  1710,
  710],
 [1486, 823, 2473, 2491, 1097, 3916, 1010, 3519, 184, 4457, 2119, 4007],
 [1985, 1, 4909, 3576, 4483, 4572, 1119, 1828, 2367, 2469, 2771],
 [2361, 1010, 1124, 2622, 4733, 260, 4915, 3177, 1124],
 [3095, 4893, 3965, 1583, 4806, 4502, 145, 4822, 2454, 837, 4194],
 [4020, 357, 3755, 257, 1, 1213, 3916, 1380, 2245, 710],
 [564,
  2719,
  2955,
  4419,
  3034,
  1841,
  3517,
  2001,
  2204,
  564,
  4752,
  558,
  2297,
  701,
  710],
 [462, 4111, 2113, 3896, 2051, 1290, 2673, 4111],
 [3948, 3574, 1296, 3053, 3110, 3313, 3831, 4325],
 [4784, 2422, 3831, 3454, 2701, 2957, 3564, 3472, 3564, 599],
 [1877, 2266, 257, 641, 1493, 538, 4656, 2341, 4905],
 [1401, 4815, 101, 2266, 2087, 2812, 4614, 2981, 4766, 517, 1972],
 [3019, 786, 1907, 4875, 1753, 4287, 4493, 3335, 471, 62],
 [4849, 4313, 3650, 787, 4137, 4020, 217],


In [22]:
## corpus[1]= 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
onehot_repr[1]

[936,
 2799,
 2774,
 2111,
 439,
 2738,
 1710,
 3378,
 2501,
 3046,
 1169,
 1112,
 1710,
 710]

# Embedding Representation

In [23]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[1166 3919  635 ...    0    0    0]
 [ 936 2799 2774 ...    0    0    0]
 [1486  823 2473 ...    0    0    0]
 ...
 [3814 1547 2136 ...    0    0    0]
 [ 257 2744 2795 ...    0    0    0]
 [ 300 2489 1926 ...    0    0    0]]


In [24]:
embedded_docs[1]

array([ 936, 2799, 2774, 2111,  439, 2738, 1710, 3378, 2501, 3046, 1169,
       1112, 1710,  710,    0,    0,    0,    0,    0,    0])

In [25]:
## Creating model
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(200))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 200)               192800    
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 393001 (1.50 MB)
Trainable params: 393001 (1.50 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [26]:
len(embedded_docs),y.shape

(71537, (71537,))

In [27]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [28]:
X_final.shape,y_final.shape

((71537, 20), (71537,))

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

## Model Training

##### Adding Dropouts

In [31]:
## Creating model
from tensorflow.keras.layers import Dropout
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [33]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
### Finally Training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=64, callbacks=[early_stop])

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.src.callbacks.History at 0x1c95204c310>

##### Performance Matrix and Accuracy

In [34]:
y_pred=model.predict(X_test)



In [35]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[10692,  1001],
       [ 1559, 10356]], dtype=int64)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.891562182311081

In [39]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     11693
           1       0.91      0.87      0.89     11915

    accuracy                           0.89     23608
   macro avg       0.89      0.89      0.89     23608
weighted avg       0.89      0.89      0.89     23608

