#Fake News Classifier Using LSTM
##### Dataset: https://www.kaggle.com/c/fake-news/data#




In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/drive/MyDrive/DataScience/train.csv')
df.tail(5)

Unnamed: 0,id,title,author,text,label
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1
20799,20799,What Keeps the F-35 Alive,David Swanson,"David Swanson is an author, activist, journa...",1


#### Checking Duplicates

In [3]:
df.duplicated().sum()

0

#### Checking Null Values

In [4]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [5]:
df.shape

(20800, 5)

We have decided to drop the null values as the dataset size is big and practically we cannot replace the text with randomly anything

In [6]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

#### Combining all the features in X to get 1 single feature

In [8]:
df['news_details'] = df['author']+' '+df['title']
df.head(3)

Unnamed: 0,id,title,author,text,label,news_details
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...


In [9]:
df = df.reset_index()
df.drop(['index','id'],axis=1,inplace=True)

### Text Pre-Processing

In [10]:
df['news_details'] = df['news_details'].str.lower()

##### - Removing Punctuations

In [11]:
import string
exclude = string.punctuation+ ('·')+('•')
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~·•'

In [12]:
def remove_punctuations(text):
    return text.translate(str.maketrans('','',exclude))

In [13]:
df['news_details'] = df['news_details'].apply((remove_punctuations))

##### - Removing Email's,Websites,digits

In [14]:
from tqdm import tqdm
import re

In [15]:
for i in tqdm(range(len(df))):
  df['news_details'][i] = re.sub('\d+','',df['news_details'][i])
  df['news_details'][i] = re.sub('[a-z0-9]+@[a-z0-9]+.[0-9a-z]+','',df['news_details'][i])
  df['news_details'][i] = re.sub('http\S+|www\S+','',df['news_details'][i])

100%|██████████| 18285/18285 [00:32<00:00, 562.01it/s]


##### - Removing Stop words and Stemming

In [16]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
stoplist = stopwords.words('english')

In [19]:
def remove_stopwords(text):
    new_text = []
    for words in text.split():
        if not words in stoplist:
            stem_word = ps.stem(words)
            new_text.append(stem_word)
    return " ".join(new_text)

In [20]:
df['news_details'] = df['news_details'].apply(remove_stopwords)

In [21]:
corpus = []
for i in tqdm(range(len(df))):
  corpus.append(df['news_details'][i])

100%|██████████| 18285/18285 [00:00<00:00, 194503.80it/s]


In [22]:
corpus

['darrel lucu hous dem aid didn’t even see comey’ letter jason chaffetz tweet',
 'daniel j flynn flynn hillari clinton big woman campu breitbart',
 'consortiumnewscom truth might get fire',
 'jessica purkiss civilian kill singl us airstrik identifi',
 'howard portnoy iranian woman jail fiction unpublish stori woman stone death adulteri',
 'daniel nussbaum jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'alissa j rubin benoît hamon win french socialist party’ presidenti nomin new york time',
 'megan twohey scott shane backchannel plan ukrain russia courtesi trump associ new york time',
 'aaron klein obama’ organ action partner soroslink ‘indivisible’ disrupt trump’ agenda',
 'chri tomlinson bbc comedi sketch real housew isi caus outrag',
 'amando flavio russian research discov secret nazi militari base ‘treasur hunter’ arctic photo',
 'jason ditz us offici see link trump russia',
 'anotheranni ye paid govern troll social media blog fo

Importing TensorFlow and necessary Libraries

In [23]:
import tensorflow as tf

In [24]:
tf.__version__

'2.8.2'

In [25]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop

#### Determining Vocabulary Size

In [26]:
vocab_size = 10000

### One-Hot Representation

In [27]:
corpus[1]

'daniel j flynn flynn hillari clinton big woman campu breitbart'

In [28]:
onehot_repr = [one_hot(words,vocab_size)for words in corpus]
onehot_repr[1]

[7607, 2878, 6539, 6539, 6261, 7780, 6338, 6857, 8445, 4342]

### Embedding Representation

In [29]:
sent_len = 20
embedded_docs = pad_sequences(onehot_repr,padding='post',maxlen=sent_len)
print(embedded_docs)

[[2874 3439 4908 ...    0    0    0]
 [7607 2878 6539 ...    0    0    0]
 [1897 6383 3143 ...    0    0    0]
 ...
 [2998 2878 6159 ...    0    0    0]
 [  95 7747 4773 ...    0    0    0]
 [8414 5902 1786 ...    0    0    0]]


In [30]:
embedded_docs[1]

array([7607, 2878, 6539, 6539, 6261, 7780, 6338, 6857, 8445, 4342,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [31]:
embedded_docs[0]

array([2874, 3439, 4908,  140, 1418, 6153, 3772, 3527, 2712,  184, 8744,
       6379, 7182,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

### Creating Model

In [32]:
from nltk.translate import metrics
features = 100

model = Sequential()
model.add(Embedding(vocab_size,features,input_length=sent_len))
model.add(LSTM(100))
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           1000000   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 256)               25856     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1,106,513
Trainable params: 1,106,513
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
y = df['label']

In [34]:
len(embedded_docs),y.shape

(18285, (18285,))

In [35]:
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [36]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [38]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/20
Epoch 2/20


<keras.callbacks.History at 0x7fd5b2185f90>

In [39]:
y_pred = model.predict(X_test)

In [40]:
y_pred = np.where(y_pred>0.5,1,0)

In [41]:
from sklearn.metrics import confusion_matrix

In [42]:
confusion_matrix(y_test,y_pred)

array([[3362,   57],
       [  48, 2568]])

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9826014913007457

In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3419
           1       0.98      0.98      0.98      2616

    accuracy                           0.98      6035
   macro avg       0.98      0.98      0.98      6035
weighted avg       0.98      0.98      0.98      6035

