## PROJECT : Detecting Fake News with LSTM and Word Embedding

OBJECTIVE: 

> To train a machine learning model that can accurately categorize news articles as either fake or real, using LSTM and word embedding techniques.

DATASET : https://www.kaggle.com/c/fake-news/data#

In [63]:
# importing the libraries 
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# tensorflow library
import tensorflow as tf
print (tf.__version__)
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

2.11.0


In [86]:
## GETTING THE DATASET :

# Only these three columns are read from the CSV; every other column is skipped.
columns_to_load = ['title', 'text', 'label']

# NOTE(review): absolute path under /content/drive/MyDrive -- presumably a mounted
# Google Drive in Colab; the notebook will not run elsewhere without editing it.
df = pd.read_csv(
  '/content/drive/MyDrive/UNIV.AI/NLP Intro /Datasets/FAKE NEWS DATASET/train.csv',
  usecols = columns_to_load,
)
display(df.shape, df.head())

(20800, 3)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


In [87]:
# Drop every row that has a missing value in any loaded column, then renumber
# the remaining rows 0..n-1 so the row labels are contiguous again (dropna
# keeps the original labels, which would leave gaps for label-based lookups).
df = df.dropna().reset_index(drop = True)

display(df.shape, df.head())

(20203, 3)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


In [94]:
# Splitting the predictor and the label:
# only the headline ('title') is used as model input; 'label' is the target.
X, y = df['title'], df['label']

## CLEANING THE TEXTS AND PERFORMING WORD_EMBEDDING using Tensorflow

In [96]:
## CLEANING THE texts (X) / Preprocessing the Data
ps = PorterStemmer()

# Build the stop-word lookup once, as a set. Evaluating
# stopwords.words('english') inside the per-token filter repeats that call for
# every token and then does a linear scan of the returned list.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of X[i]: label-based lookups only
# work while the Series index is exactly 0..n-1.
for title in X:
  # Keep ASCII letters only (everything else becomes a space), lower-case, tokenise.
  tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()

  # Drop stop words, stem what is left, and store the headline as one string.
  stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
  corpus.append(" ".join(stemmed))


In [99]:
# Vocabulary / dictionary size.
# Passed to one_hot() as the size of the index space and to the Embedding layer
# as its input dimension, so both must keep using this same value.
voc_size = 5_000

### One-hot representation

In [1]:
# Turn every cleaned headline into a list of integer word indices drawn from
# the voc_size-sized index space.
# NOTE(review): Keras documents one_hot as hashing-based, so distinct words may
# share an index and the mapping may differ between sessions -- confirm before
# relying on saved weights across runs.
one_hot_rep = []
for sentence in corpus:
  one_hot_rep.append(one_hot(sentence, voc_size))

## Embedding Representation

In [102]:
# Every headline is forced to exactly this many word indices.
sent_length = 20

# Shorter headlines are zero-filled at the end (padding='post').
# NOTE(review): `truncating` is left at its default, which Keras documents as
# 'pre' -- i.e. longer headlines would lose their *first* words; confirm that
# this is intended.
embedded_text = pad_sequences(one_hot_rep, maxlen = sent_length, padding = 'post')
print (embedded_text[:5])

[[4584 2633  409 3335 1208 2516 1727 4752  791  426    0    0    0    0
     0    0    0    0    0    0]
 [4405 1939   92 1752  446 4979 2442    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [4995 2642 4654 4014    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [1064 2301 1734 1317 4022 3762    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [2066  446 1513 1957 3865  363  446 2583 4089 4082    0    0    0    0
     0    0    0    0    0    0]]


In [103]:
# Sequential model: 40-dimensional word embeddings -> one LSTM layer -> sigmoid output.
dim = 40  # length of the vector learned for each word index

model = Sequential(
  [
    Embedding(voc_size, dim, input_length = sent_length),  # word index -> dense vector
    LSTM(100),                                             # single LSTM layer, 100 units
    Dense(1, activation = 'sigmoid'),                      # one output in (0, 1) per headline
  ],
  name = 'LSTM_Embedded_Model',
)
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "LSTM_Embedded_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_3 (LSTM)               (None, 100)               56400     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [104]:
# Copy the padded index matrix and the label column into plain NumPy arrays
# for splitting and training.
x_final, y_final = np.array(embedded_text), np.array(y)

In [105]:
# Show both shapes: the features and the labels should have the same number of rows.
display(np.shape(x_final), np.shape(y_final))

(20203, 20)

(20203,)

In [106]:
## Splitting the training and testing data:
# 75 % of the rows go to training, the rest to testing; random_state pins the shuffle.
split_parts = train_test_split(
  x_final,
  y_final,
  train_size = 0.75,
  random_state = 41,
)
x_train, x_test, y_train, y_test = split_parts

## Training the model:


In [107]:
# `model` was built and compiled in an earlier cell; train it here for 10 epochs.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the same rows that the final
# report is scored on.
model.fit(
  x = x_train,
  y = y_train,
  epochs = 10,
  validation_data = (x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9086f705e0>

## Performance Metrics and Accuracy

In [108]:
# accuracy_score / classification_report are imported in the notebook's import
# cell rather than mid-notebook, so all dependencies are visible in one place.

# One sigmoid output per test row; the result is indexed as row[0] downstream.
y_pred = model.predict(x_test)



In [109]:
# Convert each predicted score into a hard class label:
# strictly above 0.5 -> 1, otherwise -> 0.
final_predictions = [1 if row[0] > 0.5 else 0 for row in y_pred]

In [111]:
# Compute both metrics up front so the report template below stays readable.
test_accuracy = accuracy_score(y_test, final_predictions)
report_text = classification_report(y_test, final_predictions)

print (f'''
MODEL REPORTS:

1) Test Accuracy : {test_accuracy}

2) Classification Report: 
{report_text}

''')


MODEL REPORTS:

1) Test Accuracy : 0.9164521876856068

2) Classification Report: 
              precision    recall  f1-score   support

           0       0.91      0.92      0.92      2589
           1       0.92      0.91      0.91      2462

    accuracy                           0.92      5051
   macro avg       0.92      0.92      0.92      5051
weighted avg       0.92      0.92      0.92      5051



