#**Fake news Classifier Using LSTM**

In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
dataframe = pd.read_csv('train.csv')
print(dataframe.head)

<bound method NDFrame.head of           id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2        

## Data cleaning

In [9]:
dataframe = dataframe.dropna()
X = dataframe.drop('label', axis = 1)
y = dataframe['label']

In [10]:
print(X.shape)
print(y.shape)

(18285, 4)
(18285,)


##Onehot Representation

In [11]:
news = X.copy()
news.reset_index(inplace=True)

### Stemming

In [17]:
ps = PorterStemmer()

corpus = []

for i in range(0, len(news)):
  article = re.sub('[^a-zA-Z]', ' ', news['title'][i])
  article = article.lower()
  article = article.split()

  article = [ps.stem(word) for word in article if word not in stopwords.words('english')]
  article = ' '.join(article)
  corpus.append(article)

In [18]:
vocabulary_size = 5000
onehot_representation = [one_hot(words, vocabulary_size) for words in corpus]
onehot_representation

[[3672, 1636, 3313, 3304, 3017, 998, 111, 1896, 670, 4362],
 [1489, 4705, 103, 965, 685, 1663, 4153],
 [3042, 1914, 2587, 122],
 [3660, 4808, 1972, 4240, 2073, 1047],
 [3712, 685, 1157, 3838, 732, 3270, 685, 205, 438, 921],
 [2232,
  3961,
  1400,
  2242,
  58,
  2495,
  1523,
  4587,
  3776,
  2692,
  960,
  4739,
  51,
  1278,
  4153],
 [1761, 1644, 3689, 1462, 4175, 2203, 3433, 1977, 4185, 2835, 504],
 [4475, 490, 2619, 2264, 1372, 2545, 2495, 3070, 4185, 2835, 504],
 [2042, 1392, 3384, 2242, 2557, 3521, 3107, 566, 2495, 1392],
 [764, 1885, 4732, 4951, 3981, 3953, 4755, 4929],
 [961, 2048, 4377, 2405, 768, 1904, 2699, 861, 3884, 4172, 4220],
 [4240, 134, 3017, 3521, 2495, 1372],
 [1593, 4584, 3116, 1576, 3235, 1253, 700, 1267, 357],
 [4444, 3300, 813, 541, 1716, 958, 3640, 4185, 2835, 504],
 [4019, 3468, 3831, 2092, 604, 4185, 2835, 504],
 [2270, 3130, 2565, 3959, 1421, 4349, 194, 3609, 147, 3386],
 [1409, 1858, 4705],
 [4212, 2527, 339, 4040, 2495, 1122, 4385, 4153],
 [878, 220, 10

### Word Embedding Representation

In [19]:
sentence_length = 20
embedded_document = pad_sequences(onehot_representation, padding = 'pre', maxlen = sentence_length)
print(embedded_document)

[[   0    0    0 ... 1896  670 4362]
 [   0    0    0 ...  685 1663 4153]
 [   0    0    0 ... 1914 2587  122]
 ...
 [   0    0    0 ... 4185 2835  504]
 [   0    0    0 ...  629 2569  394]
 [   0    0    0 ... 3181 3702 1099]]


## Training RNN model with LSTM and Dense layers

In [20]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_vector_features, input_length = sentence_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
X_final = np.array(embedded_document)
y_final = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=0)

In [22]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd9f81da550>

In [23]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")



## Checking Accuracy

In [24]:
confusion_matrix(y_test, y_pred)

array([[1848,  192],
       [ 127, 1490]])

In [25]:
print("Accuracy => %.2f" % (accuracy_score(y_test, y_pred)*100))

Accuracy => 91.28
