# **CLASSIFYING FAKE NEWS USING LSTM**



In [None]:
#importing libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#reading the training CSV into a DataFrame and previewing its first rows
ds = pd.read_csv(filepath_or_buffer="../input/fake-news/train.csv")
ds.head(n=5)

In [None]:
#number of missing values in each column
ds.isna().sum()

**DROPPING NULL VALUES**

In [None]:
#discarding every row that has at least one missing value,
#then re-counting missing values per column to confirm none remain
ds = ds.dropna(axis=0, how="any")
ds.isna().sum()

In [None]:
#dropping rows leaves gaps in the row labels, therefore rebuilding a
#contiguous index (the old labels are discarded, not kept as a column)
ds = ds.reset_index(drop=True)
ds.head(n=5)

In [None]:
#defining dependent and independent vectors
#taking only title for prediction; the column is selected by name rather than
#by position so that it matches the name-based `x['title']` access used
#during text cleaning and does not depend on the CSV column order
x = ds[['title']]
y = ds['label']

In [None]:
#previewing the first rows of the feature frame
x.head(n=5)

In [None]:
#bar chart of how many rows carry each value of the 'label' column
sns.countplot(data=ds, x='label')

**ALMOST 10,000 ARTICLES ARE RELIABLE (0) AND 8,000 ARE UNRELIABLE (1)**

In [None]:
#Text Cleaning and preprocessing

#loop-invariant helpers, built once instead of once per word / per title:
#the compiled pattern, the stemmer and the English stop-word collection
#(kept as a set so each membership test is a hash lookup, not a list scan)
_non_letters = re.compile(r'[^a-zA-Z]')
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


def _clean_title(title):
    """Return `title` as a space-joined string of stemmed, non-stop-word tokens.

    Steps, in order:
      1. every character other than a-z / A-Z is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. tokens found in the English stop-word set are dropped;
      4. the remaining tokens are stemmed with the Porter stemmer.
    """
    tokens = _non_letters.sub(' ', title).lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in stop_words)


#iterating over the titles directly, so the result does not depend on the
#DataFrame's row labels being exactly 0..len(ds)-1
cleaned = [_clean_title(title) for title in x['title']]

In [None]:
#cleaned text
cleaned[:5]

In [None]:
#taking dictionary size 5000
vocab_size = 5000

#one hot encoding: each word of every cleaned title is mapped to an integer index
one_hot_dir = [one_hot(sentence, n=vocab_size) for sentence in cleaned]

#length of all rows should be equal therefore applying padding
#this will adjust size by adding 0 at the start of the shorter rows
embedded_layer = pad_sequences(sequences=one_hot_dir, padding='pre')
embedded_layer

**LENGTH OF ALL ROWS IS EQUAL NOW**

In [None]:
#padded index sequences become the feature matrix, labels the target vector;
#both are converted to numpy arrays
x, y = np.array(embedded_layer), np.array(y)

In [None]:
#holding out 20% of the samples as the test set; the fixed random_state
#makes the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
#creating model using LSTM: embedding -> dropout -> LSTM -> dropout -> sigmoid
sequence_length = len(embedded_layer[0])  #length of one padded row

model = Sequential([
    #taking number of features as 50 for each of the vocab_size word indices
    Embedding(vocab_size, 50, input_length=sequence_length),
    Dropout(0.5),
    #LSTM layer with 100 neurons
    LSTM(100),
    Dropout(0.5),
    #output layer: one sigmoid unit for the binary label
    Dense(1, activation="sigmoid"),
])

#compiling the model
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
#printing the layer-by-layer summary of the compiled model
model.summary()

In [None]:
#training the model for 5 epochs with batches of 32 samples;
#the held-out test split is also passed as the validation data
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(x_test, y_test),
)

In [None]:
#predicting and getting accuracy
from sklearn.metrics import accuracy_score

#model outputs above 0.5 are taken as class 1 (boolean predictions)
y_pred = model.predict(x_test) > 0.5
accuracy_score(y_test, y_pred)

In [None]:
#confusion matrix of the true test labels against the thresholded predictions
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)