## Fake News Classifier Using LSTM

Author - Sagnick Bhar  
Dataset: https://www.kaggle.com/datasets/hassanamin/textdb3  
Accuracy = 75.27%

### Importing Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

### Importing Dataset

In [2]:
# Load the fake/real news CSV (the Kaggle textdb3 dataset linked in the header)
# into a DataFrame. The path is relative to the notebook's working directory.
df=pd.read_csv('../input/textdb3/fake_or_real_news.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

In [4]:
## Drop NaN values
# Remove every row that has a missing value, then renumber the index 0..n-1.
# Later cells look rows up by integer label over range(len(...)); without the
# renumbering, any dropped row would leave a gap and those lookups would raise
# KeyError.
df = df.dropna().reset_index(drop=True)

In [5]:
## Get the independent features: every column except 'label'
X = df.drop(columns='label')

In [6]:
## Get the dependent feature: the 'label' column, used as the prediction target
y=df['label']

In [7]:
# (rows, columns) of the feature frame.
X.shape

In [8]:
# Number of labels; should match the row count of X.
y.shape

###  Data Preprocessing

In [9]:
### Vocabulary size
# Passed to one_hot() as the size of the integer index space for words, and to
# the Embedding layer as its input dimension.
# NOTE(review): the origin of the value 6256 is not shown in this notebook — confirm.
voc_size=6256

In [10]:
# Work on a copy so that later changes to `messages` do not touch X.
messages=X.copy()

In [11]:
# Peek at the title of the row whose index label is 1.
messages.loc[1, 'title']

In [12]:
# Renumber rows 0..n-1 so the titles can be addressed by position-like labels.
# Reassigning with drop=True keeps this cell idempotent: an in-place reset
# without drop inserts a new 'index' / 'level_0' column on every run and
# raises once 'level_0' already exists.
messages = messages.reset_index(drop=True)

In [13]:
from nltk.stem.porter import PorterStemmer

# Build the cleaned corpus: one normalised, stemmed string per title.
ps = PorterStemmer()
# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the loop would rebuild the list for every
# single word, and membership tests on a list are linear.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame's index labels.
for title in messages['title']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    words = non_letters.sub(' ', title).lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)
print("Process Completed")

In [14]:
# Show the cleaned, stemmed form of the second title.
corpus[1]

In [15]:
# "One hot" representation of each sentence.
# Despite its name, Keras' one_hot() returns a list of integer word indices
# (one per word, hashed into the voc_size index space), not one-hot vectors.
# NOTE(review): Keras documents the default hash as not stable across runs, so
# the indices may differ between kernel sessions — confirm if reproducibility
# matters.
onehot_repr = [one_hot(words, voc_size) for words in corpus]

# Encode the labels: "REAL" -> 1, anything else -> 0.
# Derived from df['label'] in one vectorised step, so the cell is idempotent
# (re-running does not turn every label into 0), does not depend on the index
# labels being 0..n-1, and yields a proper int64 Series instead of an
# object-dtype one.
y = (df['label'] == "REAL").astype('int64')

onehot_repr[1]

In [16]:
# Display the label Series after the 0/1 encoding.
y

In [17]:
# Number of word indices in each encoded title; print the longest one.
list_len = list(map(len, onehot_repr))
print(max(list_len))

### Embedding Representation

In [18]:
# Fixed sequence length fed to the model.
sent_length = 26
# padding='pre' puts the padding at the start of each sequence so that every
# row ends up with exactly sent_length entries.
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)

print(embedded_docs)

In [19]:
# Show the padded index sequence of the first title.
embedded_docs[0]

In [20]:
## Creating model
# Embedding -> LSTM(100) -> single sigmoid unit for binary classification.
embedding_vector_features = 40  # length of the vector learned for each word index
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit an extra "None" line).
model.summary()

In [21]:
# Sanity check: number of padded sequences next to the shape of the label Series.
len(embedded_docs),y.shape

In [22]:
# Materialise the model inputs and targets as NumPy arrays.
X_final = np.array(embedded_docs)
# Force an integer dtype: a label Series that was encoded element by element
# keeps dtype=object, and an object array is rejected by several sklearn
# metrics. This also fails loudly if any label has not been encoded to 0/1.
y_final = np.array(y, dtype=np.int64)

In [23]:
# Shapes of the feature matrix and the label vector.
X_final.shape,y_final.shape

### Splitting Dataset into Train and Test 

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=0,
)

In [25]:
# Convert both label arrays to int64 tensors for training and evaluation.
y_train1, y_test1 = (
    tf.convert_to_tensor(labels, dtype=tf.int64)
    for labels in (y_train, y_test)
)

### Model Training

In [27]:
# Training
# The held-out split (X_test, y_test1) is also used as the validation data.
model.fit(
    x=X_train,
    y=y_train1,
    validation_data=(X_test, y_test1),
    epochs=50,
    batch_size=64,
)

### Performance Metrics And Accuracy

In [28]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = np.greater(model.predict(X_test), 0.5).astype("int32")

In [29]:
from sklearn.metrics import confusion_matrix

In [30]:
# Confusion matrix of true labels against thresholded predictions on the test split.
confusion_matrix(
    y_true=tf.convert_to_tensor(y_test, dtype=tf.int64),
    y_pred=y_pred,
)

In [31]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose thresholded prediction matches the true label.
accuracy_score(y_true=y_test1, y_pred=y_pred)

**The End**