## Fake News Classifier Using LSTM

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(
    '/content/trained.csv',
    engine='python',
    on_bad_lines='skip'
)


In [3]:
df.head()

Unnamed: 0,index,title,text,subject,date,class,Unnamed: 6
0,0,PRESIDENT TRUMP Explains New “America First” R...,That s what we re talking about! Another campa...,,"Aug 2, 2017",Fake,
1,1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,"Oct 4, 2016",Fake,
2,2,Cruz Humiliated By Moderator After Lie About ...,Almost immediately after learning that longtim...,,"February 13, 2016",Fake,
3,3,"Russia revels in Trump victory, looks to sanct...",MOSCOW (Reuters) - For all their mutual praise...,politicsNews,"November 9, 2016",,
4,4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"May 26, 2017",Real,


In [4]:
df.drop(columns=['Unnamed: 6'], inplace=True)
df.rename(columns={'class': 'label'}, inplace=True)
df['label'] = (
    df['label']
    .astype(str)
    .str.strip()
    .str.upper()
    .map({'FAKE': 1, 'REAL': 0})
)

# Drop rows where label could not be mapped
df = df.dropna(subset=['label'])

# Convert safely to int
df['label'] = df['label'].astype(int)

df.head()

Unnamed: 0,index,title,text,subject,date,label
0,0,PRESIDENT TRUMP Explains New “America First” R...,That s what we re talking about! Another campa...,,"Aug 2, 2017",1
1,1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,"Oct 4, 2016",1
2,2,Cruz Humiliated By Moderator After Lie About ...,Almost immediately after learning that longtim...,,"February 13, 2016",1
4,4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"May 26, 2017",0
5,5,UNREAL! HERE’S WHY ICE RELEASED BUT DIDN’T DEP...,THANK GOODNESS FOR THE CENTER FOR IMMIGRATION ...,Government News,"Apr 28, 2016",1


In [5]:
df.shape

(39998, 6)

In [6]:
df.isnull().sum()

Unnamed: 0,0
index,0
title,0
text,0
subject,2
date,0
label,0


In [7]:
###Drop Nan Values
df=df.dropna()


In [8]:
df.head()

Unnamed: 0,index,title,text,subject,date,label
1,1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,"Oct 4, 2016",1
4,4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"May 26, 2017",0
5,5,UNREAL! HERE’S WHY ICE RELEASED BUT DIDN’T DEP...,THANK GOODNESS FOR THE CENTER FOR IMMIGRATION ...,Government News,"Apr 28, 2016",1
6,6,CONSEQUENCES OF LIBERAL TOLERANCE: He Had An I...,Terrified concert goers fled an Ariana Grande ...,politics,"May 25, 2017",1
7,7,U.S. NEWS and WORLD REPORT Publishes List Of T...,"More than 21,000 people from all regions of th...",politics,"Jul 13, 2017",1


In [9]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [10]:
## Get the Dependent features
y=df['label']

In [11]:
X.shape

(39996, 5)

In [12]:
y.shape

(39996,)

In [13]:
import tensorflow as tf

In [14]:
tf.__version__

'2.19.0'

In [15]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [16]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [17]:
messages=X.copy()

In [18]:
messages['title'][1]

'TERMINALLY ILL FORMER MISS WI: “Until my last breath, I will use this voice to tell who Mr. Trump really is” [VIDEO]'

In [19]:
messages

Unnamed: 0,index,title,text,subject,date
1,1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,"Oct 4, 2016"
4,4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"May 26, 2017"
5,5,UNREAL! HERE’S WHY ICE RELEASED BUT DIDN’T DEP...,THANK GOODNESS FOR THE CENTER FOR IMMIGRATION ...,Government News,"Apr 28, 2016"
6,6,CONSEQUENCES OF LIBERAL TOLERANCE: He Had An I...,Terrified concert goers fled an Ariana Grande ...,politics,"May 25, 2017"
7,7,U.S. NEWS and WORLD REPORT Publishes List Of T...,"More than 21,000 people from all regions of th...",politics,"Jul 13, 2017"
...,...,...,...,...,...
39995,39995,U.S. officials step up rhetoric on Russia link...,(Reuters) - Republican vice presidential candi...,politicsNews,"October 16, 2016"
39996,39996,McDonald’s Manager To Hispanic Customer: ‘Can...,An employee working at a North Carolina McDona...,News,"September 13, 2017"
39997,39997,SMUG LIBERAL LAW PROFESSOR SHUT DOWN BY TUCKER...,Tucker to sanctuary city supporter: how did a ...,politics,"Mar 29, 2017"
39998,39998,Robert Parry: Sorting Out the Russia Mess,Consortium News Exclusive: The U.S. mainstream...,US_News,"October 31, 2017"


In [20]:
messages.reset_index(inplace=True)

In [21]:
messages

Unnamed: 0,level_0,index,title,text,subject,date
0,1,1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,"Oct 4, 2016"
1,4,4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"May 26, 2017"
2,5,5,UNREAL! HERE’S WHY ICE RELEASED BUT DIDN’T DEP...,THANK GOODNESS FOR THE CENTER FOR IMMIGRATION ...,Government News,"Apr 28, 2016"
3,6,6,CONSEQUENCES OF LIBERAL TOLERANCE: He Had An I...,Terrified concert goers fled an Ariana Grande ...,politics,"May 25, 2017"
4,7,7,U.S. NEWS and WORLD REPORT Publishes List Of T...,"More than 21,000 people from all regions of th...",politics,"Jul 13, 2017"
...,...,...,...,...,...,...
39991,39995,39995,U.S. officials step up rhetoric on Russia link...,(Reuters) - Republican vice presidential candi...,politicsNews,"October 16, 2016"
39992,39996,39996,McDonald’s Manager To Hispanic Customer: ‘Can...,An employee working at a North Carolina McDona...,News,"September 13, 2017"
39993,39997,39997,SMUG LIBERAL LAW PROFESSOR SHUT DOWN BY TUCKER...,Tucker to sanctuary city supporter: how did a ...,politics,"Mar 29, 2017"
39994,39998,39998,Robert Parry: Sorting Out the Russia Mess,Consortium News Exclusive: The U.S. mainstream...,US_News,"October 31, 2017"


In [22]:
import nltk
import re
from nltk.corpus import stopwords

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'].iloc[i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [25]:
corpus

['termin ill former miss wi last breath use voic tell mr trump realli video',
 'trump bid open u monument develop draw call protect',
 'unreal ice releas deport crimin illeg video',
 'consequ liber toler isi flag hang roof yet one report video',
 'u news world report publish list top popular nation refuge want live',
 'trump administr nafta demand make sens union pacif ceo',
 'turk flock social media gold trader sanction case',
 'trump longtim advis offici ban appear cnn racist',
 'eu state push reform labor rule sought franc macron',
 'sit gop senat enough donat alabama democrat senat imag',
 'boom harri faulkner blow russia collus theori one smart question video',
 'angri leftist caught video steal student trump hat demand school make stop wear hat f f ing freedom speech boy',
 'mcmaster gave susan rice continu access classifi info still clearanc',
 'pastor shot kill middl church servic ohio',
 'china take action thousand websit harm obscen content',
 'van load illeg move releas away

In [26]:
corpus[1]

'trump bid open u monument develop draw call protect'

In [27]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[3146,
  3949,
  3829,
  1700,
  866,
  3221,
  1214,
  3906,
  1621,
  4707,
  1273,
  2443,
  1738,
  1499],
 [2443, 1388, 4969, 2143, 3098, 4112, 3603, 2377, 2300],
 [3574, 4258, 1415, 732, 4476, 4981, 1499],
 [4302, 582, 190, 4567, 4540, 628, 4935, 706, 835, 2030, 1499],
 [2143, 1263, 940, 2030, 3918, 1394, 1055, 1620, 657, 3360, 4980, 2165],
 [2443, 4811, 1900, 2852, 1492, 4849, 4757, 2485, 3102],
 [2226, 2446, 4205, 356, 3699, 4242, 3827, 4484],
 [2443, 2519, 4048, 4414, 1010, 2670, 3100, 1185],
 [2376, 2069, 648, 1546, 2750, 874, 1983, 4233, 3691],
 [4994, 2626, 1685, 2211, 1917, 4268, 33, 1685, 4246],
 [304, 3284, 638, 170, 3957, 3868, 4744, 835, 62, 463, 1499],
 [3592,
  272,
  1893,
  1499,
  2507,
  761,
  2443,
  2891,
  2852,
  3474,
  1492,
  1065,
  3072,
  2891,
  2143,
  2143,
  2406,
  339,
  1182,
  2219],
 [2814, 2656, 2778, 4358, 2580, 2362, 2277, 1898, 2756, 1915],
 [1912, 4453, 4944, 244, 1583, 4304, 4845],
 [680, 395, 3320, 1937, 304, 1932, 2615, 3890],
 [3765,

In [28]:
corpus[1]

'trump bid open u monument develop draw call protect'

In [29]:
onehot_repr[1]

[2443, 1388, 4969, 2143, 3098, 4112, 3603, 2377, 2300]

### Embedding Representation

In [30]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[3146 3949 3829 ...    0    0    0]
 [2443 1388 4969 ...    0    0    0]
 [3574 4258 1415 ...    0    0    0]
 ...
 [ 347  582   79 ... 2750 1499    0]
 [1586 2211 4320 ...    0    0    0]
 [3464 1679 4726 ...    0    0    0]]


In [31]:
embedded_docs[1]

array([2443, 1388, 4969, 2143, 3098, 4112, 3603, 2377, 2300,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [32]:
embedded_docs[0]

array([3146, 3949, 3829, 1700,  866, 3221, 1214, 3906, 1621, 4707, 1273,
       2443, 1738, 1499,    0,    0,    0,    0,    0,    0], dtype=int32)

In [33]:
## Creating model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

embedding_vector_features = 40  # feature representation

model = Sequential([
    Input(shape=(sent_length,)),
    Embedding(voc_size, embedding_vector_features),
    LSTM(100),
    # Bidirectional(LSTM(100)),     // if want to use bidirectional instead of simple LSTM
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


In [34]:
len(embedded_docs),y.shape

(39996, (39996,))

In [35]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [36]:
X_final.shape,y_final.shape

((39996, 20), (39996,))

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [38]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 63ms/step - accuracy: 0.8321 - loss: 0.3383 - val_accuracy: 0.9258 - val_loss: 0.1911
Epoch 2/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 38ms/step - accuracy: 0.9448 - loss: 0.1471 - val_accuracy: 0.9338 - val_loss: 0.1693
Epoch 3/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 37ms/step - accuracy: 0.9580 - loss: 0.1167 - val_accuracy: 0.9353 - val_loss: 0.1672
Epoch 4/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 40ms/step - accuracy: 0.9646 - loss: 0.0986 - val_accuracy: 0.9303 - val_loss: 0.1946
Epoch 5/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 40ms/step - accuracy: 0.9665 - loss: 0.0964 - val_accuracy: 0.9347 - val_loss: 0.1889
Epoch 6/10
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 39ms/step - accuracy: 0.9687 - loss: 0.0892 - val_accuracy: 0.9344 - val_loss: 0.2213
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x78ff41885e20>

### Performance Metrics And Accuracy

In [39]:
y_pred=model.predict(X_test)

[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


In [40]:
y_pred=np.where(y_pred > 0.5, 1,0) ##AUC ROC Curve

In [41]:
from sklearn.metrics import confusion_matrix

In [42]:
confusion_matrix(y_test,y_pred)

array([[5779,  518],
       [ 339, 6563]])

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9350708386999015

In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      6297
           1       0.93      0.95      0.94      6902

    accuracy                           0.94     13199
   macro avg       0.94      0.93      0.93     13199
weighted avg       0.94      0.94      0.94     13199



In [45]:
model.save("fake_news_model.keras")