In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


 

In [38]:
data = pd.read_csv('train.csv')

In [39]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [40]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [41]:
data.dropna(inplace=True)

In [42]:
data.shape

(18285, 5)

In [43]:
X = data.drop('label',axis=1)
y = data['label']

In [44]:
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [45]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [46]:
y.value_counts()

label
0    10361
1     7924
Name: count, dtype: int64

In [47]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
messages = X.copy()

In [49]:
messages['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [54]:
ps = PorterStemmer()
corpus = []

if 'title' in messages.columns:
    for i in range(len(messages)):  
        review = messages['title'].iloc[i]
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word.isalpha() and word not in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)


In [55]:
voc_size = 10000

In [56]:
corpus

['hous dem even see letter jason chaffetz tweet',
 'hillari big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki hollywood would love trump bomb north korea lack tran bathroom breitbart',
 'benoît hamon win french socialist presidenti nomin new york time',
 'plan ukrain courtesi trump associ new york time',
 'organ action partner disrupt agenda',
 'bbc comedi sketch housew caus outrag',
 'russian research discov secret nazi militari base arctic',
 'us offici see link trump russia',
 'paid govern troll social forum websit',
 'major leagu argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close',
 'chuck donald trump polit breitbart',
 'monica clinton sex scandal set crime',
 'rob trump breitbart',
 'abort pill order rise latin american nation z

In [57]:
onehot_repr = [one_hot(words,voc_size) for words in corpus]
onehot_repr

[[6810, 9479, 9324, 5955, 7054, 9377, 6078, 1853],
 [3945, 9235, 5957, 2587, 2685],
 [6053, 5276, 2295, 8423],
 [7093, 9132, 2270, 2734, 387, 3673],
 [1497, 5957, 9876, 5199, 5409, 4609, 5957, 849, 1439, 1343],
 [2420, 1097, 9561, 4770, 7563, 4410, 3929, 6364, 982, 7783, 446, 2685],
 [9339, 3674, 8489, 4193, 5193, 3834, 8138, 3439, 6013, 5690],
 [5968, 8402, 6965, 7563, 2734, 3439, 6013, 5690],
 [7555, 2823, 149, 6435, 7022],
 [6630, 3844, 6838, 5762, 4711, 5655],
 [1307, 9850, 389, 8943, 1382, 8920, 3386, 1416],
 [2734, 7701, 5955, 105, 7563, 389],
 [3333, 9277, 7990, 8619, 4978, 9171],
 [6312, 6514, 8815, 2264, 1680, 844, 3439, 6013, 5690],
 [2064, 1290, 4960, 9340, 6289, 3439, 6013, 5690],
 [665, 3405, 3336, 2427, 2654, 5424, 1454, 3416, 2353, 2976],
 [7199, 6950],
 [7126, 174, 7563, 8264, 2685],
 [5882, 733, 3201, 459, 8682, 2586],
 [1595, 7563, 2685],
 [9934, 1444, 4225, 7066, 3697, 6579, 1051, 5854, 3833, 3439, 6013, 5690],
 [923, 263, 78, 848, 3428, 2315],
 [3014, 3708, 386, 915

In [58]:
sent_length = 100

Embedding_docs = pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
Embedding_docs

array([[   0,    0,    0, ..., 9377, 6078, 1853],
       [   0,    0,    0, ..., 5957, 2587, 2685],
       [   0,    0,    0, ..., 5276, 2295, 8423],
       ...,
       [   0,    0,    0, ..., 3439, 6013, 5690],
       [   0,    0,    0, ..., 6971, 6224,  215],
       [   0,    0,    0, ...,    0,  966, 6750]])

In [59]:
Embedding_features= 40
model = Sequential()
model.add(Embedding(voc_size,Embedding_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1,activation='sigmoid'))
model.build(input_shape=(None, sent_length))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()



In [60]:
x_final = np.array(Embedding_docs)
y_final = np.array(y)

In [61]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_final,y_final,test_size=0.33,random_state=42)


In [62]:
model.fit(x_train,y_train,validation_data=(x_train,y_train),epochs=10,batch_size=32)

Epoch 1/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 67ms/step - accuracy: 0.8065 - loss: 0.3863 - val_accuracy: 0.9479 - val_loss: 0.1592
Epoch 2/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 107ms/step - accuracy: 0.9416 - loss: 0.1423 - val_accuracy: 0.9757 - val_loss: 0.0831
Epoch 3/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 141ms/step - accuracy: 0.9673 - loss: 0.0894 - val_accuracy: 0.9839 - val_loss: 0.0527
Epoch 4/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 104ms/step - accuracy: 0.9825 - loss: 0.0564 - val_accuracy: 0.9904 - val_loss: 0.0374
Epoch 5/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 147ms/step - accuracy: 0.9877 - loss: 0.0408 - val_accuracy: 0.9930 - val_loss: 0.0280
Epoch 6/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 86ms/step - accuracy: 0.9912 - loss: 0.0290 - val_accuracy: 0.9952 - val_loss: 0.0208
Epoch 7/10


<keras.src.callbacks.history.History at 0x2a5306d7110>

In [67]:
y_pred = model.predict(x_test)
y_pred = np.where(y_pred>=0.5,1,0)
y_pred


[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step


array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [69]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

confusion_matrix(y_test,y_pred)

array([[3143,  276],
       [ 325, 2291]], dtype=int64)

In [70]:
accuracy_score(y_test,y_pred)


0.9004142502071251

In [74]:
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.91      0.92      0.91      3419\n           1       0.89      0.88      0.88      2616\n\n    accuracy                           0.90      6035\n   macro avg       0.90      0.90      0.90      6035\nweighted avg       0.90      0.90      0.90      6035\n'

In [76]:


model.save('Fake_News_Detection.h5')

