In [1]:
import pandas as pd

In [2]:
real_data = pd.read_csv('/content/True.csv')
fake_data = pd.read_csv('/content/Fake.csv')
#add column
real_data['target'] = 0
fake_data['target'] = 1
real_data.tail()
#Merging the 2 datasets
df = pd.concat([real_data, fake_data], ignore_index=True, sort=False)
df.rename(columns={'target': 'label'}, inplace=True)


In [3]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [4]:
df.shape

(44898, 5)

In [5]:
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [6]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [7]:
## Get the Dependent features
y=df['label']

In [8]:
X.shape

(44898, 4)

In [9]:
y.shape

(44898,)

In [10]:
import tensorflow as tf

In [11]:
tf.__version__

'2.19.0'

In [12]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [13]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [14]:
messages=X.copy()

In [15]:
messages['title'][1]

'U.S. military to accept transgender recruits on Monday: Pentagon'

In [16]:
messages

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [17]:
import nltk
import re
from nltk.corpus import stopwords

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [20]:
corpus

['u budget fight loom republican flip fiscal script',
 'u militari accept transgend recruit monday pentagon',
 'senior u republican senat let mr mueller job',
 'fbi russia probe help australian diplomat tip nyt',
 'trump want postal servic charg much amazon shipment',
 'white hous congress prepar talk spend immigr',
 'trump say russia probe fair timelin unclear nyt',
 'factbox trump twitter dec approv rate amazon',
 'trump twitter dec global warm',
 'alabama offici certifi senat elect jone today despit challeng cnn',
 'jone certifi u senat winner despit moor challeng',
 'new york governor question constitution feder tax overhaul',
 'factbox trump twitter dec vaniti fair hillari clinton',
 'trump twitter dec trump iraq syria',
 'man say deliv manur mnuchin protest new u tax law',
 'virginia offici postpon lotteri draw decid tie statehous elect',
 'u lawmak question businessman trump tower meet sourc',
 'trump twitter dec hillari clinton tax cut bill',
 'u appeal court reject challeng tr

In [21]:
corpus[1]

'u militari accept transgend recruit monday pentagon'

In [22]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[4108, 1026, 2492, 45, 4970, 2495, 3565, 3912],
 [4108, 4075, 3138, 1523, 3967, 4213, 2199],
 [4148, 4108, 4970, 3174, 4023, 1576, 318, 4284],
 [2479, 4888, 24, 551, 4905, 1139, 2991, 2013],
 [4235, 1759, 2695, 956, 1408, 4651, 1929, 1668],
 [3808, 2915, 3359, 95, 4310, 4249, 2266],
 [4235, 2573, 4888, 24, 3105, 1936, 2148, 2013],
 [2485, 4235, 1961, 1444, 711, 319, 1929],
 [4235, 1961, 1444, 740, 181],
 [1976, 2081, 1849, 3174, 2978, 1898, 4585, 852, 2932, 1494],
 [1898, 1849, 4108, 3174, 2193, 852, 2158, 2932],
 [1903, 1271, 1988, 3148, 2250, 4290, 1398, 750],
 [2485, 4235, 1961, 1444, 1924, 3105, 2476, 4217],
 [4235, 1961, 1444, 4235, 4601, 656],
 [4498, 2573, 751, 4782, 4286, 3189, 1903, 4108, 1398, 2475],
 [190, 2081, 1485, 776, 4120, 42, 1871, 4327, 2978],
 [4108, 548, 3148, 2899, 4235, 2860, 1414, 2224],
 [4235, 1961, 1444, 2476, 4217, 1398, 485, 3010],
 [4108, 1358, 2504, 1068, 2932, 4235, 2572, 1055, 271],
 [284, 3511, 4286, 2130, 303, 3661, 3988, 888, 4782, 3539],
 [4290, 27

In [23]:
corpus[1]

'u militari accept transgend recruit monday pentagon'

In [24]:
onehot_repr[1]

[4108, 4075, 3138, 1523, 3967, 4213, 2199]

### Embedding Representation

In [25]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[4108 1026 2492 ...    0    0    0]
 [4108 4075 3138 ...    0    0    0]
 [4148 4108 4970 ...    0    0    0]
 ...
 [2358 2778 1269 ...    0    0    0]
 [2720  770 3862 ...    0    0    0]
 [4108  419 1842 ...    0    0    0]]


In [26]:
embedded_docs[1]

array([4108, 4075, 3138, 1523, 3967, 4213, 2199,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [27]:
embedded_docs[0]

array([4108, 1026, 2492,   45, 4970, 2495, 3565, 3912,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [28]:
## Creating model
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_shape=(sent_length,)))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

  super().__init__(**kwargs)


None


In [29]:
len(embedded_docs),y.shape

(44898, (44898,))

In [30]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [31]:
X_final.shape,y_final.shape

((44898, 20), (44898,))

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [33]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 35ms/step - accuracy: 0.8546 - loss: 0.3159 - val_accuracy: 0.9309 - val_loss: 0.1854
Epoch 2/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - accuracy: 0.9477 - loss: 0.1404 - val_accuracy: 0.9353 - val_loss: 0.1649
Epoch 3/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 42ms/step - accuracy: 0.9560 - loss: 0.1178 - val_accuracy: 0.9349 - val_loss: 0.1633
Epoch 4/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 53ms/step - accuracy: 0.9613 - loss: 0.1046 - val_accuracy: 0.9333 - val_loss: 0.1862
Epoch 5/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 35ms/step - accuracy: 0.9639 - loss: 0.0887 - val_accuracy: 0.9369 - val_loss: 0.1707
Epoch 6/10
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 38ms/step - accuracy: 0.9710 - loss: 0.0714 - val_accuracy: 0.9396 - val_loss: 0.1767
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x7dd7497692e0>

### Adding Dropout

### Performance Metrics And Accuracy

In [34]:
# y_pred=model.predict(X_test)
# Get probability predictions
y_prob = model.predict(X_test).reshape(-1)

# Convert probabilities to class labels (better threshold)
y_pred = (y_prob >= 0.5).astype(int)


[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step


In [35]:
# y_pred=np.where(y_pred > 0.5, 1,0)# y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
confusion_matrix(y_test,y_pred)

array([[6655,  445],
       [ 406, 7311]])

In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.942565971519201

In [39]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      7100
           1       0.94      0.95      0.95      7717

    accuracy                           0.94     14817
   macro avg       0.94      0.94      0.94     14817
weighted avg       0.94      0.94      0.94     14817



In [41]:
model.save("fake_news_lstm_model.keras")
