# <font color = 'blue'> Fake News Detector <font/> 
# <font color = 'brown'> 1. Preprocess, 2. one_hot encoding, 3. create LSTM Model, 4. oberve the alteration in shape 5. observe total weights <font/> 

In [1]:
# Import relevant libraries
import re
import pandas as pd
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kwabe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kwabe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv(r"C:\Users\kwabe\Desktop\DS Projects\NLP\082 fakenews\train.csv") 
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df['title'][100]

'Technocracy: The Real Reason Why The UN Wants Control Over The Internet'

In [4]:
df['label'][100]

1

In [5]:
df.shape

(20800, 5)

In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Drop title and text roles with null values
df = df.dropna(subset=['title', 'text'])
df.isnull().sum()

id           0
title        0
author    1918
text         0
label        0
dtype: int64

In [8]:
# Prepare and create the model
x = df.drop(columns = 'label') #Drop the target class (label)
y = df['label']

In [9]:
x.shape, y.shape

((20203, 4), (20203,))

In [10]:
# Data Preprocessing

In [11]:
# Create a shallow copy - create a new object
copy = x.copy()
#copies of the reference of nested objects
copy.reset_index(inplace=True)

In [12]:
ws = WordNetLemmatizer()
list_titles = []
for i in range(0, len(copy)):
    #print(i) #Print copy[title - i]
    headline = re.sub('[^a-zA-Z]', ' ', copy['title'][i])
    #Matching the text string for any lowercase or uppercase
    # When the ^(hat symbol) is on the inside of [], it matches any character that doesn't appear inside []
    # When ^ is on the outside of the []; it matches the beginning of the line/string - title
    headline = headline.lower()
    headline = headline.split()
    headline = [ws.lemmatize(word) for word in headline if word not in stopwords.words("english")]
    headline = ' '.join(headline)
    list_titles.append(headline)

In [13]:
list_titles[:4]

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified']

# <font color = 'blue'> One Hot Encoding<br><font color = 'green'> Encoding each word in the titles into an integer.

In [46]:
vocab = 10000
hot_title  = [one_hot(i, vocab) for i in list_titles]
hot_title[:4]

[[1640, 1398, 4228, 3223, 8163, 7302, 1033, 3338, 6626, 3398],
 [9605, 7503, 5381, 6605, 9301, 2851, 8115],
 [8806, 368, 8512, 8931],
 [1153, 1782, 7807, 1517, 1417, 6120]]

In [42]:
#Longest sentence
longest = len(max(list_titles, key = len))
longest

356

In [47]:
# Make every sentence to have the same length
max_length = 356
embed_input = pad_sequences(hot_title, maxlen = max_length, padding = 'pre')
print(embed_input)

[[   0    0    0 ... 3338 6626 3398]
 [   0    0    0 ... 9301 2851 8115]
 [   0    0    0 ...  368 8512 8931]
 ...
 [   0    0    0 ... 6301 7036 6004]
 [   0    0    0 ... 3275 1404 5396]
 [   0    0    0 ... 7672 6308 6212]]


In [44]:
print(embed_input.shape)

(20203, 356)


In [50]:
# Develop the Model
model = Sequential() # Create the sequential model
model.add(Embedding(input_dim=vocab, output_dim=40, input_length=max_length))
model.add(LSTM(150))
model.add(Dense(1, activation='sigmoid'))

#Build the model
model.build(input_shape=(None, max_length))
#Loss Function
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())

None


In [62]:
len(embed_input), y.shape

(20203, (20203,))

In [63]:
x_final = np.array(embed_input)
y_final = np.array(y)
x_final.shape, y_final.shape

((20203, 356), (20203,))

In [64]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size = 0.33, random_state=42)

In [65]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size = 64)

Epoch 1/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 426ms/step - accuracy: 0.8402 - loss: 0.4020 - val_accuracy: 0.9235 - val_loss: 0.1827
Epoch 2/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 443ms/step - accuracy: 0.9486 - loss: 0.1315 - val_accuracy: 0.9259 - val_loss: 0.2034
Epoch 3/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 437ms/step - accuracy: 0.9741 - loss: 0.0703 - val_accuracy: 0.9226 - val_loss: 0.2076
Epoch 4/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 418ms/step - accuracy: 0.9876 - loss: 0.0403 - val_accuracy: 0.9225 - val_loss: 0.2418
Epoch 5/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 439ms/step - accuracy: 0.9930 - loss: 0.0230 - val_accuracy: 0.9202 - val_loss: 0.3347
Epoch 6/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 441ms/step - accuracy: 0.9977 - loss: 0.0095 - val_accuracy: 0.8725 - val_loss: 0.3226
Epoch 7/10

<keras.src.callbacks.history.History at 0x2ae2ba7b350>

In [72]:
# Apply a threshold to the predictions
y_pred = (model.predict(x_test) > 0.5).astype(int)

[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step


In [73]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[3101,  319],
       [ 242, 3005]], dtype=int64)

In [75]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[3101  319]
 [ 242 3005]]


In [76]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy: Proportion of correctly predicted instances
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision: Proportion of positive predictions that were correct
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

# Recall: Proportion of actual positive instances that were predicted correctly
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")

# F1-Score: Harmonic mean of precision and recall
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9159
Precision: 0.9040
Recall: 0.9255
F1-Score: 0.9146
