In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [6]:
# Directory holding the raw CSV files; change this one constant to point the
# notebook at a different copy of the dataset.
DATA_DIR = '/home/david/projects/fake_news_detector/data'

# Fixed seed so the shuffle below yields the same row order on every run.
SHUFFLE_SEED = 42

fake_df = pd.read_csv(f'{DATA_DIR}/Fake.csv')
true_df = pd.read_csv(f'{DATA_DIR}/True.csv')

# Add labels: 0 for Fake, 1 for True
fake_df['label'] = 0
true_df['label'] = 1

# Combine both sources into one frame, then shuffle the rows reproducibly
df = pd.concat([fake_df, true_df], ignore_index=True)
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)


In [7]:
# Vocabulary cap and fixed sequence length used for encoding
max_words = 10000
max_len = 500

# Join headline and body into a single text field per article
df['content'] = df['title'] + " " + df['text']

# Keep only the rows whose combined text is longer than 50 characters
is_long_enough = df['content'].str.len() > 50
df = df[is_long_enough]

# Build the word index from the remaining articles
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['content'])

# Encode each article as a list of integer ids, then bring every list to max_len
encoded_articles = tokenizer.texts_to_sequences(df['content'])
X = pad_sequences(encoded_articles, maxlen=max_len)

# Targets aligned row-for-row with X
y = df['label'].values


In [8]:
# Hold out 20% of the samples for testing, stratified on the label.
# random_state pins the partition so the reported test metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [9]:
# Stacked bidirectional LSTM binary classifier over padded token-id sequences.
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which recent Keras releases deprecate and
# ignore. Declaring the shape up front also means the model is built before
# summary() is called, so the summary can report output shapes and parameters.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(max_words, 64),
    Bidirectional(LSTM(64, return_sequences=True)),  # full sequence feeds the next LSTM
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # single probability output for the 0/1 label
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


I0000 00:00:1744412546.709182    5508 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:2b:00.0, compute capability: 8.6


In [10]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see and report how many
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))


Num GPUs Available:  1


In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the validation loss.
# Stop after 3 epochs without improvement and restore the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Multiply the learning rate by 0.2 after 2 stagnant epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6, verbose=1)

training_callbacks = [early_stop, reduce_lr]

# Fit on the training split; validation_split=0.2 sets aside part of it for validation
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    callbacks=training_callbacks,
)



Epoch 1/10


I0000 00:00:1744412551.128064    6184 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 183ms/step - accuracy: 0.8401 - loss: 0.3064 - val_accuracy: 0.9861 - val_loss: 0.0419 - learning_rate: 0.0010
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 202ms/step - accuracy: 0.9821 - loss: 0.0633 - val_accuracy: 0.9876 - val_loss: 0.0397 - learning_rate: 0.0010
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 173ms/step - accuracy: 0.9926 - loss: 0.0248 - val_accuracy: 0.9858 - val_loss: 0.0570 - learning_rate: 0.0010
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - accuracy: 0.9792 - loss: 0.0693
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 190ms/step - accuracy: 0.9792 - loss: 0.0694 - val_accuracy: 0.9868 - val_loss: 0.0444 - learning_rate: 0.0010
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [12]:
# Model outputs are sigmoid probabilities; threshold at 0.5 to get hard 0/1 labels
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype("int32")

# Per-class precision / recall / F1 on the held-out split
report_text = classification_report(y_test, y_pred)
print(report_text)


[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 82ms/step
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4695
           1       0.99      0.99      0.99      4283

    accuracy                           0.99      8978
   macro avg       0.99      0.99      0.99      8978
weighted avg       0.99      0.99      0.99      8978



In [13]:
# Accuracy values logged for the last epoch that actually ran.
# NOTE(review): training uses EarlyStopping(restore_best_weights=True), so the
# weights left in `model` may come from an earlier epoch than these numbers — confirm.
epoch_metrics = history.history
train_acc, val_acc = epoch_metrics['accuracy'][-1], epoch_metrics['val_accuracy'][-1]

print(f"Final Training Accuracy: {train_acc * 100:.2f}%")
print(f"Final Validation Accuracy: {val_acc * 100:.2f}%")


Final Training Accuracy: 99.83%
Final Validation Accuracy: 99.37%


In [14]:
# Write the trained model to disk; the path is relative to the current working directory
model_path = 'saved_models/fake_news_model.h5'
model.save(model_path)




In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Do NOT create or fit a new Tokenizer here.
# X_train holds padded integer id sequences, not raw text, so fitting on
# str(row) would learn a vocabulary of digit strings. Adding an OOV token would
# also shift every word index by one. Either way the tokenizer would no longer
# match the ids the model was trained on, and anything pickled from it would be
# useless at inference time. The tokenizer fitted during preprocessing is the
# one that must be persisted, so it is kept as-is and only validated.
assert isinstance(tokenizer, Tokenizer)

# Guard: re-encoding the first few articles must reproduce the matching rows of X.
probe_texts = df['content'].iloc[:5]
probe_ids = pad_sequences(tokenizer.texts_to_sequences(probe_texts), maxlen=max_len)
assert (probe_ids == X[:5]).all(), "tokenizer in scope does not match the encoding used for training"


In [17]:
import os
import pickle

tokenizer_path = '/home/david/projects/fake_news_detector/saved_models/tokenizer.pkl'

# open() does not create missing parent directories, so make sure the target
# folder exists rather than failing here after a long training run.
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)

# Serialize the tokenizer so inference code can encode text the same way
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)

