In [3]:
import pandas as pd
import os
# Load the two source files; each is expected to provide at least the columns
# 'title', 'text', 'subject' and 'date' (all four are referenced below).
fake_news = pd.read_csv(os.path.join("News_dataset", "Fake.csv"))
real_news = pd.read_csv(os.path.join("News_dataset", "True.csv"))

# Label convention used by the rest of the notebook: 0 = fake, 1 = real.
fake_news['label'] = 0
real_news['label'] = 1

# ignore_index=True gives every row a unique index label. Without it each frame
# keeps its own 0..n-1 index and the combined frame contains duplicate labels.
dataset = pd.concat([fake_news, real_news], axis=0, ignore_index=True)

# The model input is the headline followed by the article body. fillna('') keeps a
# row usable when one of the two fields is missing (NaN + str would yield NaN).
dataset['text'] = dataset['title'].fillna('') + ' ' + dataset['text'].fillna('')

# Only 'text' and 'label' are needed from here on.
dataset = dataset.drop(columns=['subject', 'date', 'title'])

In [4]:
import tensorflow as tf

# Show which TensorFlow build this kernel is running.
tf_version = tf.__version__
print(tf_version)

2.20.0


In [5]:
# Requires: tensorflow, sklearn, pandas, numpy
# pip install tensorflow scikit-learn pandas numpy

import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, Conv1D, GlobalMaxPooling1D,
                                     Bidirectional, LSTM, Dense, Dropout, Layer)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# --------------------------
# 1) Parameters (tweakable)
# --------------------------
RANDOM_STATE = 42     # seed passed to both train_test_split calls
TEST_SIZE = 0.15      # fraction of the full dataset held out as the test set
VAL_SIZE = 0.10   # fraction of training to use as validation
VOCAB_SIZE = 30000    # top words to keep
# Every sample is padded/truncated to MAX_LEN tokens. The model input is the title
# concatenated with the article body, so only the first 60 tokens of each sample
# (title plus the opening of the body) are kept.
MAX_LEN = 60
EMBEDDING_DIM = 128   # output width of both embedding layers
BATCH_SIZE = 64
EPOCHS = 15           # upper bound; EarlyStopping may end training sooner
PATIENCE = 3          # EarlyStopping patience, in epochs without val_loss improvement
MODEL_DIR = "saved_model"   # folder that receives the checkpoint, final model and tokenizer
os.makedirs(MODEL_DIR, exist_ok=True)

# --------------------------
# 2) Utility: clean text
# --------------------------
def clean_text(text):
    """Normalise raw text for tokenisation.

    The input is lower-cased, then URLs, e-mail addresses and every character
    outside ``a-z``, ``0-9`` and whitespace are each replaced by a space.
    Finally, runs of whitespace are collapsed to one space and the ends are
    trimmed.

    Parameters
    ----------
    text : object
        Text to clean. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        cleaned = text.lower()
        # Order matters: URLs and e-mails must go before punctuation is stripped,
        # otherwise their separators ('.', '/', '@') would already be gone.
        for pattern in (r'http\S+|www\.\S+', r'\S+@\S+', r'[^a-z0-9\s]'):
            cleaned = re.sub(pattern, ' ', cleaned)
        return re.sub(r'\s+', ' ', cleaned).strip()
    return ""

# --------------------------
# 3) Prepare dataset
# --------------------------
# Expects `dataset` to be present with the model input string in 'text'
# and the 0/1 target in 'label'.
assert {'text', 'label'}.issubset(dataset.columns), "DataFrame must contain 'text' and 'label' columns"

# Clean text. clean_text() already maps non-string values (e.g. NaN) to "", so no
# astype(str) is applied first: that would turn a missing value into the literal
# token "nan" before cleaning.
dataset['text'] = dataset['text'].map(clean_text)

# Train/test split (stratified so both parts keep the fake/real ratio)
X = dataset['text'].values
y = dataset['label'].values
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                              random_state=RANDOM_STATE, stratify=y)
# Further split train->train+val; VAL_SIZE is a fraction of the remaining training data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full,
                                                  test_size=VAL_SIZE, random_state=RANDOM_STATE,
                                                  stratify=y_train_full)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# --------------------------
# 4) Tokenizer and sequences
# --------------------------
# Vocabulary is learned from the training split only, so validation and test
# words never influence the word index.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

def texts_to_padded_sequences(texts):
    """Encode texts with the fitted tokenizer and pad/truncate them to MAX_LEN.

    Both padding and truncation happen at the end of the sequence ('post'),
    so the first MAX_LEN tokens of each text are the ones that survive.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding='post',
        truncating='post',
    )

# Encode the three splits in the same way.
X_train_seq, X_val_seq, X_test_seq = (
    texts_to_padded_sequences(split) for split in (X_train, X_val, X_test)
)

# --------------------------
# 5) Attention layer (simple)
# --------------------------
class AttentionLayer(Layer):
    """Pool a sequence into a single vector using learned softmax weights.

    A single trainable vector ``W`` (one entry per feature) scores every time
    step; the scores are soft-maxed over the time axis and used to form a
    weighted sum of the inputs.

    Input:  (batch_size, time_steps, features)
    Output: (batch_size, features)
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector, sized to the last input dimension."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim,),
            initializer='random_normal',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over the time axis."""
        # One scalar score per time step: (batch_size, time_steps)
        step_scores = tf.tensordot(inputs, self.W, axes=1)
        # Normalise across time so the weights of each sample sum to 1
        step_weights = tf.nn.softmax(step_scores, axis=1)
        # Broadcast the weights over the feature axis and sum out time
        return tf.reduce_sum(inputs * tf.expand_dims(step_weights, -1), axis=1)

    def get_config(self):
        """Return the base layer config; the layer has no extra constructor arguments."""
        return super().get_config()

# --------------------------
# 6) Model architecture
# --------------------------
def build_model(vocab_size=VOCAB_SIZE, max_len=MAX_LEN, embedding_dim=EMBEDDING_DIM):
    """Build and compile the two-branch binary text classifier.

    Both branches read the same integer token ids, each through its own
    embedding layer:

    * CNN branch:   Embedding -> Conv1D(128, kernel 3) -> GlobalMaxPooling1D
    * BiLSTM branch: Embedding -> Bidirectional(LSTM(64)) -> AttentionLayer

    The two pooled vectors are concatenated and passed through two
    Dense/Dropout stages to a single sigmoid unit.

    Parameters
    ----------
    vocab_size : int
        Size of the embedding tables (number of distinct token ids).
    max_len : int
        Length of the padded input sequences.
    embedding_dim : int
        Width of each embedding vector.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (lr 1e-3), binary cross-entropy loss and
        accuracy as the metric.
    """
    inp = Input(shape=(max_len,), dtype='int32', name='input_ids')

    # `input_length` is deprecated in Keras 3 and only triggers a warning there;
    # the sequence length is already fixed by the Input shape above.
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embedding')(inp)
    # small Conv block to capture n-gram features
    x = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(x)
    x = GlobalMaxPooling1D()(x)  # global pooling gives a quick baseline

    # Also create a sequential branch with BiLSTM to capture order.
    # NOTE(review): 'embegging2' looks like a typo for 'embedding2'; the name is kept
    # as-is because already-saved models/checkpoints may reference it.
    y = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embegging2')(inp)
    y = Bidirectional(LSTM(64, return_sequences=True))(y)
    # Attention to compress LSTM outputs
    y = AttentionLayer()(y)

    # Concatenate both representations
    concat = tf.keras.layers.Concatenate()([x, y])
    z = Dense(128, activation='relu')(concat)
    z = Dropout(0.4)(z)
    z = Dense(64, activation='relu')(z)
    z = Dropout(0.3)(z)
    out = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Seed the Python, NumPy and TensorFlow RNGs before the weights are created so that
# repeated runs start from the same initial state.
tf.keras.utils.set_random_seed(RANDOM_STATE)
model = build_model()
model.summary()

# --------------------------
# 7) Class weights (handles imbalance)
# --------------------------
# 'balanced' weights are computed from the training labels only, then keyed by
# the integer class id, which is the dict form passed to model.fit(class_weight=...).
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train)
balanced_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(map(int, classes), balanced_weights))
print("Class weights:", class_weights)

# --------------------------
# 8) Callbacks
# --------------------------
checkpoint_path = os.path.join(MODEL_DIR, "best_model_checkpoint.h5")

# Stop once val_loss has not improved for PATIENCE epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
    verbose=1,
)
# Write the model to disk whenever val_loss reaches a new minimum.
best_checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate after 2 epochs without val_loss improvement.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

callbacks = [early_stopping, best_checkpoint, lr_on_plateau]

# --------------------------
# 9) Training
# --------------------------
# Train on the encoded training split, monitoring the validation split each epoch.
# verbose=2 prints one summary line per epoch instead of a progress bar.
history = model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val_seq, y_val),
    class_weight=class_weights,
)

# --------------------------
# 10) Evaluation on test set
# --------------------------
# Sigmoid outputs, flattened to one probability per test sample
y_pred_prob = model.predict(X_test_seq).ravel()
# Probability >= 0.5 is classified as label 1
y_pred = (y_pred_prob >= 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
try:
    roc_auc = roc_auc_score(y_test, y_pred_prob)
except ValueError as err:
    # ROC AUC cannot be computed for some inputs (for example when y_test holds a
    # single class). Report why instead of silently dropping the metric; any other
    # kind of error is left to surface.
    roc_auc = None
    print(f"ROC AUC unavailable: {err}")

print("\nTest metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1: {f1:.4f}")
if roc_auc is not None:
    print(f"ROC AUC: {roc_auc:.4f}")

# --------------------------
# 11) Save final model + tokenizer
# --------------------------
# Persist the trained model in the native .keras format
model.save(os.path.join(MODEL_DIR, "best_model.keras"))
# The fitted tokenizer is pickled alongside it so inference can reuse the same word index
import pickle
with open(os.path.join(MODEL_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)

# Only the model and the tokenizer are written; no label encoder is created in this notebook.
print(f"Model and tokenizer saved to '{MODEL_DIR}'")

Train: 34346, Val: 3817, Test: 6735







Class weights: {0: np.float64(0.9560207092356511), 1: np.float64(1.0482207165964719)}
Epoch 1/15

Epoch 1: val_loss improved from None to 0.00409, saving model to saved_model\best_model_checkpoint.h5




537/537 - 93s - 172ms/step - accuracy: 0.9866 - loss: 0.0370 - val_accuracy: 0.9987 - val_loss: 0.0041 - learning_rate: 1.0000e-03
Epoch 2/15

Epoch 2: val_loss did not improve from 0.00409
537/537 - 72s - 134ms/step - accuracy: 0.9997 - loss: 0.0019 - val_accuracy: 0.9982 - val_loss: 0.0082 - learning_rate: 1.0000e-03
Epoch 3/15

Epoch 3: val_loss did not improve from 0.00409

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
537/537 - 69s - 128ms/step - accuracy: 0.9998 - loss: 8.3669e-04 - val_accuracy: 0.9990 - val_loss: 0.0052 - learning_rate: 1.0000e-03
Epoch 4/15

Epoch 4: val_loss did not improve from 0.00409
537/537 - 62s - 116ms/step - accuracy: 0.9999 - loss: 1.6684e-04 - val_accuracy: 0.9987 - val_loss: 0.0076 - learning_rate: 5.0000e-04
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step

Test metrics:
Accuracy: 0.9996
Precision: 0.9994
Recal

In [6]:
from keras.models import load_model

# Load from the same MODEL_DIR-based path the training cell saved to, so changing
# MODEL_DIR does not break this cell. AttentionLayer is a custom class and has to be
# supplied through custom_objects for deserialisation.
model1 = load_model(os.path.join(MODEL_DIR, "best_model.keras"),
                    custom_objects={"AttentionLayer": AttentionLayer})
model1.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [7]:
import pickle  # imported here so this cell does not depend on an import buried in another cell

# NOTE: pickle.load can execute arbitrary code from the file; only load tokenizer
# files that this notebook (or another trusted source) produced.
with open(os.path.join(MODEL_DIR, 'tokenizer.pkl'), 'rb') as file:
    tk = pickle.load(file)

In [8]:
# --------------------------
# 12) Predict helper
# --------------------------
def predict_texts(texts, model, tokenizer, max_len=MAX_LEN):
    """Classify one or more news texts as "Real" or "Fake".

    Each text is cleaned with ``clean_text``, encoded with ``tokenizer``,
    padded/truncated (at the end) to ``max_len`` tokens and scored by ``model``.
    A probability >= 0.5 maps to "Real" (label 1), anything lower to "Fake"
    (label 0).

    Parameters
    ----------
    texts : str or sequence of str
        A single text or a collection of texts. A bare string is treated as
        one text, not as a sequence of characters.
    model : object
        Trained model exposing ``predict(sequences)``.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences(texts)``.
    max_len : int
        Sequence length the model was trained with.

    Returns
    -------
    tuple
        For exactly one text: ``(label, probability)``.
        For several texts: ``(list_of_labels, probabilities_array)`` in input order.

    Raises
    ------
    ValueError
        If ``texts`` is empty.
    """
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_text(t) for t in texts]
    if not cleaned:
        raise ValueError("predict_texts() requires at least one text")

    seqs = tokenizer.texts_to_sequences(cleaned)
    seqs = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    probs = model.predict(seqs).ravel()

    # Decide per element; evaluating a multi-element array in a plain `if`
    # raises "truth value of an array is ambiguous".
    labels = ["Real" if p >= 0.5 else "Fake" for p in probs]
    if len(labels) == 1:
        # Single-text calls get the scalar (label, probability) pair.
        return labels[0], probs[0]
    return labels, probs


# Example usage: classify a single article excerpt with the reloaded model (model1)
# and tokenizer (tk). One text is passed, so despite the plural names `preds`
# receives one label string ("Real"/"Fake") and `probs` one probability.
preds, probs = predict_texts(['''hange isn’t going to be handed to us from the elite power brokers and donors controlling the political parties that got us here, and it sure won’t come from the same oligarchs and zealots pillaging and plundering our societies, our democratic institutions, our economy, and our planet. If we expect to see a future that’s still worth living in, then poor, working-class, and oppressed people across the global underclass will need to fight for it. And TRNN will be there on the front lines of the fight with cameras and microphones. 

Continuing our longstanding commitment to making media that empowers people and movements to make change, TRNN is responding to these societal crises by expanding our coverage with a slate of new and returning programs that uplift the voices and struggles of working people around the world, challenge power, and amplify resistance to exploitation, injustice, and domination. From Rattling the Bars, Police Accountability Report, The Marc Steiner Show, Inequality Watch, and Working People to Solidarity Without Exception, Stories of Resistance, Edge of Sports, and more, we are launching groundbreaking new series while reviving and elevating the storytelling formats of fan-favorite shows to engage and activate more people around the world, pierce the algorithmic noise and misinformation, and cut through the corporate media silence. 

This isn’t just about expanding our content—it’s about deepening our commitment to using our resources and talents as journalists and media makers to serve, inform, connect, and empower people at a time when the fate of our society and our planet hangs on the people’s willingness and ability to fight for them. We’re levelling up to meet the moment, telling the stories that corporate media won’t touch and amplifying the voices of those fighting for justice.'''], model1, tk)
print("Predicted label:", preds, probs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 650ms/step
Predicted label: Fake 8.470616e-06
