Project: Fake News Detection Using BERT

Notebook Path: Code/model_bert.ipynb
Dataset Location: ../dataset/*.csv
Output Model Directory: ../models/*_model/

Note: This notebook starts from already-cleaned datasets, which are generated by running the other Python scripts in the same project directory.

Recommended Platform: Google Colab (GPU-enabled)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import os

# Location of the cleaned FakeNewsNet CSV, relative to this notebook
base_dir = os.path.join("..", "dataset")
data_path = os.path.join(base_dir, "cleaned_fakenewsnet.csv")

# Load cleaned data
df = pd.read_csv(data_path)

# Use clean_text if available, else clean_title.
# The fallback used to be the raw 'title' column, which contradicted this
# comment and the column chosen by every other cell in the notebook.
text_column = 'clean_text' if 'clean_text' in df.columns else 'clean_title'

# Features and labels (texts are cast to str so every document is a string)
X = df[text_column].astype(str)
y = df['label']

# Train/test split on the raw documents first, so the vectorizer below never
# sees held-out text. Same test_size/random_state as before, so the row
# assignment to each split is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF vectorization: vocabulary and IDF weights are learned from the
# training documents only and then applied unchanged to the test documents.
# Fitting on the full corpus (as was done previously) leaks test-set
# statistics into the features and inflates the reported metrics.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluation: text report of the held-out predictions
nb_report = classification_report(y_test, y_pred)
print("📊 Classification Report:\n", nb_report)

# Confusion matrix drawn as an annotated heatmap.
# Tick labels assume label 0 = Fake and label 1 = Real (the mapping used by
# the inference cell later in this notebook).
class_names = ['Fake', 'Real']
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Naive Bayes Confusion Matrix')
plt.show()


# Directory for persisted artifacts; created if it does not exist yet
model_dir = os.path.join("..", "models")
os.makedirs(model_dir, exist_ok=True)

# Persist the classifier together with the vectorizer it was trained with
nb_model_file = os.path.join(model_dir, "naive_bayes_model.pkl")
nb_vectorizer_file = os.path.join(model_dir, "tfidf_vectorizer.pkl")
joblib.dump(model, nb_model_file)
joblib.dump(vectorizer, nb_vectorizer_file)

In [None]:
# !pip install textattack
# !pip install transformers
# !pip install torch
# !pip install tensorflow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

import os

# Parameters
MAX_VOCAB = 10000      # vocabulary cap passed to the tokenizer and embedding
MAX_LENGTH = 300       # every sequence is padded/truncated to this length
EMBEDDING_DIM = 100

# Load preprocessed dataset from local path
df = pd.read_csv(os.path.join("..", "dataset", "cleaned_fakenewsnet.csv"))

# Use 'clean_text' if available, fallback to 'clean_title'
text_column = 'clean_text' if 'clean_text' in df.columns else 'clean_title'
X = df[text_column].astype(str).values
y = df['label'].values

# Split the raw texts first. Same test_size/random_state as before, so the
# same rows end up in each split.
train_texts, test_texts, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the tokenizer on the training texts only. It was previously fitted on
# the whole corpus, which let test-set vocabulary leak into training; unseen
# test words now map to the <OOV> token instead.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert each split to fixed-length integer sequences (zeros appended at the end)
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LENGTH, padding='post'
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LENGTH, padding='post'
)

# BiLSTM classifier assembled layer by layer:
# embedding -> bidirectional LSTM -> dropout -> dense -> dropout -> sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=MAX_VOCAB, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Fit for three epochs.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not independent of the final evaluation.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=128,
)

# Evaluation: threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
y_pred_prob = model.predict(X_test)
is_positive = y_pred_prob > 0.5
y_pred = is_positive.astype(int).flatten()

bilstm_report = classification_report(y_test, y_pred)
print("\n📊 Classification Report:\n", bilstm_report)

# Confusion matrix heatmap (tick labels assume 0 = Fake, 1 = Real)
class_names = ['Fake', 'Real']
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('BiLSTM Confusion Matrix')
plt.show()

# Save model locally
model_dir = os.path.join("..", "models")
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, "bilstm_model.h5"))

# Also persist the fitted tokenizer. The network only understands the integer
# ids produced by this exact word index, so the saved model cannot be used for
# inference without it (the Naive Bayes cell likewise saves its vectorizer
# next to its model).
with open(os.path.join(model_dir, "bilstm_tokenizer.json"), "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())


In [None]:
import nltk
# nltk.download('averaged_perceptron_tagger')


import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm



# Fine-tuning configuration
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
# Run on the GPU when one is visible to torch, otherwise on the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the cleaned dataset
dataset_csv = os.path.join("..", "dataset", "cleaned_fakenewsnet.csv")
df = pd.read_csv(dataset_csv)

# Prefer the cleaned body text; fall back to the cleaned title
if 'clean_text' in df.columns:
    text_column = 'clean_text'
else:
    text_column = 'clean_title'
texts = df[text_column].astype(str).tolist()
labels = df['label'].tolist()

# 80/20 split with a fixed seed
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Pretrained tokenizer matching the checkpoint named above
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class FakeNewsDataset(Dataset):
    """Torch dataset of tokenized news texts and their labels.

    All texts are tokenized once in the constructor using the module-level
    ``tokenizer`` and ``MAX_LEN``; items are converted to tensors on access.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensor-ize the encoded fields for this example, then attach its label
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits as datasets
train_dataset = FakeNewsDataset(train_texts, train_labels)
test_dataset = FakeNewsDataset(test_texts, test_labels)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)

# Pretrained BERT with a sequence-classification head, moved to the target device
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# AdamW over every model parameter
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tune: one optimizer step per mini-batch, gradients cleared after each step
model.train()
for epoch in range(EPOCHS):
    print(f"\n⏳ Epoch {epoch + 1}/{EPOCHS}")
    for batch in tqdm(train_loader):
        # Move the tensors of this batch to the training device
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Passing labels makes the model return the loss along with the logits
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Evaluation: gather predicted and true class ids over the whole test loader
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # No labels are passed here, so only the logits are used
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Metrics
bert_report = classification_report(all_labels, all_preds)
print("\n📊 Classification Report:\n", bert_report)

# Confusion matrix heatmap (tick labels assume 0 = Fake, 1 = Real)
class_names = ['Fake', 'Real']
cm = confusion_matrix(all_labels, all_preds)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('BERT Confusion Matrix')
plt.show()


# Write the fine-tuned model and its tokenizer to the same directory, so the
# pair can later be reloaded from that single path
save_path = os.path.join("..", "models", "bert_model")
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)



In [None]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torch import nn
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Configuration
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the cleaned dataset
dataset_csv = os.path.join("..", "dataset", "cleaned_fakenewsnet.csv")
df = pd.read_csv(dataset_csv)

# Prefer the cleaned body text; fall back to the cleaned title
if 'clean_text' in df.columns:
    text_column = 'clean_text'
else:
    text_column = 'clean_title'
texts = df[text_column].astype(str).tolist()
labels = df['label'].tolist()

# Stand-in "image" features: seeded uniform random vectors, one 2048-dim row
# per example. They carry no information about the articles.
np.random.seed(42)
n_examples = len(df)
image_embeds = np.random.rand(n_examples, 2048).astype(np.float32)

# Stratified 80/20 split; texts, labels and image vectors are split together
# so they stay aligned
(
    train_texts,
    test_texts,
    train_labels,
    test_labels,
    train_imgs,
    test_imgs,
) = train_test_split(
    texts,
    labels,
    image_embeds,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class MultimodalFakeNewsDataset(Dataset):
    """Torch dataset pairing tokenized text with an image-embedding vector.

    Texts are tokenized once in the constructor using the module-level
    ``tokenizer`` and ``MAX_LEN``; each item is converted to tensors on access.
    """

    def __init__(self, texts, labels, image_embeds):
        self.labels = labels
        self.image_embeds = image_embeds
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Text fields first, then the image vector and the label
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['image_embed'] = torch.tensor(self.image_embeds[idx])
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

# Datasets for both splits
train_dataset = MultimodalFakeNewsDataset(train_texts, train_labels, train_imgs)
test_dataset = MultimodalFakeNewsDataset(test_texts, test_labels, test_imgs)

# Shuffle only while training; the test loader keeps dataset order
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)

# Multimodal Model
class SimulatedVisualBERT(nn.Module):
    """BERT text encoder fused with a projected image-embedding vector.

    The first-token hidden state from BERT is concatenated with a linear
    projection of the 2048-dim image vector and passed through a small
    classifier that ends in a sigmoid, so ``forward`` returns one value in
    (0, 1) per example.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Project the image vector to the width of BERT's hidden state (768)
        self.image_fc = nn.Linear(2048, 768)
        # Layer order is kept as-is so state_dict keys remain unchanged
        classifier_layers = [
            nn.ReLU(),
            nn.Linear(768 * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

    def forward(self, input_ids, attention_mask, image_embed):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 ([CLS]) for every example in the batch
        text_features = encoded.last_hidden_state[:, 0]
        image_features = self.image_fc(image_embed)
        fused = torch.cat((text_features, image_features), dim=1)
        return self.classifier(fused)

# Model, loss and optimizer. BCELoss expects probabilities, which matches the
# sigmoid at the end of the model's classifier.
model = SimulatedVisualBERT().to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training
model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        image_embed = batch['image_embed'].to(DEVICE)
        # BCELoss needs float targets
        labels = batch['label'].float().to(DEVICE)

        # Drop only the trailing singleton dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() also collapses the batch dimension when the final
        # batch holds a single example, giving a 0-d tensor whose shape no
        # longer matches the (1,) label tensor and making BCELoss raise.
        outputs = model(input_ids, attention_mask, image_embed).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Evaluation
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        image_embed = batch['image_embed'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        # Drop only the trailing singleton dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() turns a single-example final batch into a 0-d
        # tensor, and extending a list with the resulting 0-d numpy array
        # raises "iteration over a 0-d array".
        outputs = model(input_ids, attention_mask, image_embed).squeeze(-1)
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
        preds = (outputs > 0.5).long()
        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

print("\n📊 Classification Report:\n", classification_report(all_true, all_preds))

# Confusion matrix heatmap (tick labels assume 0 = Fake, 1 = Real)
cm = confusion_matrix(all_true, all_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Simulated VisualBERT Confusion Matrix')
plt.show()

# Path to save model locally. torch.save does not create missing parent
# directories, so make sure the models folder exists first (the Naive Bayes
# and BiLSTM cells do the same before saving).
model_dir = os.path.join("..", "models")
os.makedirs(model_dir, exist_ok=True)
output_model_path = os.path.join(model_dir, "visualbert_simulated.pth")
# Only the weights (state_dict) are stored, not the model class itself
torch.save(model.state_dict(), output_model_path)

In [None]:
# Notebook-only setup: the leading "!" is an IPython shell escape, so this line
# is not valid in a plain Python script.
!pip install nltk
import nltk
# Download NLTK resources up front.
# NOTE(review): presumably these are needed by the TextAttack recipe used
# below — confirm which of the three are actually required.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

# !pip install textattack
# !pip install transformers
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.datasets import Dataset
from textattack import Attacker, AttackArgs
import torch

# Load the fine-tuned model together with the tokenizer that was saved next to
# it. The tokenizer load used to be commented out, so the wrapper below picked
# up whatever global `tokenizer` an earlier cell had left behind (or raised
# NameError in a fresh session).
model_path = os.path.join("..", "models", "bert_model")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Wrap model for TextAttack
wrapped_model = HuggingFaceModelWrapper(model, tokenizer)

# Candidate pool for the attack: a seeded random sample of 2000 rows
dataset_csv = os.path.join("..", "dataset", "cleaned_fakenewsnet.csv")
df = pd.read_csv(dataset_csv)
df = df.sample(n=2000, random_state=42)

if 'clean_text' in df.columns:
    text_column = 'clean_text'
else:
    text_column = 'clean_title'
texts = df[text_column].astype(str).tolist()
labels = df['label'].tolist()

# TextAttack dataset of (text, label) pairs
text_label_pairs = list(zip(texts, labels))
attack_dataset = Dataset(text_label_pairs)

# TextFooler recipe built against the wrapped classifier
attack = TextFoolerJin2019.build(wrapped_model)

# Attack 20 examples and log every result to a CSV file in the dataset folder
attack_log_csv = os.path.join("..", "dataset", "bert_textfooler_attack_log.csv")
attack_args = AttackArgs(
    num_examples=20,
    log_to_csv=attack_log_csv,
    disable_stdout=True,
    attack_n=True,
)

# Run attack
attacker = Attacker(attack, attack_dataset, attack_args)
attacker.attack_dataset()


In [None]:
import pandas as pd

# Load attack log
log_path = os.path.join("..", "dataset", "bert_textfooler_attack_log.csv")
df = pd.read_csv(log_path)

# Partition the log by outcome. TextAttack also logs 'Skipped' rows for
# examples the model already misclassified before any perturbation; those were
# never attacked and must not be counted as "correct before attack".
successful_attacks = df[df['result_type'] == 'Successful']
failed_attacks = df[df['result_type'] == 'Failed']
skipped_attacks = df[df['result_type'] == 'Skipped']

# Metrics
total = len(df)
success = len(successful_attacks)
failed = len(failed_attacks)
skipped = len(skipped_attacks)
attacked = success + failed  # examples that were actually perturbed

original_correct = attacked  # only non-skipped examples were correct originally
after_attack_correct = failed  # only those not fooled

# Guard the ratios so an empty log (or one with only skipped rows) reports 0
# instead of raising ZeroDivisionError.
adversarial_accuracy = after_attack_correct / total if total else 0.0
attack_success_rate = success / attacked if attacked else 0.0

print("📊 Adversarial Robustness Report (TextFooler on BERT):")
print(f"🔹 Total Examples Attacked: {attacked}")
print(f"⏭️ Skipped (Misclassified Before Attack): {skipped}")
print(f"✅ Correct Before Attack: {original_correct}")
print(f"❌ Fooled (Misclassified): {success}")
print(f"✅ Resisted Attack: {failed}")
print(f"📉 Adversarial Accuracy: {adversarial_accuracy:.2f}")
print(f"⚠️ Attack Success Rate: {attack_success_rate:.2f}")


labels = ['Correct After Attack', 'Fooled by Attack']
values = [failed, success]

plt.bar(labels, values, color=['green', 'red'])
plt.title('BERT Robustness to TextFooler Attack')
plt.ylabel('Number of Samples')
plt.show()


In [14]:
import pandas as pd

# Load original dataset (small subset for now)
clean_path = os.path.join("..", "dataset", "cleaned_fakenewsnet.csv")
clean_df = pd.read_csv(clean_path)

clean_df = clean_df.sample(n=100, random_state=42)

# Load adversarial log
adv_path = os.path.join("..", "dataset", "bert_textfooler_attack_log.csv")
adv_df = pd.read_csv(adv_path)

# Only keep successful attacks
adv_df = adv_df[adv_df['result_type'] == 'Successful']

# TextAttack's CSV logger wraps every changed word in [[double brackets]] to
# highlight the edit. Strip those markers so the model is fine-tuned on the
# actual adversarial sentence rather than on the logging markup.
perturbed_text = (
    adv_df['perturbed_text']
    .astype(str)
    .str.replace('[[', '', regex=False)
    .str.replace(']]', '', regex=False)
)

# Rename columns to match format; adversarial examples keep their true label
adv_examples = pd.DataFrame({
    'text': perturbed_text,
    'label': adv_df['ground_truth_output']
})
adv_examples['label'] = adv_examples['label'].astype(int)

# Use same column name as original clean text column
text_column = 'clean_text' if 'clean_text' in clean_df.columns else 'clean_title'
clean_examples = clean_df[[text_column, 'label']].rename(columns={text_column: 'text'})

# Combine both datasets
combined_df = pd.concat([clean_examples, adv_examples], ignore_index=True)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Parameters
BATCH_SIZE = 8
EPOCHS = 3
MAX_LEN = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model from the base fine-tuned checkpoint.
# This used to read from "bert_model_adversarial", the directory this cell
# itself writes at the end, so a first run failed because nothing existed
# there yet; adversarial training must start from the original "bert_model".
model_path = os.path.join("..", "models", "bert_model")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(DEVICE)

# Prepare dataset (clean + adversarial examples assembled in the previous cell)
texts = combined_df['text'].astype(str).tolist()
labels = combined_df['label'].tolist()

class BERTDataset(Dataset):
    """Torch dataset of tokenized texts and labels for BERT fine-tuning.

    All texts are tokenized once in the constructor using the module-level
    ``tokenizer`` and ``MAX_LEN``; items are converted to tensors on access.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoded fields first, then the label under the key the model expects
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Dataset and shuffled loader over the combined clean + adversarial examples
dataset = BERTDataset(texts, labels)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tune, reporting the mean batch loss after every epoch
model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Passing labels makes the model return the loss along with the logits
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    mean_loss = total_loss / len(dataloader)
    print(f"🔁 Epoch {epoch + 1} average loss: {mean_loss:.4f}")

# Write the updated model and its tokenizer to the adversarial model directory
output_path = os.path.join("..", "models", "bert_model_adversarial")
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_path)

print("✅ Adversarially trained BERT model saved.")


In [None]:
# !pip install textattack
# !pip install transformers

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.datasets import Dataset
from textattack import Attacker, AttackArgs
import torch

# Reload the adversarially fine-tuned checkpoint (model and tokenizer come
# from the same directory) and switch it to inference mode
model_path = os.path.join("..", "models", "bert_model_adversarial")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Adapter that lets TextAttack query the Hugging Face model
wrapped_model = HuggingFaceModelWrapper(model=model, tokenizer=tokenizer)

# Candidate pool for the attack: a seeded random sample of 200 rows.
# NOTE(review): the first attack cell samples 2000 rows and attacks
# num_examples=20, whereas this cell samples 200 and attacks 200 — confirm
# which setting is intended before comparing the two reports.
csv_path = os.path.join("..", "dataset", "cleaned_fakenewsnet.csv")
df = pd.read_csv(csv_path)
df = df.sample(n=200, random_state=42)

if 'clean_text' in df.columns:
    text_column = 'clean_text'
else:
    text_column = 'clean_title'
texts = df[text_column].astype(str).tolist()
labels = df['label'].tolist()

# TextAttack dataset of (text, label) pairs
text_label_pairs = list(zip(texts, labels))
attack_dataset = Dataset(text_label_pairs)

# Same TextFooler recipe, now built against the adversarially trained model
attack = TextFoolerJin2019.build(wrapped_model)

# Results go to their own CSV so the first attack log is not overwritten
attack_log_csv = os.path.join("..", "dataset", "bert_adversarially_trained_textfooler.csv")
attack_args = AttackArgs(
    num_examples=200,
    log_to_csv=attack_log_csv,
    disable_stdout=True,
    attack_n=True,
)

attacker = Attacker(attack, attack_dataset, attack_args)
attacker.attack_dataset()


In [None]:
import pandas as pd

# Load attack log
df = pd.read_csv(os.path.join("..", "dataset", "bert_adversarially_trained_textfooler.csv"))

# Partition the log by outcome. TextAttack also logs 'Skipped' rows for
# examples the model already misclassified before any perturbation; those were
# never attacked and must not be counted as "correct before attack".
successful_attacks = df[df['result_type'] == 'Successful']
failed_attacks = df[df['result_type'] == 'Failed']
skipped_attacks = df[df['result_type'] == 'Skipped']

# Metrics
total = len(df)
success = len(successful_attacks)
failed = len(failed_attacks)
skipped = len(skipped_attacks)
attacked = success + failed  # examples that were actually perturbed

original_correct = attacked  # only non-skipped examples were correct originally
after_attack_correct = failed  # only those not fooled

# Guard the ratios so an empty log (or one with only skipped rows) reports 0
# instead of raising ZeroDivisionError.
adversarial_accuracy = after_attack_correct / total if total else 0.0
attack_success_rate = success / attacked if attacked else 0.0

print("📊 Adversarial Robustness Report (TextFooler on BERT):")
print(f"🔹 Total Examples Attacked: {attacked}")
print(f"⏭️ Skipped (Misclassified Before Attack): {skipped}")
print(f"✅ Correct Before Attack: {original_correct}")
print(f"❌ Fooled (Misclassified): {success}")
print(f"✅ Resisted Attack: {failed}")
print(f"📉 Adversarial Accuracy: {adversarial_accuracy:.2f}")
print(f"⚠️ Attack Success Rate: {attack_success_rate:.2f}")


labels = ['Correct After Attack', 'Fooled by Attack']
values = [failed, success]

plt.bar(labels, values, color=['green', 'red'])
plt.title('BERT Robustness to TextFooler Attack')
plt.ylabel('Number of Samples')
plt.show()


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Step 1: Read the preprocessed texts to classify
test_csv = os.path.join("..", "dataset", "cleaned_test_data.csv")  # adjust path
df = pd.read_csv(test_csv)
texts = df['clean_text'].astype(str).tolist()

# Step 2: Load the BERT checkpoint and its tokenizer, then move the model to
# the GPU when one is available
model_path = os.path.join("..", "models", "bert_model")  # or switch to "bert_model_adversarial"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Step 3 + 4: Tokenize and predict in mini-batches.
# Encoding the whole dataset and pushing it through BERT in a single forward
# pass makes memory use grow with the dataset size and runs out of GPU/CPU
# memory on anything but a tiny file. Processing fixed-size chunks keeps
# memory bounded; padding is applied per chunk and masked out by the
# attention mask.
INFERENCE_BATCH_SIZE = 32
prob_chunks = []

with torch.no_grad():
    for start in range(0, len(texts), INFERENCE_BATCH_SIZE):
        inputs = tokenizer(
            texts[start:start + INFERENCE_BATCH_SIZE],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=256
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        # Move each chunk back to the CPU right away so results do not pile up
        # on the accelerator.
        prob_chunks.append(torch.softmax(outputs.logits, dim=1).cpu())

# Class probabilities for every row, in the original row order. `probs` stays
# a tensor because the visualization cell reads the confidence scores from it.
if prob_chunks:
    probs = torch.cat(prob_chunks)
else:
    # Empty input file: keep the (rows, classes) shape so later code still works
    probs = torch.empty((0, model.config.num_labels))
preds = torch.argmax(probs, dim=1).cpu().numpy()

# Step 5: Attach the predicted class ids and their readable names.
# Note that 'label' is overwritten with the predicted name (0 -> Fake, 1 -> Real).
class_id_to_name = {0: 'Fake', 1: 'Real'}
df['prediction'] = preds
df['label'] = df['prediction'].map(class_id_to_name)

# Step 6: Write text + predicted label to CSV, then show the table and the
# per-class counts
predictions_csv = os.path.join("..", "dataset", "bert_predictions_on_cleaned_news.csv")
prediction_table = df[['clean_text', 'label']]
prediction_table.to_csv(predictions_csv, index=False)
print(prediction_table)
print(df['label'].value_counts())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# How many rows were predicted as each class
sns.countplot(x='label', data=df, palette='Set2')
plt.title('Distribution of BERT Predictions')
plt.xlabel('Predicted Label')
plt.ylabel('Count')
plt.show()


# Confidence = highest class probability of each row
top_probabilities = probs.max(dim=1).values
df['confidence'] = top_probabilities.cpu().numpy()

sns.histplot(df['confidence'], bins=20, kde=True, color='skyblue')
plt.title('BERT Prediction Confidence Distribution')
plt.xlabel('Confidence')
plt.ylabel('Number of Samples')
plt.show()


# Five most confident predictions for each class, highest confidence first
for predicted_class, heading in (
    ('Fake', "\n🔹 Most Confident Fake News Predictions:"),
    ('Real', "\n🔹 Most Confident Real News Predictions:"),
):
    print(heading)
    ranked = df[df['label'] == predicted_class].sort_values(by='confidence', ascending=False)
    print(ranked[['clean_text', 'confidence']].head(5))


# Write text, predicted label and confidence to CSV
confidence_csv = os.path.join("..", "dataset", "bert_predictions_with_confidence.csv")
df[['clean_text', 'label', 'confidence']].to_csv(confidence_csv, index=False)
