In [None]:
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df_fake = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Fake.csv')
df_true = pd.read_csv('/content/drive/My Drive/Colab Notebooks/True.csv')

# Label datasets
df_true['status'] = 0  # 0 for true
df_fake['status'] = 1  # 1 for fake

# Combine datasets and shuffle.
# The shuffle is seeded so the row order, and therefore the seeded
# train/test split that is taken from it, is identical on every run.
df = (
    pd.concat([df_true, df_fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


Mounted at /content/drive


In [None]:
# Define text cleaning function
# The pattern matches (a) any token that starts with the digit 0 and
# (b) any run of non-alphanumeric characters that begins at a word boundary.
text_cleaning = r"\b0\S*|\b[^A-Za-z0-9]+"
def preprocess_filter(text):
    """Lower-case ``text`` and replace every ``text_cleaning`` match with a space.

    Args:
        text: The value to clean. Non-string values (for example a NaN or a
            number read from a CSV column) are converted with ``str()`` first.

    Returns:
        The cleaned string. Surrounding whitespace is stripped *before* the
        substitution, so a trailing match still leaves one trailing space.
    """
    # Convert to str before calling .lower(): the previous order,
    # str(text.lower()), raised AttributeError for any non-string input.
    text = re.sub(text_cleaning, " ", str(text).lower().strip())
    return text

# Run every headline through preprocess_filter and store the result back
# in the same column.
cleaned_titles = df['title'].apply(preprocess_filter)
df['title'] = cleaned_titles

In [None]:
# Model inputs and labels.
# NewsDataset tokenizes each sample with the DistilBERT tokenizer, so it must
# receive the cleaned title *strings*. Passing Keras integer sequences here
# made NewsDataset call str() on arrays such as [0 0 ... 12 45], which trained
# the model on stringified token ids instead of on the headlines.
max_length = 40  # tokens per title; passed to NewsDataset as max_len
# A positional ndarray (not a Series) so that integer indexing inside the
# Dataset still works after train_test_split has shuffled the rows.
X = df['title'].astype(str).to_numpy()
y = df['status'].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Set up the DistilBERT tokenizer and classifier

# Both objects are created from the same pretrained checkpoint name; the
# classifier is requested with a two-label output (num_labels=2).
bert_checkpoint = 'distilbert-base-uncased'
tokenizer_bert = DistilBertTokenizer.from_pretrained(bert_checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 3: Dataset Class for BERT

class NewsDataset(Dataset):
    """Torch dataset that tokenizes one headline per item.

    Args:
        texts: Indexable collection of headlines; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of integer class labels aligned
            with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer that accepts
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded sample at position ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        # truncation=True makes the cut at max_length explicit, so every item
        # has the same length and the default collate function can stack them.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten
        # drops it so the DataLoader adds the real batch dimension.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Wrap each split in a NewsDataset, then batch it
train_dataset = NewsDataset(X_train, y_train, tokenizer_bert, max_length)
test_dataset = NewsDataset(X_test, y_test, tokenizer_bert, max_length)

# Only the training loader reshuffles its samples
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Tokenization using DistilBERT
def tokenize_data(texts):
    """Batch-encode ``texts`` with the DistilBERT tokenizer.

    Args:
        texts: A string or list of strings to encode.

    Returns:
        The tokenizer's encoding as PyTorch tensors, truncated to at most
        256 tokens and padded to the longest sequence in the batch.
    """
    # Use tokenizer_bert explicitly: the bare name `tokenizer` is rebound in
    # several cells and does not always refer to a DistilBERT tokenizer.
    return tokenizer_bert(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
model.to(device)

# Step 3: Training Function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors; must expose ``.dataset``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, avg_loss)``. ``accuracy`` is a Python float: correct
        predictions divided by ``len(data_loader.dataset)``. ``avg_loss`` is
        the mean of the per-batch losses (NaN when there were no batches).
    """
    model = model.train()  # Set model to training mode
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        optimizer.zero_grad()  # Zero out gradients from previous step

        # Move batch to the target device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing labels makes the model return the loss too
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Predicted class is the index of the largest logit.
        # .item() keeps the counter a plain int, so the returned accuracy is
        # a Python float rather than a tensor living on `device` (which
        # matplotlib cannot plot when the device is CUDA).
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    # Guard the empty case instead of dividing by zero
    n_samples = len(data_loader.dataset)
    accuracy = correct_predictions / n_samples if n_samples else 0.0
    avg_loss = np.mean(losses) if losses else float('nan')

    return accuracy, avg_loss


In [None]:
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors; must expose ``.dataset``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, avg_loss, precision, recall, f1)``. ``accuracy`` is a
        Python float; precision, recall and F1 use weighted averaging.
    """
    model = model.eval()  # Set model to evaluation mode
    correct_predictions = 0
    losses = []
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # .item() keeps the counter a plain int so the accuracy returned
            # below is a Python float, not a tensor on `device`.
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            losses.append(loss.item())

            # Store predictions and labels for metrics calculation
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Guard the empty case instead of dividing by zero
    n_samples = len(data_loader.dataset)
    accuracy = correct_predictions / n_samples if n_samples else 0.0
    avg_loss = np.mean(losses) if losses else float('nan')

    # precision_score, recall_score and f1_score are imported from
    # sklearn.metrics in the notebook's import cell.
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return accuracy, avg_loss, precision, recall, f1


In [None]:
def train_model(model, train_loader, test_loader, optimizer, device, epochs):
    """Train for ``epochs`` epochs, evaluating after each one.

    Args:
        model: Model to train; moved to ``device``.
        train_loader: Loader passed to ``train_epoch``.
        test_loader: Loader passed to ``eval_model``.
        optimizer: Optimizer passed to ``train_epoch``.
        device: Device used for the model and the batches.
        epochs: Number of training epochs.

    Returns:
        ``(train_accuracies, train_losses, test_accuracies, test_losses,
        best_metrics)``. The first four are per-epoch lists; accuracies are
        Python floats. ``best_metrics`` holds accuracy, precision, recall and
        f1 of the epoch with the highest test accuracy (empty if no epoch
        scored above 0).

    Side effects:
        Prints progress and writes the best epoch's weights to
        ``best_model.pth`` when at least one epoch scored above 0.
    """
    train_accuracies = []
    train_losses = []
    test_accuracies = []
    test_losses = []
    best_accuracy = 0
    best_model_state = None
    best_metrics = {}

    # Moving the model once is enough; train_epoch / eval_model switch the
    # train/eval mode themselves.
    model = model.to(device)

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Training phase
        train_acc, train_loss = train_epoch(model, train_loader, optimizer, device)

        # Evaluation phase
        test_acc, test_loss, test_precision, test_recall, test_f1 = eval_model(model, test_loader, device)

        # float() accepts both Python floats and 0-dim tensors (CUDA ones
        # included), so the stored history is always plain numbers.
        train_acc = float(train_acc)
        test_acc = float(test_acc)

        print(f"Train loss: {train_loss:.4f}, accuracy: {train_acc:.4f}")
        print(f"Test loss: {test_loss:.4f}, accuracy: {test_acc:.4f}")

        # Store training metrics
        train_accuracies.append(train_acc)
        train_losses.append(train_loss)
        test_accuracies.append(test_acc)
        test_losses.append(test_loss)

        # Check if this is the best model so far
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            # state_dict() hands back references to the live parameters, so a
            # later epoch would silently overwrite an un-copied snapshot.
            # Clone every tensor (on CPU) to freeze this epoch's weights.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            best_metrics = {
                'accuracy': test_acc,
                'precision': test_precision,
                'recall': test_recall,
                'f1': test_f1
            }
            print(f"Saved best model with accuracy: {best_accuracy:.4f}")

    # Save the best model after training
    if best_model_state is not None:
        torch.save(best_model_state, 'best_model.pth')

    return train_accuracies, train_losses, test_accuracies, test_losses, best_metrics


In [None]:
epochs = 3
# train_model returns five values. The fifth, best_metrics, must be bound
# here: unpacking only four raises ValueError, and the metrics report cell
# reads best_metrics.
train_accuracies, train_losses, test_accuracies, test_losses, best_metrics = train_model(
    model, train_loader, test_loader, optimizer, device, epochs
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/3




In [None]:
# Report the metrics stored in best_metrics, one per line, to four decimals
print("\nBest Model Metrics:")
report_lines = [
    f"Accuracy: {best_metrics['accuracy']:.4f}",
    f"Precision: {best_metrics['precision']:.4f}",
    f"Recall: {best_metrics['recall']:.4f}",
    f"F1 Score: {best_metrics['f1']:.4f}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
def _plot_history(train_values, test_values, metric):
    """Plot the per-epoch training and test curves for one metric.

    Args:
        train_values: Per-epoch values measured on the training set.
        test_values: Per-epoch values measured on the test set.
        metric: Metric name used in the legend, y-axis label and title.
    """
    # float() turns 0-dim torch tensors into plain numbers. Without it a
    # list of CUDA tensors cannot be converted by matplotlib.
    train_series = [float(value) for value in train_values]
    test_series = [float(value) for value in test_values]

    plt.plot(range(len(train_series)), train_series, label=f'Training {metric}', marker='o')
    plt.plot(range(len(test_series)), test_series, label=f'Test {metric}', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.title(f'Model {metric}')
    plt.legend()
    plt.show()

# Plot accuracy
_plot_history(train_accuracies, test_accuracies, 'Accuracy')

# Plot loss
_plot_history(train_losses, test_losses, 'Loss')

In [None]:
# Save the classifier and its tokenizer into one directory so both can be
# reloaded together with from_pretrained.
# NOTE(review): `model` holds the weights left by the last training epoch;
# the best-epoch weights are the ones train_model writes to best_model.pth.
# Confirm which of the two should be exported here.
model.save_pretrained('fine_tuned_distilbert')

# Save the tokenizer that was actually used to build the training batches
# instead of downloading the checkpoint a second time. `tokenizer` is still
# rebound so the name refers to a DistilBERT tokenizer after this cell.
tokenizer = tokenizer_bert
tokenizer.save_pretrained('fine_tuned_distilbert')



In [None]:
import os
import requests
import torch

# News API key and URL
# The key is read from the NEWS_API_KEY environment variable so the credential
# is not stored in the notebook. A key that was previously committed in
# plain text should be revoked and replaced.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '')
NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?apiKey=' + NEWS_API_KEY

# Fetch news function
def fetch_news(keyword=""):
    """Return headline titles from the News API top-headlines endpoint.

    Args:
        keyword: Optional search term, sent as the ``q`` query parameter.
            When empty, US headlines are requested instead.

    Returns:
        List of non-empty title strings. An empty list is returned (after
        printing a message) when the key is missing, the request fails, or
        the response status is not 200.
    """
    if not NEWS_API_KEY:
        print("Error fetching news: NEWS_API_KEY environment variable is not set")
        return []

    # NOTE(review): the endpoint is expected to reject a request that carries
    # no filter (country, category, sources or q) -- confirm against the
    # News API documentation.
    params = {'q': keyword} if keyword else {'country': 'us'}

    try:
        response = requests.get(NEWS_API_URL, params=params, timeout=10)
    except requests.RequestException as exc:
        # Print only the exception type: the message can contain the full
        # request URL, which includes the API key.
        print(f"Error fetching news: {type(exc).__name__}")
        return []

    if response.status_code == 200:
        articles = response.json().get('articles', [])
        # Skip entries whose title is missing or null
        headlines = [article['title'] for article in articles if article.get('title')]
        return headlines
    else:
        print(f"Error fetching news: {response.status_code}")
        return []

# BERT Model Prediction
def predict_news(headline):
    """Classify one headline with the in-memory DistilBERT model.

    Args:
        headline: Raw headline text.

    Returns:
        ``"Fake News"`` when the predicted class is 1, otherwise
        ``"True News"`` (matching the 0 = true / 1 = fake labelling of the
        training data).
    """
    # Apply the same cleaning the training titles went through, so the model
    # sees inference text in the form it was fine-tuned on.
    cleaned_headline = preprocess_filter(headline)

    inputs = tokenizer_bert.encode_plus(
        cleaned_headline,
        add_special_tokens=True,  # Add [CLS] and [SEP] tokens
        max_length=128,           # Maximum input sequence length
        padding='max_length',     # Pad to the maximum length
        truncation=True,          # Truncate if the input is too long
        return_tensors="pt"       # Return as PyTorch tensors
    )

    # Move inputs to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict using the model
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()  # Get the predicted class

    # Convert prediction to readable output
    return "Fake News" if prediction == 1 else "True News"

# Predict on Live News
def predict_live_news():
    """Fetch headlines with fetch_news and print a prediction for each one.

    Prints "No news fetched." and returns when nothing was fetched.
    """
    headlines = fetch_news()  # no keyword
    if headlines:
        for headline in headlines:
            prediction = predict_news(headline)
            print(f"Headline: {headline}")
            print(f"Prediction: {prediction}\n")
    else:
        print("No news fetched.")

# Manual Testing (using BERT)
def manual_test():
    """Prompt for headlines and print a prediction for each.

    The loop ends when the user types ``exit`` (in any letter case).
    """
    print("Enter a headline to predict (or type 'exit' to quit):")
    headline = input("Headline: ")
    while headline.lower() != 'exit':
        # Classify the entered headline with predict_news
        prediction = predict_news(headline)
        print(f"Prediction: {prediction}\n")
        headline = input("Headline: ")

# Interactive entry point: ask which prediction mode to run

print("Select an option:")
print("1: Predict on live news fetched from the News API")
print("2: Perform manual testing by entering your own headlines")
choice = input("Enter your choice (1 or 2): ")

# Each valid choice maps to (message printed first, function to call)
menu_actions = {
    '1': ("Fetching live news and predicting...", predict_live_news),
    '2': ("Manual testing mode. Enter headlines to predict:", manual_test),
}

if choice in menu_actions:
    announcement, handler = menu_actions[choice]
    print(announcement)
    handler()
else:
    print("Invalid choice. Please run the program again and select 1 or 2.")


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Reload the tokenizer and the classifier from the 'fine_tuned_distilbert'
# directory; both names are rebound to the reloaded objects.
saved_model_dir = 'fine_tuned_distilbert'
tokenizer = DistilBertTokenizer.from_pretrained(saved_model_dir)
model = DistilBertForSequenceClassification.from_pretrained(saved_model_dir)

# Define a prediction function
# NOTE: this definition replaces the earlier predict_news in this notebook;
# it is the one the Gradio interface calls.
def predict_news(headline):
    """Classify one headline and report the model's confidence.

    Args:
        headline: Raw headline text.

    Returns:
        A string of the form ``"<label> (Confidence: NN.NN%)"`` where the
        label is ``"Fake News"`` for class 1 and ``"Real News"`` for class 0,
        and the confidence is the largest softmax probability.
    """
    # Apply the same cleaning the training titles went through
    cleaned_headline = preprocess_filter(headline)

    # Preprocess the input
    inputs = tokenizer.encode_plus(
        cleaned_headline,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    # Keep the inputs on whatever device the model currently lives on
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).squeeze().tolist()

    # Determine the label and confidence.
    # The training data labels true articles 0 and fake articles 1, so class 1
    # must be reported as fake (the mapping was previously inverted).
    predicted_class = torch.argmax(logits, dim=1).item()
    label = "Fake News" if predicted_class == 1 else "Real News"
    confidence = max(probabilities) * 100

    return f"{label} (Confidence: {confidence:.2f}%)"


In [None]:
pip install --upgrade gradio

In [None]:
import gradio as gr

# Build the web form: one text box in, one text box out, backed by predict_news
headline_input = gr.Textbox(label="Enter News Headline", placeholder="Type your news headline here...")
prediction_output = gr.Textbox(label="Prediction")

interface = gr.Interface(
    fn=predict_news,
    inputs=headline_input,
    outputs=prediction_output,
    title="Fake News Detection",
    description="Enter a news headline, and the model will predict whether it's real or fake.",
)

# Start serving the app
interface.launch()
