In [1]:
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertModel, BertTokenizer


In [2]:
# Paths to datasets
# The data directory can be overridden with the FAKE_NEWS_DATA_DIR environment
# variable so the notebook also runs on machines that do not have this exact
# folder; when the variable is unset the original location is used unchanged.
data_dir = os.environ.get(
    "FAKE_NEWS_DATA_DIR",
    "E:/Computer Science/Computer Science Fall 2024/Information Retrival/Jupter/Jupyter_Practice",
)
fake_data_path = f"{data_dir}/Fake.csv"
real_data_path = f"{data_dir}/True.csv"

# Load datasets
fake_data = pd.read_csv(fake_data_path)
real_data = pd.read_csv(real_data_path)

# Label datasets (0 = fake, 1 = real)
fake_data['label'] = 0
real_data['label'] = 1

# Combine datasets, keeping only the 'text' and 'label' columns
combined_data = pd.concat([fake_data[['text', 'label']], real_data[['text', 'label']]], ignore_index=True)

# Rows with a missing 'text' value are read by pandas as NaN (a float), which
# the tokenizer is not expected to accept; drop them here so the failure does
# not surface much later during tokenization. A no-op when nothing is missing.
combined_data = combined_data.dropna(subset=['text']).reset_index(drop=True)


In [3]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    combined_data['text'],
    combined_data['label'],
    random_state=42,
    test_size=0.2,
)


In [4]:
# Load the pre-trained tokenizer for the checkpoint named below
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)


In [5]:
# Function to tokenize data
def tokenize_data(texts, max_len=100):
    """Encode `texts` with the module-level `tokenizer`.

    Args:
        texts: the texts to encode, passed straight to the tokenizer.
        max_len: value forwarded as the tokenizer's `max_length`; longer
            inputs are truncated.

    Returns:
        A `(input_ids, attention_mask)` tuple taken from the tokenizer output,
        which is requested as PyTorch tensors (`return_tensors="pt"`).
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    ids = encoded['input_ids']
    mask = encoded['attention_mask']
    return ids, mask

In [6]:
# Define DataLoader creation function
def create_dataloader(texts, labels, batch_size=8, max_len=64, shuffle=True):
    """Tokenize `texts` and wrap them, together with `labels`, in a DataLoader.

    Args:
        texts: pandas Series (or any other iterable) of strings.
        labels: pandas Series (or any sequence `torch.tensor` accepts) of
            numeric labels, aligned one-to-one with `texts`.
        batch_size: number of samples per batch.
        max_len: truncation length forwarded to `tokenize_data`.
        shuffle: whether the loader reshuffles the samples every epoch.
            Defaults to True (the previous hard-coded behaviour); pass False
            for validation/test data to get a stable iteration order.

    Returns:
        A DataLoader yielding `(input_ids, attention_mask, labels)` batches.

    Raises:
        ValueError: if `texts` and `labels` differ in length.
    """
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    label_values = labels.values if isinstance(labels, (pd.Series, pd.Index)) else labels
    label_tensor = torch.tensor(label_values)

    # Fail before tokenizing: encoding the whole corpus is the expensive step.
    if len(text_list) != len(label_tensor):
        raise ValueError(
            f"texts and labels must have the same length, got {len(text_list)} and {len(label_tensor)}"
        )

    input_ids, attention_masks = tokenize_data(text_list, max_len)
    dataset = TensorDataset(input_ids, attention_masks, label_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [7]:
# Build the training and validation DataLoaders (8 samples per batch each);
# the training loader is created first, then the validation loader.
train_loader, val_loader = (
    create_dataloader(split_texts, split_labels, batch_size=8)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

In [8]:
# Define BERT-based fake news detector model
class BertBasedFakeNewsDetector(nn.Module):
    """Binary classifier: a frozen "bert-base-uncased" encoder plus a small head.

    The encoder's parameters have `requires_grad` switched off, so only the
    two linear layers of the head receive gradients. The forward pass returns
    one sigmoid-activated value per input row.
    """

    def __init__(self):
        super().__init__()
        encoder = BertModel.from_pretrained("bert-base-uncased")
        # Freeze every encoder weight; only the head below is trainable.
        for weight in encoder.parameters():
            weight.requires_grad = False
        self.bert = encoder
        self.dropout = nn.Dropout(0.5)
        self.dense1 = nn.Linear(encoder.config.hidden_size, 64)
        self.tanh = nn.Tanh()
        self.output = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores computed from the encoder's `pooler_output`.

        The same dropout module is applied twice: once to the pooled encoder
        output and once after the tanh-activated hidden layer.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.dense1(self.dropout(encoded.pooler_output))
        hidden = self.dropout(self.tanh(hidden))
        return self.sigmoid(self.output(hidden))

In [9]:
# Set up GPU
# Seed PyTorch's random number generators before the model is built so that
# the classifier-head initialisation and later random draws from the global
# generator (dropout, DataLoader shuffling) repeat across Restart & Run All,
# provided nothing else re-seeds or consumes the generator differently.
torch.manual_seed(42)

# Query CUDA once and reuse the answer for both the device and the report.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
model = BertBasedFakeNewsDetector().to(device)
print("CUDA available:", cuda_available)


CUDA available: True


In [10]:
# Loss function and optimizer
# Binary cross-entropy is applied to the model's sigmoid output.
criterion = nn.BCELoss()
# Adam is handed every model parameter with a learning rate of 1e-5.
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [11]:
# Training function
def train_model(model, train_loader, val_loader, epochs=3):
    """Train `model` for `epochs` epochs, validating after each one.

    Relies on the module-level `device`, `optimizer`, `criterion` and
    `evaluate_model`.

    Args:
        model: module called as `model(input_ids, attention_mask=...)` whose
            output has a trailing dimension of size 1.
        train_loader: iterable of `(input_ids, attention_masks, labels)` batches.
        val_loader: loader handed to `evaluate_model` after every epoch.
        epochs: number of passes over `train_loader`.
    """
    for epoch in range(epochs):
        # evaluate_model() switches the model to eval mode at the end of each
        # epoch, so training mode has to be re-enabled here; otherwise every
        # epoch after the first would run with dropout disabled.
        model.train()
        total_loss = 0.0
        num_batches = 0
        for input_ids, attention_masks, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device, dtype=torch.float)

            optimizer.zero_grad()
            # Squeeze only the last dimension: a bare squeeze() would also drop
            # the batch dimension of a final batch of size 1 and make the loss
            # raise a shape-mismatch error.
            outputs = model(input_ids, attention_mask=attention_masks).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches explicitly so an empty loader reports NaN instead of
        # raising ZeroDivisionError.
        avg_train_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_train_loss:.4f}")
        evaluate_model(model, val_loader)

In [12]:
# Evaluation function
def evaluate_model(model, val_loader):
    """Print the accuracy of `model` on `val_loader`.

    Relies on the module-level `device` and `accuracy_score`. The model is
    switched to eval mode and is left in eval mode when the function returns.

    Args:
        model: module called as `model(input_ids, attention_mask=...)` whose
            output has a trailing dimension of size 1; values above 0.5 are
            counted as class 1.
        val_loader: iterable of `(input_ids, attention_masks, labels)` batches.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for input_ids, attention_masks, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            # Squeeze only the last dimension so a batch holding one sample
            # stays 1-D; a 0-d result cannot be passed to list.extend().
            outputs = model(input_ids, attention_mask=attention_masks).squeeze(-1)
            preds = (outputs > 0.5).long()
            predictions.extend(preds.cpu().tolist())
            # .cpu() keeps this working even if the loader yields labels that
            # already live on the GPU.
            true_labels.extend(labels.cpu().tolist())
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {accuracy:.4f}")

In [13]:
# Prediction function
def predict(model, text, max_len=64):
    """Classify a single text as "Real News" or "Fake News".

    Relies on the module-level `tokenize_data` and `device`.

    Args:
        model: module called as `model(input_ids, attention_mask=...)` that
            returns a single score for a single input.
        text: the text to classify.
        max_len: truncation length passed to `tokenize_data`. Defaults to 64,
            the default `create_dataloader` uses for the training data, so the
            model sees inputs truncated the same way at inference as during
            training (previously `tokenize_data`'s own default of 100 applied).

    Returns:
        "Real News" if the model's score is above 0.5, otherwise "Fake News".
    """
    model.eval()
    input_ids, attention_mask = tokenize_data([text], max_len=max_len)
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
    with torch.no_grad():
        # .item() turns the one-element output into a plain Python float.
        score = model(input_ids, attention_mask=attention_mask).item()
    return "Real News" if score > 0.5 else "Fake News"

In [None]:
# Run two training epochs; validation accuracy is printed after each one
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=2,
)


In [None]:
# Classify one sample text with the trained model and show the label
test_text = "Example news content to classify"
predicted_label = predict(model, test_text)
print("Prediction:", predicted_label)