In [1]:
# Install third-party packages into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull different releases - consider pinning.
%pip install datasets torch numpy pandas matplotlib scikit-learn seaborn transformers protobuf

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Pick the compute device: use an accelerator when one is usable, otherwise CPU.
import torch

# current_accelerator() on its own is not a safe choice: it returns None when
# the torch build has no accelerator backend, and by default it does not verify
# that the accelerator can be used at runtime. Gate it on is_available() and
# fall back to CPU so `device` is always a usable torch.device.
_accelerator_api = getattr(torch, 'accelerator', None)  # absent on older torch releases
if _accelerator_api is not None and _accelerator_api.is_available():
    device = _accelerator_api.current_accelerator()
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
from transformers import DebertaTokenizer
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

# Load and prepare data
# Label convention used throughout: 1 = article from True.csv, 0 = article from Fake.csv.
true_data = pd.read_csv('News_dataset/True.csv')
true_data['label'] = 1
false_data = pd.read_csv('News_dataset/Fake.csv')
false_data['label'] = 0
# reset_index gives every row a unique index label; the label-based
# sub-sampling below relies on that uniqueness.
data = pd.concat([true_data, false_data]).reset_index(drop=True)

print(f"Total dataset size: {len(data)}")
print(f"True news articles: {len(true_data)}")
print(f"Fake news articles: {len(false_data)}")

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# Fraction of the data held out for testing, e.g.
# - 0.1 = 10% test, 90% train
# - 0.15 = 15% test, 85% train
# - 0.05 = 5% test, 95% train
TEST_SIZE = 0.1  # Change this value as needed

# Single seed shared by the split and both sub-sampling steps.
RANDOM_STATE = 42

x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Print the actual sizes
print(f"\nAfter train-test split (test_size={TEST_SIZE}):")
print(f"Training set size: {len(x_train)}")
print(f"Test set size: {len(x_test)}")
print(f"Training set percentage: {len(x_train)/len(data)*100:.1f}%")
print(f"Test set percentage: {len(x_test)/len(data)*100:.1f}%")

# sampling a subset:
SAMPLE_SIZE = 1000  # Set to None to use full dataset, or any number to limit size

if SAMPLE_SIZE is not None and len(x_train) > SAMPLE_SIZE:
    # Sample from training set; labels are picked by the same index labels
    # (.loc) so each text stays paired with its own label.
    sample_indices = x_train.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE).index
    x_train_sampled = x_train.loc[sample_indices]
    y_train_sampled = y_train.loc[sample_indices]

    # Proportionally sample from test set so the train/test ratio stays at
    # (1 - TEST_SIZE) : TEST_SIZE, capped at the number of test rows available.
    test_sample_size = int(SAMPLE_SIZE * TEST_SIZE / (1 - TEST_SIZE))
    test_sample_indices = x_test.sample(
        n=min(test_sample_size, len(x_test)), random_state=RANDOM_STATE
    ).index
    x_test_sampled = x_test.loc[test_sample_indices]
    y_test_sampled = y_test.loc[test_sample_indices]

    x_train, y_train = x_train_sampled, y_train_sampled
    x_test, y_test = x_test_sampled, y_test_sampled

    print(f"\nAfter sampling (limited to {SAMPLE_SIZE} training examples):")
    print(f"Training set size: {len(x_train)}")
    print(f"Test set size: {len(x_test)}")

# Texts and labels must stay aligned one-to-one after splitting/sampling.
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: The documents to classify. Either a pandas Series/DataFrame
            column (read positionally through ``.iloc``, so a shuffled or
            non-contiguous index is fine) or any integer-indexable sequence
            such as a list.
        labels: Class ids aligned one-to-one with ``texts``; same container
            rules as ``texts``.
        tokenizer: Hugging Face-style callable tokenizer. It is called with
            ``add_special_tokens``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors='pt'`` and must return a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(container, idx):
        """Return the idx-th element by position for pandas objects and plain sequences."""
        if hasattr(container, 'iloc'):
            return container.iloc[idx]
        return container[idx]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        # str() guards against non-string cells (e.g. missing values).
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)
        # Call the tokenizer directly: encode_plus is deprecated in favour of __call__.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten() drops it so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

from torch.optim import AdamW
from transformers import DebertaForSequenceClassification

MAX_LEN = 256  # Reduced from 512 for faster training
BATCH_SIZE = 16  # Increased from 8
TORCH_SEED = 42  # seed for torch-side randomness (head init, dropout, shuffling)

# Seed torch before anything random happens on its side: the classification
# head created below is randomly initialised, and without a seed every run
# starts from different weights.
# NOTE(review): some GPU kernels are nondeterministic, so this makes runs
# repeatable in setup but not necessarily bit-identical.
torch.manual_seed(TORCH_SEED)

train_dataset = NewsDataset(x_train, y_train, tokenizer, max_len=MAX_LEN)
test_dataset = NewsDataset(x_test, y_test, tokenizer, max_len=MAX_LEN)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Dedicated generator so the shuffle order is the same on every run and
    # does not depend on how much global RNG state was consumed beforehand.
    generator=torch.Generator().manual_seed(TORCH_SEED),
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of training batches: {len(train_loader)}")
print(f"Number of test batches: {len(test_loader)}")

# Two output classes, matching the 0/1 labels carried by the datasets.
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support 
def train_epoch(model, data_loader, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        data_loader: Iterable of batches, each a mapping with those three
            keys holding tensors. Must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``len(data_loader)`` is 0.
    """
    model.train()
    running_loss = 0
    batch_keys = ('input_ids', 'attention_mask', 'labels')
    for batch in data_loader:
        # Move only the tensors the model consumes onto the target device.
        model_inputs = {key: batch[key].to(device) for key in batch_keys}
        # Clear gradients left over from the previous step.
        model.zero_grad()
        batch_loss = model(**model_inputs).loss
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()
    return running_loss / len(data_loader)

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        come from ``precision_recall_fscore_support`` with ``average='binary'``.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)
            logits = model(input_ids=ids, attention_mask=mask).logits
            # Highest-scoring class along dim 1 is the predicted label.
            batch_preds = torch.argmax(logits, dim=1).flatten()
            # Bring results back to host memory as numpy values for the metric functions.
            predicted.extend(batch_preds.cpu().numpy())
            actual.extend(targets.cpu().numpy())
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _support = precision_recall_fscore_support(
        actual, predicted, average='binary'
    )
    return accuracy, precision, recall, f1

EPOCHS = 3
CHECKPOINT_DIR = 'deberta-news-classifier'  # directory written by save_pretrained

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    # Metrics are computed on test_loader after every epoch (reported as "Validation").
    accuracy, precision, recall, f1 = eval_model(model, test_loader, device)
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
    # Save after every epoch rather than once at the very end: interrupting a
    # long run then loses at most the epoch in progress instead of all training.
    # Each save overwrites the previous one, so after a complete run the
    # directory holds the final-epoch weights.
    model.save_pretrained(CHECKPOINT_DIR)


  from .autonotebook import tqdm as notebook_tqdm


Total dataset size: 44898
True news articles: 21417
Fake news articles: 23481

After train-test split (test_size=0.1):
Training set size: 40408
Test set size: 4490
Training set percentage: 90.0%
Test set percentage: 10.0%

After sampling (limited to 1000 training examples):
Training set size: 1000
Test set size: 111
Number of training batches: 63
Number of test batches: 7


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Train Loss: 0.2030
Validation Accuracy: 0.9910, Precision: 1.0000, Recall: 0.9818, F1 Score: 0.9908


KeyboardInterrupt: 