In [7]:
%pip install datasets torch numpy pandas matplotlib scikit-learn seaborn transformers protobuf

Collecting protobuf
  Downloading protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Downloading protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl (427 kB)
Installing collected packages: protobuf
Successfully installed protobuf-6.33.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
# use accelerator if available
import torch
device = torch.accelerator.current_accelerator()
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: mps


In [None]:
from transformers import DebertaTokenizer
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

true_data = pd.read_csv('News_dataset/True.csv')
true_data['label'] = 1
false_data = pd.read_csv('News_dataset/Fake.csv')
false_data['label'] = 0
data = pd.concat([true_data, false_data]).reset_index(drop=True)

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

tokenizer.encode_plus(
    x_train.iloc[0],
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
train_dataset = NewsDataset(x_train, y_train, tokenizer, max_len=512)
test_dataset = NewsDataset(x_test, y_test, tokenizer, max_len=512)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)
from transformers import DebertaForSequenceClassification
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)
model.to(device)

from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support 
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    return accuracy, precision, recall, f1

EPOCHS = 3

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, precision, recall, f1 = eval_model(model, test_loader, device)
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

model.save_pretrained('deberta-news-classifier')



Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: MPS backend out of memory (MPS allocated: 15.45 GiB, other allocations: 2.50 GiB, max allowed: 18.13 GiB). Tried to allocate 192.00 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).