In [None]:

# Installing necessary packages for PyTorch and transformers (only if not already installed)
!pip install torch transformers


In [1]:

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

# Device selection.
# The notebook runs on CPU by default. Set USE_GPU = True to opt in to CUDA;
# the GPU is then used only when torch reports it as available.
USE_GPU = False
device = torch.device("cuda" if USE_GPU and torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize tokenizer and model from the pretrained DistilBERT checkpoint and
# move the model to the selected device. Loading this checkpoint reports that
# the classifier / pre_classifier weights are newly initialized, so the model
# has to be fine-tuned before its predictions are meaningful.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(device)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Loading dataset
# Replace 'data.csv' with the actual dataset filename.
df = pd.read_csv('data.csv')

# Keep only the two columns used below (adjust the names to match the dataset).
# Rows with a missing text or label are dropped so NaN values do not reach the
# tokenizer or the label tensor.
df = df[['text', 'label']].dropna()

# Splitting data into train and validation sets (80/20).
# A fixed seed makes the split identical on every Restart & Run All.
RANDOM_SEED = 42
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=RANDOM_SEED
)


In [None]:

# Tokenize data: truncate each text to at most 512 tokens and pad every
# sequence in a split to a common length.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

# Convert labels to tensors.
# np.asarray accepts both the pandas Series produced by the split and the
# tensor left behind by a previous run of this cell, so the cell can be
# re-executed without failing on the rebound names.
train_labels = torch.tensor(np.asarray(train_labels))
val_labels = torch.tensor(np.asarray(val_labels))

# Create Dataset class
from torch.utils.data import Dataset, DataLoader

class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values.
        labels: Indexable collection of labels; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict.

        Every encoding field is converted to a tensor; the label is added
        unchanged under the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each tokenized split, together with its labels, in a dataset object.
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FakeNewsDataset(encodings=val_encodings, labels=val_labels)

# Data loaders use a small batch size to limit the load per step (adjust as
# needed). Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)


In [None]:

# Training: fine-tune the model and report losses after every epoch.

# Set up optimizer.
# torch.optim.AdamW is used because the AdamW class in transformers is
# deprecated and no longer importable from recent releases. eps and
# weight_decay are set explicitly to the defaults transformers.AdamW used, so
# the optimizer settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation phase (no gradient tracking)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    # Report per-batch means for both phases so the two numbers are comparable
    # (the training figure is the epoch average, not just the last batch).
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = train_loss / max(len(train_loader), 1)
    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training loss: {avg_train_loss}, Validation loss: {avg_val_loss}")


In [None]:

# Final evaluation: collect predicted and true labels over the validation set.
y_preds, y_true = [], []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # No labels are passed here; only the logits are needed.
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row.
        batch_preds = torch.argmax(logits, dim=1)

        y_preds.extend(batch_preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Classification report and confusion matrix
report = classification_report(y_true, y_preds)
print(report)
matrix = confusion_matrix(y_true, y_preds)
print("Confusion Matrix:", matrix)
