In [2]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")  # uncomment to force CPU execution
print(f"Using device: {device}")

# Load the two source datasets (one CSV per class).
fake_data = pd.read_csv("E:/Computer Science/Computer Science Fall 2024/Information Retrival/Jupter/Jupyter_Practice/Fake.csv")
real_data = pd.read_csv("E:/Computer Science/Computer Science Fall 2024/Information Retrival/Jupter/Jupyter_Practice/True.csv")

# Label the data: 0 = fake, 1 = real.
fake_data["label"] = 0
real_data["label"] = 1

# Combine both classes and shuffle the rows. The shuffle is seeded so that the
# row order -- and therefore any seeded train/test split computed from it --
# is identical on every run.
final_data = (
    pd.concat([fake_data, real_data])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the article text and its label.
final_data = final_data[['text', 'label']]

# Dataset class for PyTorch
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of article texts, indexable by integer position.
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer. It is invoked with the text plus the
            keyword arguments ``add_special_tokens``, ``max_length``,
            ``padding``, ``truncation``, ``return_attention_mask`` and
            ``return_tensors`` and must return a mapping that contains
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (the tokenizer output flattened) and a scalar ``torch.long``
            ``'label'`` tensor.
        """
        text = self.texts[idx]
        # Tokenizers only accept strings. Missing values (None, or the float
        # NaN that pandas produces for empty cells) become an empty string;
        # any other non-string value is converted with str().
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        # Call the tokenizer directly; ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Split data (80% train / 20% test, fixed seed) and create DataLoaders
X_train, X_test, y_train, y_test = train_test_split(
    final_data['text'], final_data['label'], test_size=0.2, random_state=42
)
# The dataset indexes by integer position, so pass plain lists rather than
# pandas Series (whose index no longer starts at 0 after the split).
train_dataset = NewsDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = NewsDataset(X_test.tolist(), y_test.tolist(), tokenizer)

# Only the training data is reshuffled each epoch.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

# Load DistilBERT with a 2-class classification head and move it to the device
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

# Training parameters
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
accumulate_steps = 2

# Training loop with gradient accumulation
def train_model(model, dataloader, optimizer, num_epochs=3, accumulate_steps=2):
    """Train ``model`` in place using gradient accumulation.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        dataloader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.
        accumulate_steps: Number of batches whose gradients are summed
            before each optimizer step.

    Returns:
        The same ``model`` object, after training.

    Raises:
        ValueError: If ``accumulate_steps`` is smaller than 1.
    """
    if accumulate_steps < 1:
        raise ValueError("accumulate_steps must be at least 1")

    # Send batches to wherever the model's parameters live instead of relying
    # on a module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        loss_total = 0.0
        batch_count = 0
        pending = 0  # batches back-propagated since the last optimizer step
        optimizer.zero_grad()

        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Divide by accumulate_steps so the summed gradient of a full
            # group equals the gradient of the group's mean loss.
            (outputs.loss / accumulate_steps).backward()
            pending += 1

            if pending == accumulate_steps:
                optimizer.step()
                optimizer.zero_grad()
                pending = 0

            # Track the unscaled loss so the report does not depend on
            # accumulate_steps.
            loss_total += outputs.loss.item()
            batch_count += 1

        # Apply the gradients of a trailing, incomplete group; otherwise those
        # batches would be discarded by the next zero_grad(). The group keeps
        # the 1/accumulate_steps scaling, so it is weighted slightly lower.
        if pending:
            optimizer.step()
            optimizer.zero_grad()

        # Mean loss per batch; max() avoids dividing by zero on empty input.
        print(f"Epoch loss: {loss_total / max(batch_count, 1):.4f}")
    return model

# Train the model. Pass the configured epoch count and accumulation steps
# explicitly so that editing those variables actually changes the run.
model = train_model(
    model,
    train_loader,
    optimizer,
    num_epochs=num_epochs,
    accumulate_steps=accumulate_steps,
)

# Evaluation function
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and print a classification report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``logits``; the argmax over dimension 1 of
            the logits is taken as the predicted class.
        dataloader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.

    Returns:
        None. The report is printed.
    """
    model.eval()

    # Send inputs to wherever the model's parameters live instead of relying
    # on a module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions, true_labels = [], []
    with torch.no_grad():  # inference only: no gradients needed
        for batch in dataloader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # Labels are only compared on the host, so they are not copied
            # to the model's device.
            true_labels.extend(batch['label'].tolist())

    # Passing labels=[0, 1] keeps the report aligned with the two target
    # names even when one class is absent from both labels and predictions.
    print(classification_report(true_labels, predictions, labels=[0, 1], target_names=['Fake', 'Real']))

# Print the classification report for the held-out test split
evaluate_model(model=model, dataloader=test_loader)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  1%|          | 80/8980 [00:06<11:16, 13.16it/s] 


KeyboardInterrupt: 