In [1]:
# Install necessary libraries
!pip install transformers torch pandas sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

In [4]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [5]:
# Pick the compute device: use the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU. (To force CPU for debugging, replace the
# expression with torch.device('cpu').)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [6]:
# Paths to datasets.
# Both CSVs live in one folder. The default below is the original local folder;
# set the NEWS_DATA_DIR environment variable to use a different location without
# editing the notebook.
DATA_DIR = os.environ.get(
    "NEWS_DATA_DIR",
    "E:/Computer Science/Computer Science Fall 2024/Information Retrival/Jupter/Jupyter_Practice",
)
fake_data_path = f"{DATA_DIR}/Fake.csv"
real_data_path = f"{DATA_DIR}/True.csv"

# Load and preprocess data
def load_and_preprocess_data(fake_data_path, real_data_path, test_size=0.2, random_state=42):
    """Load the fake/real news CSVs, label them and return a train/test split.

    Args:
        fake_data_path: Location of the CSV with fake-news rows.
        real_data_path: Location of the CSV with real-news rows.
        test_size: Fraction of the combined rows held out for testing.
        random_state: Seed applied to BOTH the shuffle and the split, so that
            re-running the notebook reproduces the same partition.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames. Each carries the
        columns read from the CSVs plus ``label`` (0 = fake, 1 = real).
    """
    # Add target labels
    fake_data = pd.read_csv(fake_data_path).assign(label=0)  # Fake news
    real_data = pd.read_csv(real_data_path).assign(label=1)  # Real news

    # Concatenate and shuffle. The shuffle is seeded: an unseeded shuffle would
    # feed a differently ordered frame into the split on every run, making the
    # split itself non-reproducible despite its own random_state.
    data = (
        pd.concat([fake_data, real_data], ignore_index=True)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Split into train and test sets
    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return train_data, test_data

In [7]:
# Build the train/test frames from the two CSV files, then preview the first
# rows of each split (the trailing tuple is the cell's displayed output).
train_data, test_data = load_and_preprocess_data(
    fake_data_path=fake_data_path,
    real_data_path=real_data_path,
)
(train_data.head(), test_data.head())

(                                                   title  \
 36335   Billy Joel Just Publicly Humiliated The Donal...   
 12384  Swiss dismiss ETA activist's asylum bid, but s...   
 24419   SHAME: Desperate Rubio Repeatedly Tries To Pi...   
 24740  North Korea meeting seeks 'better ideas' to so...   
 27039   Watch The Moment A White Teacher Gets SCHOOLE...   
 
                                                     text    subject  \
 36335  Billy Joel made a fool of Donald Trump this we...       News   
 12384  ZURICH (Reuters) - A Swiss federal court has d...  worldnews   
 24419  In what can only be seen as the worst attempt ...       News   
 24740  OTTAWA (Reuters) - An international meeting in...  worldnews   
 27039  A brilliant moment where a high school student...       News   
 
                      date  label  
 36335        May 29, 2016      0  
 12384   December 1, 2017       1  
 24419   February 14, 2016      0  
 24740  November 29, 2017       1  
 27039    February

In [8]:
# Define the PyTorch Dataset for news classification
class NewsDataset(Dataset):
    """PyTorch dataset that tokenizes one news article per item.

    Args:
        data: DataFrame with a ``text`` column and a ``label`` column.
        tokenizer: Callable invoked as ``tokenizer(text, ...)`` that returns a
            mapping containing ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and label of the row at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``label`` tensor of dtype ``torch.long``.
        """
        # Materialise the row once; positional lookup keeps this independent of
        # the (shuffled) DataFrame index.
        row = self.data.iloc[idx]
        text = row['text']
        # A missing article body is read by pandas as NaN (a float), which a
        # tokenizer expecting a string cannot handle -- treat it as empty text.
        text = '' if pd.isna(text) else str(text)

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding='max_length',
            truncation=True
        )
        return {
            # flatten() turns the tokenizer output into 1-D tensors so the
            # DataLoader can stack samples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(row['label']), dtype=torch.long)
        }


In [None]:
# Load the DistilBERT tokenizer and a sequence-classification model with two
# output labels from the same pretrained checkpoint, then place the model on
# the selected device.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
).to(device)
print('DistilBERT model loaded and moved to', device)


In [None]:
# Wrap each split in a NewsDataset that shares the same tokenizer.
train_dataset, test_dataset = (
    NewsDataset(split, tokenizer) for split in (train_data, test_data)
)


In [None]:
# Batch both datasets; only the training batches are shuffled so that the
# evaluation order stays fixed.
BATCH_SIZE = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Define training function
def train_model(model, train_loader, optimizer, scheduler, num_epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and report the mean loss per epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the average training loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the average is computed.
        raise ValueError("train_loader is empty; there is nothing to train on")

    # Send batches to wherever the model's parameters live rather than relying
    # on a notebook-level `device` variable being defined and in sync.
    model_device = next(model.parameters()).device

    epoch_losses = []
    for epoch in range(num_epochs):
        # Re-assert training mode every epoch in case an evaluation pass
        # switched the model to eval mode in between.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['label'].to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

    return epoch_losses


In [None]:
# Define evaluation function
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and print classification metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``logits``; the arg-max over dim 1 is taken
            as the predicted class.
        test_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``label`` tensors.

    Prints the classification report and the confusion matrix; returns nothing.
    """
    model.eval()
    # Send batches to wherever the model's parameters live rather than relying
    # on a notebook-level `device` variable being defined and in sync.
    model_device = next(model.parameters()).device
    predictions, true_labels = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they are never copied
            # to the model's device.
            true_labels.extend(batch['label'].cpu().numpy())

    print("Classification Report:\n", classification_report(true_labels, predictions))
    print("Confusion Matrix:\n", confusion_matrix(true_labels, predictions))

In [None]:
# Set up optimizer and scheduler
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data; keep it equal to the `num_epochs`
# passed to train_model so the linear schedule reaches zero on the last step.
NUM_EPOCHS = 3

# Use PyTorch's AdamW: the AdamW class exported by `transformers` is deprecated
# and no longer importable in recent releases. weight_decay=0.0 keeps the
# default of the optimizer used previously (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times epochs.
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [None]:
# Fine-tune the model for three epochs using the optimizer and scheduler
# configured above.
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=3,
)

In [None]:

# Score the fine-tuned model on the held-out test batches and print the metrics.
evaluate_model(model=model, test_loader=test_loader)