In [None]:
# Step 1: Connect to Google Drive to save files if needed
# The dataset, the cached token tensors, and the model checkpoints used by the
# cells below are all read from / written to paths under /content/drive.
# This cell needs a Google Colab runtime (it imports `google.colab`).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Step 2: Install necessary libraries
!pip install transformers
!pip install kaggle



In [None]:
"""# Step 3: Set up Kaggle API and download the dataset
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json  # Set permissions"""

# Download and unzip the dataset
!kaggle datasets download -d saurabhshahane/fake-news-classification
#You have to make folders
!unzip fake-news-classification.zip -d /content/drive/MyDrive/DL/datasets

Dataset URL: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
License(s): Attribution 4.0 International (CC BY 4.0)
fake-news-classification.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  fake-news-classification.zip
  inflating: /content/drive/MyDrive/DL/datasets/WELFake_Dataset.csv  


In [None]:
# Step 4: Import necessary libraries
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification


In [None]:
# Step 5: Load the dataset
# Reads the WELFake CSV that the download cell extracted to Drive. Later cells
# use its 'text' column as model input and its 'label' column as the target.
df = pd.read_csv("/content/drive/MyDrive/DL/datasets/WELFake_Dataset.csv")  # Ensure the correct file is loaded if the name differs

In [None]:
# Step 6: Preprocess the data (Handle NaN values in the 'text' column)
# Missing articles become empty strings, so every row handed to the tokenizer
# later on is a string. The column is replaced in place on `df`; running the
# cell again is harmless because no NaN values remain to fill.
df['text'] = df['text'].fillna("")

In [None]:
# Step 7: Split the data into train, validation, and test sets
# First hold out 20% of all rows as the test set ...
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
# ... then take 10% of the remaining rows as the validation set.
# Both calls use the same fixed seed, so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Step 8: Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
_target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two output labels: the classifier predicts one of two classes per article.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model = model.to(_target_device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 9: Define a function to tokenize the data
def tokenize_data(texts, max_length=256, chunk_size=1024):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Every text is truncated or padded to exactly `max_length` tokens. Texts are
    sent to the tokenizer in batches of `chunk_size` through its `__call__`
    interface instead of one `encode_plus` call per text; chunking keeps the
    intermediate Python lists small on large datasets.

    Args:
        texts: Iterable of strings (a list, a pandas Series, a generator, ...).
        max_length: Token length of every returned row. Defaults to 256.
        chunk_size: Number of texts handed to the tokenizer per call.

    Returns:
        Tuple `(input_ids, attention_masks)`, each a tensor with one row per
        text and `max_length` columns. Empty input yields two empty tensors of
        shape `(0, max_length)`.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Materialize once so that Series and generators can be sliced by position.
    texts = list(texts)
    if not texts:
        # torch.cat cannot concatenate an empty list, so answer directly.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    id_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), chunk_size):
        encoded = tokenizer(
            texts[start:start + chunk_size],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    input_ids = torch.cat(id_chunks, dim=0)
    attention_masks = torch.cat(mask_chunks, dim=0)

    return input_ids, attention_masks

In [None]:
# Step 10: Tokenize the train, validation, and test data
train_input_ids, train_attention_masks = tokenize_data(train_texts)
val_input_ids, val_attention_masks = tokenize_data(val_texts)
test_input_ids, test_attention_masks = tokenize_data(test_texts)


def _labels_to_tensor(labels):
    """Return `labels` as a tensor, passing an existing tensor through unchanged.

    The label names are rebound below, so on a second run of this cell they
    already hold tensors. `Tensor.values` is a method, not an array, so calling
    `torch.tensor(labels.values)` again would raise. Passing tensors through
    keeps the cell re-runnable.
    """
    if torch.is_tensor(labels):
        return labels
    return torch.tensor(labels.values)


# Convert labels to tensors
train_labels = _labels_to_tensor(train_labels)
val_labels = _labels_to_tensor(val_labels)
test_labels = _labels_to_tensor(test_labels)

In [None]:
# save in drive
# Cache the tokenized splits on Drive; the next cell reads them back from the
# same file names.
_tensor_dir = '/content/drive/MyDrive/DL/datasets/fake_news'
os.makedirs(_tensor_dir, exist_ok=True)  # torch.save does not create missing folders

_tensors_to_save = {
    'train_input_ids': train_input_ids,
    'train_attention_masks': train_attention_masks,
    'train_labels': train_labels,
    'val_input_ids': val_input_ids,
    'val_attention_masks': val_attention_masks,
    'val_labels': val_labels,
    'test_input_ids': test_input_ids,
    'test_attention_masks': test_attention_masks,
    'test_labels': test_labels,
}
# One file per tensor, named <variable name>.pt
for _name, _tensor in _tensors_to_save.items():
    torch.save(_tensor, os.path.join(_tensor_dir, f'{_name}.pt'))


In [None]:
# using saved tensor
def _load_cached_tensor(name):
    """Load `<name>.pt` from the fake_news cache folder on Drive.

    `weights_only=True` restricts unpickling to tensors and other plain data,
    which is all the saving cell writes to these files. It also avoids the
    warning newer PyTorch versions emit when `torch.load` is called without it.
    """
    return torch.load(f'/content/drive/MyDrive/DL/datasets/fake_news/{name}.pt', weights_only=True)


train_input_ids = _load_cached_tensor('train_input_ids')
train_attention_masks = _load_cached_tensor('train_attention_masks')
train_labels = _load_cached_tensor('train_labels')

val_input_ids = _load_cached_tensor('val_input_ids')
val_attention_masks = _load_cached_tensor('val_attention_masks')
val_labels = _load_cached_tensor('val_labels')

test_input_ids = _load_cached_tensor('test_input_ids')
test_attention_masks = _load_cached_tensor('test_attention_masks')
test_labels = _load_cached_tensor('test_labels')

  train_input_ids = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/train_input_ids.pt')
  train_attention_masks = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/train_attention_masks.pt')
  train_labels = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/train_labels.pt')
  val_input_ids = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/val_input_ids.pt')
  val_attention_masks = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/val_attention_masks.pt')
  val_labels = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/val_labels.pt')
  test_input_ids = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/test_input_ids.pt')
  test_attention_masks = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/test_attention_masks.pt')
  test_labels = torch.load('/content/drive/MyDrive/DL/datasets/fake_news/test_labels.pt')


In [None]:
# Step 11: Create DataLoader for training, validation, and testing
batch_size = 16


def _build_loader(dataset, sampler_cls):
    """Wrap `dataset` in a DataLoader that draws batches of `batch_size` via `sampler_cls`."""
    return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)


# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Training batches are drawn in random order; validation and test keep dataset order.
train_dataloader = _build_loader(train_data, RandomSampler)
val_dataloader = _build_loader(val_data, SequentialSampler)
test_dataloader = _build_loader(test_data, SequentialSampler)

In [None]:
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 12: Set optimizer
# AdamW is the optimizer commonly used to fine-tune transformers. It is imported
# from PyTorch because the copy that used to ship with `transformers` was
# deprecated and is no longer exported by current releases.
# eps and weight_decay are set explicitly to the defaults of that former
# implementation (PyTorch's own defaults are 1e-8 and 0.01), so the update rule
# stays the same as before the switch.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [None]:
# Step 13: Define model save path on Google Drive
# This is a path prefix, not a complete file name: the checkpoint cells append
# "_<epoch>.pth" to it when saving and loading.
save_path = '/content/drive/MyDrive/DL/datasets/fake_news/model_epoch'

In [None]:
# Step 16: Load the last saved model
# Restores the weights from a checkpoint written by the training loop.
# NOTE(review): as the notebook is ordered, this cell runs before the training
# loop, so a top-to-bottom run continues training from this checkpoint and
# fails here if the file does not exist yet.
last_epoch = 1  # Specify the last epoch number
_checkpoint_path = f'{save_path}_{last_epoch}.pth'

# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only=True limits unpickling to tensor data.
_state_dict = torch.load(_checkpoint_path, map_location=model.device, weights_only=True)

# load_state_dict copies values into the existing parameters, so the model stays
# on the device it already occupies.
model.load_state_dict(_state_dict)
print(f"Model loaded from epoch {last_epoch}")

  model.load_state_dict(torch.load(f'{save_path}_{last_epoch}.pth'))


Model loaded from epoch 1


In [None]:
# Step 14: Training loop
# `epochs` is defined here so the cell runs on a fresh kernel; the recorded
# output of this cell shows two epochs.
epochs = 2  # Specify the number of epochs
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0  # Running sum of per-batch losses for this epoch

    # Loop over each batch in the training data
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move it to the model's device
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(model.device) for t in batch)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing `labels` makes the model return the loss with its outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate the loss for averaging later

        # Backward pass to calculate gradients, then update the parameters
        loss.backward()
        optimizer.step()

    # Step 15: Save model at the end of each epoch
    # Writes the model's state_dict (parameters) to Google Drive.
    # NOTE(review): numbering always starts at 1, so when training continues
    # from a previously loaded checkpoint the existing "_1.pth" file is overwritten.
    torch.save(model.state_dict(), f'{save_path}_{epoch + 1}.pth')
    print(f"Model saved at epoch {epoch + 1}")

    # Calculate and print average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss}")

print("Training complete.")

Model saved at epoch 1
Epoch 1, Loss: 0.006515987392745273
Model saved at epoch 2
Epoch 2, Loss: 0.0038751543420198786
Training complete.


In [None]:
# Step 17: Define evaluation function
def evaluate(model, dataloader):
    """Score `model` on every batch yielded by `dataloader`.

    Each batch must unpack into (input_ids, attention_mask, labels). The
    predicted class of a sample is the argmax of its logits along dim 1.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. Accuracy comes from
        `accuracy_score`, the other three from
        `precision_recall_fscore_support(..., average='binary')`.
    """
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    # Nothing is trained here, so gradient tracking is off for the whole pass.
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (t.to(model.device) for t in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits

            # Gather per-sample predictions and ground truth on the CPU
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    return accuracy, precision, recall, f1


In [None]:
# Step 18: Evaluate on validation and test sets
# Score the model on the validation split and print the four metrics to 4 decimals.
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_dataloader)
print(
    f"Validation - Accuracy: {val_accuracy:.4f}, "
    f"Precision: {val_precision:.4f}, "
    f"Recall: {val_recall:.4f}, "
    f"F1 Score: {val_f1:.4f}"
)

# Same report for the held-out test split.
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_dataloader)
print(
    f"Test - Accuracy: {test_accuracy:.4f}, "
    f"Precision: {test_precision:.4f}, "
    f"Recall: {test_recall:.4f}, "
    f"F1 Score: {test_f1:.4f}"
)

Validation - Accuracy: 0.9964, Precision: 0.9938, Recall: 0.9993, F1 Score: 0.9966
Test - Accuracy: 0.9970, Precision: 0.9944, Recall: 0.9996, F1 Score: 0.9970
