# Milestone #2: Ravi Raghavan

## Milestone Aim

The goal is to use a **pretrained BERT-Base, Uncased** model and **fine-tune it on the r/Fakeddit dataset**.

This work presents evaluation results on:
- Pretrained BERT
- Fine-Tuned BERT


## Script Sanity Check

Please ensure your directory is structured as follows:

```text
cleaned_data/
├── test_5k.csv
├── train.csv
└── validation_5k.csv
```

## Environment Setup

In [1]:
!pip install evaluate



In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import torch
import sys
import torch.nn as nn

## Ensure TensorFlow is not used
import os
os.environ["USE_TF"] = "0"

# Import Hugging Face Tooling
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import evaluate
from datasets import Dataset

# # Ravi's Laptop
# # Define data directory
# DATA_DIR = "cleaned_data"

# # Define file paths
# TRAIN_DATA_FILE = os.path.join(DATA_DIR, "train.csv")
# VALIDATION_DATA_FILE = os.path.join(DATA_DIR, "validation_5k.csv")
# TEST_DATA_FILE = os.path.join(DATA_DIR, "test_5k.csv")

# For reproducibility
random_state = 42

# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
# The same priority is used inside and outside Colab, so a non-Colab machine
# with an NVIDIA GPU is no longer forced onto the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Using device:", device)

Using device: cuda


## Load Data

In [3]:
# Mount Google Drive at /content/drive; the data paths in the following cells
# point into this mount.
# NOTE(review): this import only resolves inside Google Colab, so the cell
# presumably fails on a local machine — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# The three text splits share one Drive folder; derive every path from it.
_text_dir = "/content/drive/MyDrive/ESE 5460 Term Project/Cleaned Data/Text"
train_data = f"{_text_dir}/train.csv"
test_data = f"{_text_dir}/test_5k.csv"
validation_data = f"{_text_dir}/validation_5k.csv"

In [5]:
import pandas as pd

# Read the train / validation / test splits from the CSV paths defined above.
# NOTE(review): validation and test take their index from the first CSV column
# (index_col=0) while train keeps the default index. The corrupted-row filter
# drops rows by index label, so confirm that the train corrupted indices refer
# to this default index.
TRAIN_DATA = pd.read_csv(train_data)
VALIDATION_DATA = pd.read_csv(validation_data, index_col=0)
TEST_DATA = pd.read_csv(test_data, index_col=0)

# Ravi's Laptop
# TRAIN_DATA = pd.read_csv(TRAIN_DATA_FILE)
# VALIDATION_DATA = pd.read_csv(VALIDATION_DATA_FILE, index_col = 0)
# TEST_DATA = pd.read_csv(TEST_DATA_FILE, index_col = 0)

In [6]:
# Ignore rows in corrupted_indices.txt files
def filter_out_corrupted_rows(split, DF, corrupted_dir = "/content/drive/MyDrive/ESE 5460 Term Project/Cleaned Data/New Corrupted File Information"):
    """Return a copy of ``DF`` without the rows listed as corrupted for ``split``.

    The indices are read from a text file (one integer per line, blank lines
    ignored) named ``train_corrupted_indices.txt`` for the train split and
    ``{split}_5k_corrupted_indices.txt`` for every other split.

    Args:
        split: Name of the split, e.g. "train", "validation" or "test".
        DF: DataFrame whose index labels are matched against the file.
        corrupted_dir: Directory holding the index files. Defaults to the
            shared Drive folder; pass "" to read the files from the current
            working directory instead of editing this function per machine.

    Returns:
        A new DataFrame with the listed index labels dropped; ``DF`` itself is
        not modified.
    """
    # Only the train file is named after the bare split; all others carry "_5k".
    file_stem = split if split == "train" else f"{split}_5k"
    corrupted_indices_file = os.path.join(corrupted_dir, f"{file_stem}_corrupted_indices.txt")

    # Get list of corrupted indices (skip blank lines)
    with open(corrupted_indices_file, "r") as f:
        corrupted_indices = [int(line.strip()) for line in f if line.strip()]

    print(f"Split: {split}, Corrupted Indices: {corrupted_indices}, Length: {len(corrupted_indices)}")

    # Filter out corrupted rows (matched by index label, not by position)
    return DF.drop(index = corrupted_indices)

In [7]:
# Drop the known-corrupted rows from every split; each call also prints the
# indices it removes.
# NOTE(review): each frame is overwritten with its filtered version, so
# re-running this cell asks drop() for labels that are already gone, which
# presumably raises KeyError — re-run the load cell first.
TRAIN_DATA = filter_out_corrupted_rows("train", TRAIN_DATA)
VALIDATION_DATA = filter_out_corrupted_rows("validation", VALIDATION_DATA)
TEST_DATA = filter_out_corrupted_rows("test", TEST_DATA)

Split: train, Corrupted Indices: [2862, 26040, 28337, 18547, 13374, 11288, 31984, 18451, 19000, 22479, 8048, 32075, 22918, 5586, 19345, 12770, 32189, 14628, 9081, 6611, 2927], Length: 21
Split: validation, Corrupted Indices: [6568, 32176], Length: 2
Split: test, Corrupted Indices: [29133, 9437, 26504, 11394], Length: 4


In [8]:
TRAIN_DATA.head()

Unnamed: 0,clean_title,created_utc,domain,image_url,num_comments,score,subreddit,upvote_ratio,2_way_label,3_way_label,6_way_label
0,this spongebob squarepants branded battery,2019-07-30 20:00:50,i.redd.it,https://preview.redd.it/f39wxxk8yhd31.jpg?widt...,4.0,33,mildlyinteresting,0.95,1,0,0
1,award for careless talk,2011-09-03 17:26:23,i.imgur.com,https://external-preview.redd.it/KgPHCi1u3fY5j...,1.0,14,propagandaposters,1.0,0,1,5
2,four aligned airplanes,2017-11-20 06:05:45,i.redd.it,https://preview.redd.it/88v9axk19phx.jpg?width...,24.0,198,confusing_perspective,0.98,0,2,2
3,columbus discovers the new world,2019-08-28 15:40:17,i.redd.it,https://preview.redd.it/x4wzpd0am7j31.jpg?widt...,5.0,318,fakehistoryporn,0.98,0,2,2
4,feed me drummmmssssssss,2014-05-09 13:23:59,i.imgur.com,https://external-preview.redd.it/yNN57loQnVhLk...,0.0,3,pareidolia,0.62,0,2,2


In [9]:
VALIDATION_DATA.head()

Unnamed: 0,clean_title,created_utc,domain,image_url,num_comments,score,subreddit,upvote_ratio,2_way_label,3_way_label,6_way_label
8637,not as heartwarming as it could have been anth...,2019-09-19 17:48:33,lifestyle.clickhole.com,https://external-preview.redd.it/850kBbKdgMKfz...,0.0,15,theonion,0.86,0,2,1
20669,other discussions,2013-12-09 23:58:43,,http://i.dailymail.co.uk/i/pix/2013/12/09/arti...,,0,psbattle_artwork,,0,2,4
13179,on reflection oc,2015-01-12 17:14:55,i.imgur.com,https://external-preview.redd.it/tiFw8Ggb178E4...,0.0,3,pareidolia,0.67,0,2,2
20565,viet congo setting booby trap,2019-02-27 09:57:07,i.redd.it,https://preview.redd.it/mj81gkh533j21.jpg?widt...,4.0,14,fakehistoryporn,0.8,0,2,2
15504,chief has some happy shoulder armour,2013-02-20 01:00:02,i.imgur.com,https://external-preview.redd.it/kqqIlnAOZxzxA...,0.0,6,pareidolia,1.0,0,2,2


In [10]:
TEST_DATA.head()

Unnamed: 0,clean_title,created_utc,domain,image_url,num_comments,score,subreddit,upvote_ratio,2_way_label,3_way_label,6_way_label
19660,young homosexuals gather outside of a nightclu...,2018-07-29 13:59:38,i.redd.it,https://preview.redd.it/16j830998wc11.jpg?widt...,1.0,75,fakehistoryporn,0.92,0,2,2
32879,cara al sol facing the sun a series of posters...,2013-05-20 18:25:00,imgur.com,https://external-preview.redd.it/S_nXt5X8VMqZD...,4.0,19,propagandaposters,0.89,0,1,5
15071,he is awake and rises from the depths,2013-10-13 20:02:17,imgur.com,https://external-preview.redd.it/JetvyFQFm4fYt...,0.0,7,pareidolia,0.99,0,2,2
5198,the reason germany invades france and not spain,2018-09-03 08:22:07,i.redd.it,https://preview.redd.it/ws91cs7lgzj11.png?widt...,0.0,57,fakehistoryporn,0.97,0,2,2
15061,frozen body preserved on mount everest nsfw,2017-09-12 08:10:21,i.redd.it,https://preview.redd.it/zxwc8gq8uelz.jpg?width...,0.0,24,fakehistoryporn,1.0,0,2,2


## Compute Class Proportions

In [11]:
# Compute Class Proportions
# Fraction (0..1) of training rows per binary label:
# 0 = Fake News, 1 = Non-Fake News.
train_labels = TRAIN_DATA['2_way_label']
p0 = train_labels.eq(0).mean()
p1 = train_labels.eq(1).mean()
print(f"{p0  * 100}% of our dataset has label = 0 and {p1  * 100}% of our dataset has label = 1")

44.62961294778248% of our dataset has label = 0 and 55.37038705221752% of our dataset has label = 1


## Define Prior Adjusted Loss Criterion

In [12]:
# Define Weighted Loss Criterion
# Each class is weighted by the *other* class's share of the training set
# (class 0 gets p1, class 1 gets p0), so the less frequent class weighs more.
class_weights = torch.tensor([p1, p0], dtype = torch.float32, device = device)
custom_criterion = nn.CrossEntropyLoss(weight = class_weights)
print(f"Class Weights: {class_weights}")

Class Weights: tensor([0.5537, 0.4463], device='cuda:0')


## Fetch BERT From HuggingFace

In [13]:
# Fetch BERT Model from HuggingFace
# The tokenizer and the encoder weights both come from the same checkpoint name.
# The 2-way classification head is not part of that checkpoint (see the
# "newly initialized" warning in the cell output), so it has to be trained.
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels = 2) # num_labels = 2 since we have 2 classes!

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create `Hugging Face` Datasets [Train + Dev + Test]


In [14]:
# Wrap each pandas split in a Hugging Face Dataset (train, dev, test order).
train_hf_dataset, dev_hf_dataset, test_hf_dataset = [
    Dataset.from_pandas(frame)
    for frame in (TRAIN_DATA, VALIDATION_DATA, TEST_DATA)
]

## Tokenize Text Data

In [15]:
def tokenize_function(row, max_length = None):
  """Tokenize one example's title and attach the model inputs and label.

  Args:
    row: A single dataset example; reads 'clean_title' and '2_way_label'.
    max_length: Length every title is truncated/padded to. Defaults to
      ``tokenizer.model_max_length`` (the previous hard-coded behaviour).
      If the titles are much shorter than that, a smaller value can be
      supplied, e.g. ``dataset.map(tokenize_function, fn_kwargs={"max_length": 64})``,
      to avoid computing over padding tokens.

  Returns:
    The same ``row`` with 'input_ids', 'attention_mask', 'token_type_ids'
    and an integer 'label' added.
  """
  if max_length is None:
    max_length = tokenizer.model_max_length

  # Fixed-length padding keeps every example the same size.
  tokens = tokenizer(row['clean_title'], truncation = True, padding = 'max_length', max_length = max_length)
  row['input_ids'] = tokens['input_ids']
  row['attention_mask'] = tokens['attention_mask']
  row['token_type_ids'] = tokens['token_type_ids']
  row['label'] = int(row['2_way_label'])
  return row

In [16]:
# Tokenize every split, one after another (train, dev, test).
train_hf_dataset, dev_hf_dataset, test_hf_dataset = [
    split_dataset.map(tokenize_function)
    for split_dataset in (train_hf_dataset, dev_hf_dataset, test_hf_dataset)
]

Map:   0%|          | 0/33303 [00:00<?, ? examples/s]

Map:   0%|          | 0/4998 [00:00<?, ? examples/s]

Map:   0%|          | 0/4996 [00:00<?, ? examples/s]

## Define Accuracy, Precision, Recall, and F1 Metrics from Hugging Face

In [17]:
# Load the four metric objects once; compute_metrics reuses them on every evaluation.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load("f1")

## Define a compute_metrics function

In [18]:
def compute_metrics(eval_pred):
    """Compute accuracy, per-class precision/recall/F1 and averaged F1 scores.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with keys ``accuracy``, ``pos_*`` (label 1 = Non-Fake News),
        ``neg_*`` (label 0 = Fake News), ``f1_macro``, ``f1_micro`` and
        ``f1_weighted``.
    """
    # Get the model predictions
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def _score(metric, key, **options):
        # Run one metric on the shared predictions/labels and unwrap its value.
        return metric.compute(predictions=predictions, references=labels, **options)[key]

    def _per_class(prefix, class_id):
        # Precision / recall / F1 with `class_id` treated as the positive class.
        return {
            f"{prefix}_precision": _score(precision_metric, "precision", pos_label = class_id, average = 'binary', zero_division = 0),
            f"{prefix}_recall": _score(recall_metric, "recall", pos_label = class_id, average = 'binary', zero_division = 0),
            f"{prefix}_f1": _score(f1_metric, "f1", pos_label = class_id, average = 'binary'),
        }

    # Return Metrics
    return {
        "accuracy": _score(accuracy_metric, "accuracy"), # Accuracy
        **_per_class("pos", 1), # Class w/ Label = 1 [Non-Fake News]
        **_per_class("neg", 0), # Class w/ Label = 0 [Fake News]
        "f1_macro": _score(f1_metric, "f1", average = 'macro'), # Macro F1 Score
        "f1_micro": _score(f1_metric, "f1", average = 'micro'), # Micro F1 Score
        "f1_weighted": _score(f1_metric, "f1", average = 'weighted'), # Weighted F1 Score
    }


## Subclass the `Trainer` Class from HuggingFace to use Custom Loss Criterion


In [19]:
# Trainer subclass that swaps in the weighted loss criterion defined earlier
class SubTrainer(Trainer):
    """Trainer whose loss is computed with the notebook's `custom_criterion`."""

    def compute_loss(self, model, inputs, return_outputs = False, **kwargs):
        """Run the model on `inputs` and score its logits with `custom_criterion`.

        "labels" is removed from `inputs` before the forward pass. Returns the
        loss alone, or a `(loss, outputs)` pair when `return_outputs` is true.
        Extra keyword arguments are accepted and ignored.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)
        loss = custom_criterion(model_outputs.logits, targets)
        if return_outputs:
            return (loss, model_outputs)
        return loss

## **Initialize the `TrainingArguments` and `Trainer`**

In [20]:
# Configuration for the first run (head-only training): batch size 32 for
# training and evaluation, learning rate 5e-5, 3 epochs, evaluation and
# logging every 100 steps, no external experiment reporting.
# The checkpoint-saving options are commented out, so the library defaults apply.
training_args = TrainingArguments(
    output_dir="Milestone2-Baseline-BERT-PreTraining",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=3,
    # save_strategy="steps",      # save checkpoints every N steps
    # save_steps=100,             # save every 100 steps
    eval_strategy="steps",      # evaluate every N steps
    eval_steps=100,             # evaluate every 100 steps
    logging_strategy="steps",
    logging_steps=100,          # log every 100 steps
    report_to="none",
    full_determinism=True
)

# The trainer evaluates on the dev split during training and uses the
# weighted loss via SubTrainer.
# NOTE(review): the encoder is frozen in the next cell, after this Trainer is
# built; presumably fine if the optimizer is only created inside train() — confirm.
trainer = SubTrainer(
    model=model,
    args=training_args,
    train_dataset=train_hf_dataset,
    eval_dataset=dev_hf_dataset,
    compute_metrics=compute_metrics,
)

# Set Up: Freeze BERT and Update FC Weights

In [21]:
# Freeze the whole BERT encoder in one call; only the classification head keeps gradients
model.bert.requires_grad_(False)

# Confirm that only classifier is trainable
trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
for name, param in trainable:
  print(name, param.requires_grad)

classifier.weight True
classifier.bias True


# **Train the Model: `Pre-Training`**

In [22]:
trainer.train() # NOTE(review): called without resume_from_checkpoint, so this presumably starts a fresh run rather than resuming — confirm
trainer.save_model('Milestone2-Baseline-BERT-FinalModel(PreTrain)') # Save the Final Model
trainer.save_state() # Save the State of the Trainer (e.g. Losses, etc)

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# **Evaluate Pre-Trained Model on Train, Dev, and Test Datasets**

In [None]:
# Split: Train, Dev, or Test
def generate_evaluation_results(split, model_tag = "Pre-Trained"):
    """Evaluate the current `trainer` on one split and save the metrics to CSV.

    Args:
        split: "train", "test", or one of "dev" / "validation" / "val".
            It is also used as the metric key prefix and in the file name.
        model_tag: Label placed in the output file name; defaults to
            "Pre-Trained", which reproduces the original file names.

    Raises:
        ValueError: If `split` is not a recognised name. Previously the
            dataset stayed None, so evaluation silently ran on the trainer's
            default eval dataset and was saved under the wrong split name.
    """
    if split == "train":
        dataset = train_hf_dataset
    elif split in ("dev", "validation", "val"):
        dataset = dev_hf_dataset
    elif split == "test":
        dataset = test_hf_dataset
    else:
        raise ValueError(f"Unknown split {split!r}; expected 'train', 'dev', 'validation', 'val' or 'test'")

    results = trainer.evaluate(eval_dataset=dataset, metric_key_prefix=split)

    # One-row CSV: one column per metric returned by the trainer.
    results_file = f"Milestone #2 {model_tag} BERT Baseline {split} Results.csv"
    pd.DataFrame([results]).to_csv(results_file, index=False)
    print(f"Saved {split} evaluation metrics to {results_file}")

# Generate Evaluation Results on Train, Dev, and Test Splits
for split_name in ("train", "dev", "test"):
    generate_evaluation_results(split_name)

## **Initialize the `TrainingArguments` and `Trainer`**

In [None]:
# Configuration for the second run (full fine-tuning). Apart from output_dir
# the settings match the first run: batch size 32, learning rate 5e-5,
# 3 epochs, evaluation and logging every 100 steps.
training_args = TrainingArguments(
    output_dir="Milestone2-Baseline-BERT-FineTuning",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=3,
    # save_strategy="steps",      # save checkpoints every N steps
    # save_steps=100,             # save every 100 steps
    eval_strategy="steps",      # evaluate every N steps
    eval_steps=100,             # evaluate every 100 steps
    logging_strategy="steps",
    logging_steps=100,          # log every 100 steps
    report_to="none",
    full_determinism=True
)

# A new trainer is built around the same `model` object, so this run continues
# from whatever weights the model currently holds.
trainer = SubTrainer(
    model=model,
    args=training_args,
    train_dataset=train_hf_dataset,
    eval_dataset=dev_hf_dataset,
    compute_metrics=compute_metrics,
)

# Set Up Model for Fine-Tune

In [None]:
# Fine-Tune BERT: re-enable gradients for every encoder parameter in one call
model.bert.requires_grad_(True)

# Confirm classifier is still trainable
for head_name, head_param in model.classifier.named_parameters():
    print(head_name, head_param.requires_grad)

# **Train the Model: `Fine-Tuning`**

In [None]:
trainer.train() # NOTE(review): called without resume_from_checkpoint, so this presumably starts a fresh run rather than resuming — confirm
trainer.save_model('Milestone2-Baseline-BERT-FinalModel') # Save the Final Model
trainer.save_state() # Save the State of the Trainer (e.g. Losses, etc)

# **Evaluate Fine-Tuned Model on Train, Dev, and Test Datasets**

In [None]:
# Split: Train, Dev, or Test
def generate_evaluation_results(split, model_tag = "Fine-Tuned"):
    """Evaluate the current `trainer` on one split and save the metrics to CSV.

    Args:
        split: "train", "test", or one of "dev" / "validation" / "val".
            It is also used as the metric key prefix and in the file name.
        model_tag: Label placed in the output file name; defaults to
            "Fine-Tuned", which reproduces the original file names.

    Raises:
        ValueError: If `split` is not a recognised name. Previously the
            dataset stayed None, so evaluation silently ran on the trainer's
            default eval dataset and was saved under the wrong split name.
    """
    if split == "train":
        dataset = train_hf_dataset
    elif split in ("dev", "validation", "val"):
        dataset = dev_hf_dataset
    elif split == "test":
        dataset = test_hf_dataset
    else:
        raise ValueError(f"Unknown split {split!r}; expected 'train', 'dev', 'validation', 'val' or 'test'")

    results = trainer.evaluate(eval_dataset=dataset, metric_key_prefix=split)

    # One-row CSV: one column per metric returned by the trainer.
    results_file = f"Milestone #2 {model_tag} BERT Baseline {split} Results.csv"
    pd.DataFrame([results]).to_csv(results_file, index=False)
    print(f"Saved {split} evaluation metrics to {results_file}")

# Generate Evaluation Results on Train, Dev, and Test Splits
for split_name in ("train", "dev", "test"):
    generate_evaluation_results(split_name)