# Dataset

In [None]:
from datasets import load_dataset
import random
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AdamW, DistilBertTokenizerFast, DistilBertForSequenceClassification, get_scheduler

# Full `civil_comments` dataset; individual splits are looked up by name later.
dataset = load_dataset("civil_comments")
# Fast tokenizer for the same checkpoint name the model cell loads.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


class CivilCommentsDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over one pre-tokenized split of `civil_comments`
    (https://huggingface.co/datasets/civil_comments).

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is a sequence with one entry per example. Both are stored as
    given; conversion to tensors happens lazily in `__getitem__`.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the label under "labels".
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def build_data_split(split, num_data_points):
    """
    Select a class-balanced sample from one split and tokenize it.

    Rows are scanned in order. A row with ``toxicity < 0.5`` counts as civil
    and one with ``toxicity > 0.5`` as uncivil; rows at exactly 0.5 are
    skipped. Scanning stops as soon as both quotas are filled.

    Args:
        split: Name of the split to read from the module-level ``dataset``.
        num_data_points: Total number of examples wanted. Half go to each
            class; for an odd request the civil class gets the extra one.

    Returns:
        ``(encodings, labels)``: the tokenizer output for the selected texts
        (truncated and padded) and the matching raw ``toxicity`` values, in
        the same shuffled order. Fewer than ``num_data_points`` examples are
        returned if the split runs out before a quota is filled.
    """
    print(f"Generating {num_data_points} data points for {split} split...", end="", flush=True)

    # Integer quotas. With float halves an odd request never lands exactly on
    # zero, so the early exit was skipped and one extra row per class slipped in.
    num_uncivil = num_data_points // 2
    num_civil = num_data_points - num_uncivil

    civil_idx = []
    uncivil_idx = []
    split_data = dataset[split]

    for i, data in enumerate(split_data):
        if data["toxicity"] < 0.5 and num_civil > 0:
            civil_idx.append(i)
            num_civil -= 1
        elif data["toxicity"] > 0.5 and num_uncivil > 0:
            uncivil_idx.append(i)
            num_uncivil -= 1

        # `<= 0` rather than `== 0` so the scan can never run past the quotas.
        if num_civil <= 0 and num_uncivil <= 0:
            break

    indexes = civil_idx + uncivil_idx
    random.shuffle(indexes)

    # Index the split once and reuse the rows for both texts and labels.
    selected = split_data[indexes]
    encodings = tokenizer(selected["text"], truncation=True, padding=True)
    labels = selected["toxicity"]

    print("done")
    return encodings, labels


# Training split: 500 balanced examples wrapped as a torch dataset.
encodings, labels = build_data_split("train", 500)
train_dataset = CivilCommentsDataset(encodings=encodings, labels=labels)
# Validation split: same size; `encodings` / `labels` are rebound here.
encodings, labels = build_data_split("validation", 500)
val_dataset = CivilCommentsDataset(encodings=encodings, labels=labels)

# Model

In [None]:
# DistilBERT with a single-output classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)
model.dropout.p = 0
# NOTE(review): add_module only registers the submodule under the name
# "sigmoid"; nothing in this file calls it, so model outputs are presumably
# still the raw head outputs. Confirm whether it was meant to be applied.
model.add_module("sigmoid", nn.Sigmoid())

# Freeze every parameter of the pretrained base model.
model.base_model.requires_grad_(False)


train_data_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
eval_data_loader = DataLoader(val_dataset, batch_size=128)

optimizer = AdamW(model.parameters(), lr=1e-3)

# Linear schedule with no warmup, spanning every optimizer step of the run.
num_epochs = 20
num_training_steps = len(train_data_loader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One tick per training step; advanced from the training loop.
progress_bar = tqdm(range(num_training_steps))


def eval_mod(data_loader=None, net=None, target_device=None):
    """
    Compute mean squared error and thresholded accuracy over a data loader.

    Each batch must be a mapping of tensors that includes a ``"labels"``
    entry, and the network's output must expose ``.logits`` with exactly one
    value per example. A prediction is counted as correct when it falls on
    the same side of 0.5 as its label.

    Args:
        data_loader: Iterable of batches; defaults to the module-level
            ``eval_data_loader``.
        net: Model to evaluate; defaults to the module-level ``model``.
        target_device: Device batches are moved to; defaults to the
            module-level ``device``.

    Returns:
        ``(mse, accuracy)`` as 0-dim tensors, each averaged over all
        examples (not over batches).

    Raises:
        ValueError: If the data loader yields no examples.
    """
    data_loader = eval_data_loader if data_loader is None else data_loader
    net = model if net is None else net
    target_device = device if target_device is None else target_device

    squared_error_sum = torch.zeros((), device=target_device)
    correct_sum = torch.zeros((), device=target_device)
    sample_count = 0

    # Evaluate in eval mode, then put the model back in whatever mode it had.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for batch in data_loader:
                batch = {k: v.to(target_device) for k, v in batch.items()}
                labels = batch["labels"].reshape(-1)
                # Flatten (batch, 1) to (batch,). Subtracting a (batch,) tensor
                # from a (batch, 1) one would broadcast to (batch, batch) and
                # average the error over every prediction/label pair.
                predictions = net(**batch).logits.reshape(-1)

                squared_error_sum += torch.sum(torch.square(predictions - labels))
                correct_sum += torch.sum((predictions > 0.5) == (labels > 0.5))
                sample_count += labels.numel()
    finally:
        net.train(was_training)

    if sample_count == 0:
        raise ValueError("eval_mod received no examples to evaluate")

    # Dividing summed totals by the example count keeps a short final batch
    # from being over-weighted, which a mean of per-batch means would do.
    return squared_error_sum / sample_count, correct_sum / sample_count

# Main Program

In [None]:
# NOTE(review): model.train() is never called in this file, so the mode used
# during training is whatever from_pretrained left the model in. Confirm
# that is intended.
for epoch in range(num_epochs):
    losses = []
    for batch in train_data_loader:
        # Move the batch to the model's device, then run forward and backward.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # End-of-epoch report and checkpoint.
    mse_mean, accuracy_mean = eval_mod()
    loss_mean = torch.tensor(losses).mean()
    print(f" After epoch {epoch} | Train Loss: {loss_mean:.2f}, Val MSE: {mse_mean:.2f}, Val Accuracy: {accuracy_mean:.2f}")
    model.save_pretrained(f"./results/checkpoints/epoch-{epoch}")

model.save_pretrained("./results/final_model")
print("\nProgram complete")
