In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, AutoTokenizer

# optimizer, scheduler
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

from torch import nn

# multi-label classification metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#tqdm
from tqdm.auto import tqdm
#for ignoring warnings
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TOKENIZERS_PARALLELISM'] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the train / dev / test splits from the local `dataset/` directory
# and preview the first rows of the training frame.
df, dev, test = (
    pd.read_csv('dataset/train.csv'),
    pd.read_csv('dataset/dev.csv'),
    pd.read_csv('dataset/test.csv'),
)
df.head()

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat
0,427918,can't understand your gibberish but it is tota...,1,0.0,1,0,0,1,0
1,5052426,"""moderates in Trump leaning states"" and Trump'...",0,0.0,0,0,0,0,0
2,5778344,The real Trump came out to play.\n\nWhile bein...,1,0.0,1,0,1,1,0
3,5729330,There is somewhere locked in your mind that fo...,0,0.0,0,0,0,0,0
4,7057187,Acosta was nearly a complete idiot. Now he ha...,1,0.05,1,0,1,1,0


In [3]:
# Show full comment text: a max_colwidth of None disables pandas' column
# truncation (a global display option, so it also affects later cells).
pd.set_option('display.max_colwidth', None)
# Display five randomly chosen training rows (no seed is set, so the rows
# differ from run to run).
df.sample(5)

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat
7927,5718871,No mention of Fowler? I am expecting/hoping for good things from him this year. It's now or never for Benny.,0,0.0,0,0,0,0,0
17765,579523,stupid dog.,1,0.030769,1,0,0,1,0
57,7035560,Hillary is an unlikeable murderer. Trump is sticking it to middle class families and lied when he said he is in favor of lgbt rights. \n\nMy helath insurance is going up 30% due to trump. Obamacare looks amazing right now. Good job Trumpy?,1,0.043478,0,0,1,1,0
17430,380235,"Ward 1: Without getting bogged down in blaming President Bush and his buddies for Iraq and Afghanistan or President Obama for continuing our involvement in the Middle East, may I wholeheartedly agree that we're failing our ""wounded warriors"" who have returned and are still returning from deployments in the MiddleEast. Psychological wounds are destroying the lives of thousands of returning servicepersons. For whatever reasons, the VA is not addressing the many issues the young men and women are facing. They need help and support which they're not receiving. We read of incidents when someone like Brian Babb dies......yet, we continue to fail to offer adequate support for those with PTSD. To me, and obviously as I read your posts, to you, this is a national tragedy. \n\nI'd like to join you in working to get that support for the young men and women. What can we do? thanks and regards, Gary",0,0.0,0,0,0,0,0
18751,833158,"Exactly. I bet all of the people he listed drank water, too. Should we conclude that drinking water makes people violent? One of the first things you learn in a research methods class is that you can't infer causation from a correlation. And in this case, i'm not sure there is really a significant correlation. The plural of anecdote is not data.",0,0.0,0,0,0,0,0


In [4]:
# Remove the identifier and the two aggregate toxicity columns from every
# split; none of them is among the label columns read by ToxicityDataset.
DROP_COLUMNS = ['id', 'severe_toxicity', 'toxicity']
for frame in (df, dev, test):
    # The frames are modified in place (same objects as loaded above).
    # errors='ignore' makes the cell safe to re-run: without it a second
    # execution raises KeyError because the columns are already gone.
    frame.drop(columns=DROP_COLUMNS, inplace=True, errors='ignore')
df.shape

(36000, 6)

In [5]:
# Label column names, taken positionally: every column after the first one
# (presumably `comment_text` is the first remaining column - confirm).
# NOTE(review): this is the DataFrame column order, which is NOT the label
# order hard-coded in ToxicityDataset; downstream code only uses len(cols),
# so do not use `cols` to name entries of per-label metric vectors.
cols = df.columns[1:]
cols

Index(['obscene', 'sexual_explicit', 'identity_attack', 'insult', 'threat'], dtype='object')

In [6]:
# hyperparameters

# --- model / data ---
NUM_LABELS = 5                     # size of the classification head
MAX_LENGTH = 512                   # tokenizer max_length used for the train/dev/test datasets
BATCH_SIZE = 64                    # batch size shared by all DataLoaders
MODEL_NAME = 'saved_target_model'  # presumably the checkpoint to fine-tune from - confirm

# --- optimisation ---
LR = 2e-5                          # AdamW learning rate
NUM_WARMUP_STEPS = 3600            # warm-up steps of the linear schedule
NUM_TRAIN_EPOCHS = 20              # multiplied by len(train_loader) to get the total step count

# --- bookkeeping (both measured in optimizer steps) ---
NUM_LOG_STEPS = 2250               # validation / logging interval
NUM_SAVE_STEPS = 2250              # checkpoint interval

In [7]:
# Multi-label dataset
class ToxicityDataset(Dataset):
    """Map-style dataset yielding tokenized comments with multi-label targets.

    Args:
        df: DataFrame with a ``comment_text`` column and one column per label.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
            that returns a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Length every example is padded / truncated to.
        label_cols: Optional ordered sequence of label column names. When
            omitted, ``DEFAULT_LABEL_COLS`` is used.

    Each item is a dict with ``input_ids`` (long), ``attention_mask`` (long)
    and ``labels`` (float) tensors; ``labels[i]`` belongs to ``label_cols[i]``.
    """

    # Order of the entries in every `labels` vector (and therefore of the
    # model's output logits). NOTE: this is not the DataFrame column order,
    # so per-label metrics must be named with `label_cols`, not `df.columns`.
    DEFAULT_LABEL_COLS = ('obscene', 'threat', 'sexual_explicit', 'insult', 'identity_attack')

    def __init__(self, df, tokenizer, max_len, label_cols=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_cols is None:
            label_cols = self.DEFAULT_LABEL_COLS
        self.label_cols = list(label_cols)
        self.labels = df[self.label_cols].values
        # Cache the texts once: `df.iloc[idx]` would build a full row Series
        # on every __getitem__ call just to read a single string.
        self.texts = df['comment_text'].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        comment = self.texts[idx]
        inputs = self.tokenizer(
            comment,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Tokenizer of the base RoBERTa checkpoint.
# NOTE(review): the model cell loads its weights from a different checkpoint;
# presumably both share this vocabulary - confirm.
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quick smoke test: build a dataset with a short max length (128) and show
# the first encoded example.
dataset = ToxicityDataset(df=df, tokenizer=tokenizer, max_len=128)
dataset[0]

{'input_ids': tensor([    0,  7424,    75,  1346,   110, 25863,  1943,  1173,    53,    24,
            16,   746, 20175,     4,  1437,  1437,   114,    79,    34,   143,
          2956,    59,   878,    13,   558,    11,  6467,     6,  4309,    24,
             6,    79,    16, 20260,     4,  1437,    42,  1160,    40, 28297,
            69,  6000,     4,  1437,    79,   429,    25,   157,   517,   124,
             7, 45968,   281,    50,   277,   194,     4,  1437,    82,   218,
            75,  4309, 27338,  2258,     4,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [8]:
# One ToxicityDataset per split, all tokenized to MAX_LENGTH tokens.
train_dataset, dev_dataset, test_dataset = (
    ToxicityDataset(split, tokenizer, MAX_LENGTH) for split in (df, dev, test)
)

# dataloaders
# Settings shared by every loader; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
dev_loader = DataLoader(dev_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [9]:
# Number of batches the training loader yields per pass over the data.
len(train_loader)

563

In [10]:
# model, optimizer, scheduler, loss_fn

# Load the checkpoint named by MODEL_NAME with a NUM_LABELS-wide multi-label
# head. ignore_mismatched_sizes=True lets a classifier head whose shape
# differs from the checkpoint's be newly initialized instead of raising.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)
model.to('cuda')
target_layer = 'roberta.encoder.layer.10'

# Partial fine-tuning: freeze the whole network, then re-enable gradients for
# `target_layer` and every module that follows it in `named_modules()` order.
# Freezing first makes the result independent of the traversal visiting
# parent modules before the target.
model.requires_grad_(False)
unfreeze = False
for name, layer in model.named_modules():
    if name == target_layer:
        unfreeze = True
    if unfreeze:
        layer.requires_grad_(True)
if not unfreeze:
    # Fail fast on a mistyped layer name instead of silently freezing everything.
    raise ValueError(f"target_layer {target_layer!r} not found in model.named_modules()")

criterion = nn.BCEWithLogitsLoss()
# Only parameters that still require gradients are handed to the optimizer.
optimizer = AdamW((p for p in model.parameters() if p.requires_grad), lr=LR)
# The linear schedule is stepped once per batch, so its horizon is
# batches-per-epoch times epochs.
NUM_TRAIN_STEPS = len(train_loader) * NUM_TRAIN_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=NUM_WARMUP_STEPS,
    num_training_steps=NUM_TRAIN_STEPS,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at saved_target_model and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Total number of training steps: batches per pass times the number of epochs
# (the same product the model-setup cell stores as NUM_TRAIN_STEPS).
TOTAL_STEPS = len(train_loader) * NUM_TRAIN_EPOCHS
TOTAL_STEPS

11260

In [12]:
import os

import torch
import numpy as np
from tqdm import tqdm


def evaluate_loader(model, loader, criterion, num_labels, device='cuda'):
    """Run one no-grad pass over `loader` and return its mean loss and accuracies.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        criterion: Loss applied to ``(logits, labels)``.
        num_labels: Number of label columns (length of the accuracy vector).
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and a numpy
        array with one accuracy per label column (sigmoid > 0.5 threshold).

    The model is left in eval mode; callers switch back to train mode.
    """
    model.eval()
    running_loss = 0
    correct = np.zeros(num_labels)
    seen = np.zeros(num_labels)

    with torch.no_grad():
        for eval_batch in tqdm(loader, leave=False, total=len(loader)):
            eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
            eval_logits = model(
                eval_batch['input_ids'], attention_mask=eval_batch['attention_mask']
            ).logits
            running_loss += criterion(eval_logits, eval_batch['labels']).item()

            targets = eval_batch['labels'].cpu().numpy()
            hits = (torch.sigmoid(eval_logits).cpu().numpy() > 0.5) == targets
            correct += np.sum(hits, axis=0)
            seen += targets.shape[0]

    return running_loss / len(loader), correct / seen


TOTAL_STEPS = len(train_loader) * NUM_TRAIN_EPOCHS
loss_i = []                  # training loss of every step
val_loss, val_acc = [], []   # one entry per logged validation pass
saved_models = []            # (checkpoint_path, val_loss) of the two best checkpoints
train_iter = iter(train_loader)

model.train()
for step in tqdm(range(TOTAL_STEPS)):
    # Training is step-based: restart the loader whenever it is exhausted.
    try:
        batch = next(train_iter)
    except StopIteration:
        train_iter = iter(train_loader)
        batch = next(train_iter)

    optimizer.zero_grad()

    batch = {k: v.to('cuda') for k, v in batch.items()}

    logits = model(batch['input_ids'], attention_mask=batch['attention_mask']).logits
    loss = criterion(logits, batch['labels'])

    loss.backward()
    optimizer.step()
    scheduler.step()

    loss_i.append(loss.item())

    log_due = step > 0 and step % NUM_LOG_STEPS == 0
    save_due = step > 0 and step % NUM_SAVE_STEPS == 0
    if not (log_due or save_due):
        continue

    # Validate whenever either action is due, so the checkpoint decision never
    # reads a stale (or not yet defined) validation loss when the two
    # intervals are not aligned.
    val_loss_epoch, accuracy = evaluate_loader(model, dev_loader, criterion, len(cols))
    model.train()

    if log_due:
        val_loss.append(val_loss_epoch)
        val_acc.append(accuracy)

        print(f"Step {step} | Training Loss: {loss_i[-1]:.4f} | Validation Loss: {val_loss_epoch:.4f} | Validation Accuracy: {accuracy}")

    if save_due:
        # Save while fewer than two checkpoints are kept, or when this one
        # beats the worst checkpoint currently kept.
        if len(saved_models) < 2 or val_loss_epoch < max(saved_models, key=lambda x: x[1])[1]:
            model_save_path = f'model_checkpoint_step_{step}.pt'
            torch.save(model.state_dict(), model_save_path)
            saved_models.append((model_save_path, val_loss_epoch))
            saved_models.sort(key=lambda x: x[1])

            # Remove checkpoints that fell out of the top two, so files no
            # longer tracked in `saved_models` do not pile up on disk.
            for stale_path, _ in saved_models[2:]:
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            saved_models = saved_models[:2]

            print(f"Model checkpoint saved at {model_save_path}")


  0%|          | 0/11260 [00:00<?, ?it/s]

  1%|          | 86/11260 [01:49<3:57:18,  1.27s/it]


KeyboardInterrupt: 

In [None]:
# test the model
from collections import defaultdict

# Inference mode for the pass over the test loader.
model.eval()
samples = defaultdict(list)   # decoded text, gold labels and predictions per example
test_loss = 0
correct_predictions = np.zeros(len(cols))
total_predictions = np.zeros(len(cols))

with torch.no_grad():
    for batch in tqdm(test_loader):
        batch = {key: tensor.to('cuda') for key, tensor in batch.items()}
        logits = model(batch['input_ids'], attention_mask=batch['attention_mask']).logits
        loss = criterion(logits, batch['labels'])
        test_loss += loss.item()

        # Threshold the sigmoid outputs at 0.5 to get one boolean per label.
        labels = batch['labels'].cpu().numpy()
        preds = torch.sigmoid(logits).cpu().numpy() > 0.5

        # Per-label tallies: matches, and number of examples seen.
        correct_predictions += (preds == labels).sum(axis=0)
        total_predictions += labels.shape[0]

        # Keep the decoded text next to its labels / predictions for inspection.
        decoded_text = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)
        samples['text'].extend(decoded_text)
        samples['labels'].extend(labels)
        samples['preds'].extend(preds)

# Mean loss over batches and one accuracy value per label.
total_loss = test_loss / len(test_loader)
total_accuracy = correct_predictions / total_predictions

print(f"Test Loss: {total_loss:.4f} | Test Accuracy: {total_accuracy}")
    

 15%|█▍        | 186/1250 [00:20<01:59,  8.90it/s]


KeyboardInterrupt: 

In [34]:
import random

# Pair every decoded comment with its gold labels and thresholded predictions.
rows = list(zip(samples['text'], samples['labels'], samples['preds']))

# Draw up to 100 examples without replacement. The size is clamped because
# random.sample raises ValueError when asked for more items than exist,
# e.g. after an evaluation run that was interrupted early.
random_100 = random.sample(rows, min(100, len(rows)))

# Persist the sample so it can be inspected later without re-running inference.
dump = pd.DataFrame(random_100, columns=['text', 'labels', 'preds'])
dump.to_csv('random_100.csv', index=False)

In [3]:
# pandas is imported again here, presumably so this cell can run on its own
# in a fresh kernel - confirm.
import pandas as pd
# Disable column truncation so the full comment text is displayed.
pd.set_option('display.max_colwidth', None)
# Reload the sample written to 'random_100.csv' and show ten random rows.
dump = pd.read_csv('random_100.csv')
dump.sample(10)

Unnamed: 0,text,labels,preds
20,"I believe in climate change, always have. However I'm more concerned about the potential for a new ice age (which also has growing scientific support) than I am about warming trends. The reality is according to many climate change models, when it comes to a warming climate Canada is (will be) a net beneficiary (agriculture Canada has a good summary). So rather than ""fighting"" global warming we should be encouraging it.",[0. 0. 0. 0. 0.],[False False False False False]
51,Is there any writers at ADN that will report honestly and fairly to both sides of an issue?,[0. 0. 0. 0. 0.],[False False False False False]
15,When will the media have the courage to interview the parents who brought their kids here illegally and ask why they never tried to gain legal citizenship?\n\nInstead they always blame the laws on the books. These kids should go home and talk to their parents and not walk out in a protest. Hopefully we do not cave on this issue.,[0. 0. 0. 0. 0.],[False False False False False]
95,"Yeah, who doesn't want more of this.\nBC Hydro’s real debt has grown 1337% under Liberals\nhttp://commonsensecanadian.ca/bc-hydro-real-debt-grown-1337-liberals-shouldnt-someone-call-cops/\n\n98 BC Liberal Falsehoods, Boondoggles and Scandals: The Clark Era 53\nhttps://thetyee.ca/Opinion/2017/04/04/BC-Liberal-Falsehoods-Scandals-Clark/",[0. 0. 0. 0. 0.],[False False False False False]
65,"A team of home-schooled children from Wasilla out tested the entire rest of the state and will be representing Alaska at the 2016 National Academic Decathlon.\n\nAlaska is second in the nation with per capita education spending of $18,175 per student, while simultaneously holding the shameful distinction of having 63% of 8th grade students not proficient in either reading or math. (source NAEP)\n\nThese big government money wasters should get their own house in order, before pointing fingers at home-schoolers.",[0. 0. 0. 0. 0.],[False False False False False]
43,"good idea!\npoop on the nay-sayers until their kids, or mother are the one hit or killed, then they'll change their minds.",[1. 1. 0. 1. 1.],[False False False False False]
72,Businesses should not be taxed anything more than municipal property tax. If the Libs had any tact they'd be making business-friendly moves to compete with USA. They'd also be managing HST and income tax revenues responsibly.,[0. 0. 0. 0. 0.],[False False False False False]
9,"Sure, then let's have a bloated, egotistical, trash talking, racist carry the nuclear codes.\nPerhaps you need a little more sleep?",[1. 1. 0. 1. 1.],[ True False False True True]
83,What an utter buffoon.,[1. 0. 0. 1. 0.],[ True False False True True]
35,Wallonia does not suppress Flemish language the way Quebec suppresses English language,[0. 0. 0. 0. 0.],[False False False False False]


In [23]:
# Scratch check of the per-column accuracy bookkeeping on toy data:
# eight rows of three labels, where only the first row of `preds` differs
# from `labels`. NOTE: this rebinds the notebook-level `labels` and `preds`.
labels = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1]])
preds = np.array([[0, 0, 1], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1]])
# Accumulate the per-column match counts into a torch tensor; the recorded
# output shows the result comes back as a float64 tensor.
a = torch.zeros(3)
a += np.sum(preds == labels, axis=0)
a

tensor([8., 7., 7.], dtype=torch.float64)