# RoBERTA Multilabel classification
In first place I chose BERT and DistilBERT, but RoBERTA turned out to be the best option eventually

In [1]:
import pandas as pd
import numpy as np
import random

import torch
import warnings

from tqdm import tqdm

from ydata_profiling import ProfileReport

from torch.nn import BCEWithLogitsLoss
from transformers import RobertaTokenizerFast, \
RobertaModel, Trainer, TrainingArguments,EvalPrediction, TrainerCallback

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaClassificationHead
from torch.utils.data import DataLoader

from skmultilearn.model_selection import iterative_train_test_split

#import wandb # explore for later
# import os
# os.environ["WANDB_DISABLED"] = "true"
%matplotlib inline

In [2]:
gpu_ok = False
if torch.cuda.is_available():
    device_cap = torch.cuda.get_device_capability()
    if device_cap in ((7, 0), (8, 0), (9, 0)):
        gpu_ok = True
if not gpu_ok:
    warnings.warn(
        "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower "
        "than expected."
    )



In [3]:
# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# 1. Preprocessing Data

ETL steps

## Load dataset

In [4]:
df_train = pd.read_csv('train.csv')
df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


## Data analysis

In [12]:
profile_report_train = ProfileReport(df_train, title='Toxic Comments train dataset', explorative=True)
profile_report_train.to_notebook_iframe()

# We have here data imbalance

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Augmentation (Back translation from English to German and vice versa)

In [5]:
print(torch.version.cuda)
print(torch.backends.cudnn.enabled)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

torch.set_float32_matmul_precision('high')

11.8
True
Using device: cuda


In [6]:
import pandas as pd
from langdetect import detect, DetectorFactory
from transformers import MarianMTModel, MarianTokenizer
import torch
from tqdm import tqdm

# Ensure consistent results from langdetect
DetectorFactory.seed = 0

class BackTranslator:
    def __init__(self, from_lang='en', to_lang='de', device='cuda'):
        self.from_lang = from_lang
        self.to_lang = to_lang
        self.device = device
        self.from_model_name = f'Helsinki-NLP/opus-mt-{from_lang}-{to_lang}'
        self.to_model_name = f'Helsinki-NLP/opus-mt-{to_lang}-{from_lang}'

        # Initialize model and tokenizer
        self.from_model = MarianMTModel.from_pretrained(self.from_model_name).to(self.device)
        self.from_model = torch.compile(self.from_model, mode="max-autotune", fullgraph=True)
        self.to_model = MarianMTModel.from_pretrained(self.to_model_name).to(self.device)
        self.to_model = torch.compile(self.to_model, mode="max-autotune", fullgraph=True)
        self.from_tokenizer = MarianTokenizer.from_pretrained(self.from_model_name)
        self.to_tokenizer = MarianTokenizer.from_pretrained(self.to_model_name)

    def translate_batch(self, batch):
        translations = []
        for text in batch:
            if not text.strip():  # Skip empty or whitespace-only texts
                translations.append('')
                continue
            encoded_text = self.from_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(self.device)
            translated = self.to_model.generate(**encoded_text)
            decoded_text = self.to_tokenizer.batch_decode(translated, skip_special_tokens=True)
            translations.extend(decoded_text)
        return translations

def detect_languages(texts):
    languages = []
    for text in tqdm(texts, desc='Detecting languages', unit='text'):
        try:
            lang = detect(text)
            languages.append(lang)
        except Exception as e:
            print(f"Language detection failed for text: {text}. Error: {e}")
            languages.append(None)  # Use None for texts where detection fails
    return languages

def back_translate(df, languages, from_lang='en', to_lang='de', device='cuda', batch_size=32):
    translator = BackTranslator(from_lang=from_lang, to_lang=to_lang, device=device)
    
    # Filter English comments
    english_comments = [df['comment_text'][i] for i, lang in enumerate(languages) if lang == 'en']
    
    # Translate English comments
    translated_comments = []
    for i in tqdm(range(0, len(english_comments), batch_size), desc='Translating comments', unit='batch'):
        batch = english_comments[i:i + batch_size]
        translated_batch = translator.translate_batch(batch)
        translated_comments.extend(translated_batch)
    
    # Map translations back to the original indices
    augmented_comments = []
    translation_index = 0
    for lang in languages:
        if lang == 'en':
            augmented_comments.append(translated_comments[translation_index])
            translation_index += 1
        else:
            augmented_comments.append(df['comment_text'][len(augmented_comments)])
    
    # Create a new DataFrame with the augmented data
    augmented_df = df.copy()
    augmented_df['comment_text'] = augmented_comments
    
    return augmented_df

In [None]:
languages = detect_languages(df_train['comment_text'])
languages

In [None]:
augmented_df = back_translate(df_train, languages, from_lang='en', to_lang='de', device='cuda', batch_size=64)
augmented_df

In [None]:
# augmented_df['augmented'] = 1
# df_train['augmented'] = 0
final_df_train = pd.concat([df_train, augmented_df], ignore_index=True)
final_df_train

## Lable weights


In [7]:
texts = df_train['comment_text'].tolist()
labels = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

labels = np.array(labels, dtype=int)

label_weights = 1 - labels.sum(axis=0) / labels.sum()
label_weights

array([0.56424868, 0.95455582, 0.75927403, 0.98638099, 0.77557126,
       0.95996923])

## Train, Dev split

In [8]:
test_size = 0.2
row_ids = np.arange(len(labels))
train_idx, _, test_idx, _ = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=test_size)

train_dataset = df_train.iloc[train_idx.flatten()].reset_index(drop=True)
test_dataset = df_train.iloc[test_idx.flatten()].reset_index(drop=True)

train_dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
train_dataset['labels'] = train_dataset[train_dataset.columns[2:]].values.tolist()
train_dataset = train_dataset[['id','comment_text', 'labels']].reset_index(drop=True)
test_dataset['labels'] = test_dataset[test_dataset.columns[2:]].values.tolist()
test_dataset = test_dataset[['id','comment_text', 'labels']].reset_index(drop=True)
train_dataset

Unnamed: 0,id,comment_text,labels
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"
...,...,...,...
127651,ffe029a7c79dc7fe,"""\nplease identify what part of BLP applies be...","[0, 0, 0, 0, 0, 0]"
127652,ffe897e7f7182c90,Catalan independentism is the social movement ...,"[0, 0, 0, 0, 0, 0]"
127653,ffe987279560d7ff,""":::::And for the second time of asking, when ...","[0, 0, 0, 0, 0, 0]"
127654,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,"[0, 0, 0, 0, 0, 0]"


## Handling the Data class

In [10]:
# instantiate a class that will handle the data
class Data_Processing(object):
    def __init__(self, tokenizer, id_column, text_column, label_column):
        
        # define the text column from the dataframe
        self.text_column = text_column.tolist()
    
        # define the label column and transform it to list
        
        self.label_column = label_column
        
        # define the id column and transform it to list
        self.id_column = id_column.tolist()
        
    
# iter method to get each element at the time and tokenize it using bert        
    def __getitem__(self, index):
        comment_text = str(self.text_column[index])
        comment_text = " ".join(comment_text.split())
        
        inputs = tokenizer.encode_plus(comment_text,
                                       add_special_tokens = True,
                                       max_length= 512,
                                       padding = 'max_length',
                                       return_attention_mask = True,
                                       truncation = True,
                                       return_tensors='pt')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels_ = torch.tensor(self.label_column[index], dtype=torch.float)
            
        id_ = self.id_column[index]
        return {'input_ids':input_ids[0], 'attention_mask':attention_mask[0], 
                'labels':labels_, 'id_':id_}
  
    def __len__(self):
        return len(self.text_column)

## Tokenization

In [11]:
batch_size = 32
# create a class to process the traininga and test data
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                          padding = 'max_length',
                                          truncation=True, 
                                          max_length = 512)
training_data = Data_Processing(tokenizer, 
                                train_dataset['id'], 
                                train_dataset['comment_text'], 
                                train_dataset['labels'])

test_data =  Data_Processing(tokenizer, 
                             test_dataset['id'], 
                             test_dataset['comment_text'], 
                             test_dataset['labels'])

# use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=2),
                    'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=2)
                   }

dataset_sizes = {'train':len(training_data),
                 'val':len(test_data)
                }



In [12]:
# check we are getting the right output
a = next(iter(dataloaders_dict['val']))
a
#len(dataloaders_dict['train'])

{'input_ids': tensor([[    0,   113,  1491,  ...,     1,     1,     1],
         [    0, 32541,     6,  ...,     1,     1,     1],
         [    0,   113,  1491,  ...,     1,     1,     1],
         ...,
         [    0,   113,   345,  ...,     1,     1,     1],
         [    0,   113,  3180,  ...,     1,     1,     1],
         [    0,   500,    12,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 0., 1., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0.,

In [13]:
class RobertaForMultiLabelSequenceClassification(RobertaPreTrainedModel):
    """
    We instantiate a class of LongFormer adapted for a multilabel classification task. 
    This instance takes the pooled output of the LongFormer based model and passes it through a
    classification head. We replace the traditional Cross Entropy loss with a BCE loss that generate probabilities
    for all the labels that we feed into the model.
    """

    def __init__(self, config, pos_weight=None):
        super(RobertaForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, global_attention_mask=None, 
                token_type_ids=None, position_ids=None, inputs_embeds=None, 
                labels=None):
        
        # create global attention on sequence, and a global attention token on the `s` token
        # the equivalent of the CLS token on BERT models
        # pass arguments to longformer model
        outputs = self.roberta(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            position_ids = position_ids)
        
        # if specified the model can return a dict where each key corresponds to the output of a
        # LongformerPooler output class. In this case we take the last hidden state of the sequence
        # which will have the shape (batch_size, sequence_length, hidden_size). 
        sequence_output = outputs['last_hidden_state']
        
        # pass the hidden states through the classifier to obtain thee logits
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
            labels = labels.float()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.view(-1, self.num_labels))
            outputs = (loss,) + outputs
        
        
        return outputs

# Creating the model

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Device name: NVIDIA GeForce RTX 3060 Laptop GPU


In [15]:
model = RobertaForMultiLabelSequenceClassification.from_pretrained("roberta-base",
                                                                   num_labels = 6,
                                                                   cache_dir='./roberta_model_cache',
                                                                   return_dict=True)
model.to(device)

torch.set_float32_matmul_precision('high')

model

Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForMultiLabelSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

## Metrics: f1, roc_auc, accuracy

In [16]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
    
def multi_label_metric(
    predictions, 
    references, 
    ):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_true = references
    y_pred[np.where(probs >= 0.5)] = 1
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1':f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metric(
        predictions=preds, 
        references=p.label_ids
    )
    return result

## Save best model Callback

In [17]:
class SaveBestModelCallback(TrainerCallback):
    def __init__(self, metric_name="roc_auc"):
        self.best_score = -float('inf')
        self.metric_name = metric_name

    def on_train_begin(self, args, state, control, **kwargs):
        assert args.eval_strategy != "no", "SaveBestModelCallback requires an evaluation strategy of steps or epoch"

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        metric_value = metrics.get(self.metric_name)
        if metric_value is None:
            print(f"Warning: Metric '{self.metric_name}' not found in evaluation metrics.")
            return
        
        if metric_value > self.best_score:
            print(f"** {self.metric_name} improved from {np.round(self.best_score, 4)} to {np.round(metric_value, 4)} **")
            self.best_score = metric_value
            control.should_save = True
        else:
            print(f"{self.metric_name} score {np.round(metric_value, 4)} (Prev. Best {np.round(self.best_score, 4)})")

## Training arguments

In [18]:
training_args = TrainingArguments(
    output_dir = './roberta_trainer',
    disable_tqdm = False,
    run_name = 'roberta_multilabel_trainer_jigsaw_eval',
    warmup_steps = 1000,
    eval_strategy = "steps",
    eval_steps=500,
    dataloader_num_workers = 0,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps = 4,
    logging_dir='./roberta_logs',
    fp16 = True, # False for better results, but demands more GPU memory
    per_device_train_batch_size = 32,
    per_device_eval_batch_size= 16,
    gradient_accumulation_steps = 16,
    gradient_checkpointing=True,
    num_train_epochs = 4,
    save_strategy="no",
    save_total_limit=1,
)

## Trainer

In [19]:
# instantiate the trainer class and check for available devices

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=test_data,
    compute_metrics = compute_metrics,
    #data_collator = Data_Processing(),
    callbacks=[SaveBestModelCallback(metric_name="roc_auc")]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Train

In [20]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
500,0.0441,0.043362,0.784367,0.886924,0.92549




  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=996, training_loss=0.09711774177550073, metrics={'train_runtime': 21230.9406, 'train_samples_per_second': 24.051, 'train_steps_per_second': 0.047, 'total_flos': 1.3508495758614528e+17, 'train_loss': 0.09711774177550073, 'epoch': 3.993984962406015})

In [21]:
trainer.save_model()

tokenizer.save_pretrained('./roberta_trainer')

('./roberta_trainer/tokenizer_config.json',
 './roberta_trainer/special_tokens_map.json',
 './roberta_trainer/vocab.json',
 './roberta_trainer/merges.txt',
 './roberta_trainer/added_tokens.json',
 './roberta_trainer/tokenizer.json')

In [22]:
trainer.evaluate()



{'eval_loss': 0.038854245096445084,
 'eval_f1': 0.7950481430536451,
 'eval_roc_auc': 0.9069646976623721,
 'eval_accuracy': 0.9267429108569638,
 'eval_runtime': 339.9139,
 'eval_samples_per_second': 93.891,
 'eval_steps_per_second': 5.869,
 'epoch': 3.993984962406015}

## End of training (restart kernel)

In [None]:
torch.cuda.empty_cache()

# 2. Inference

In [1]:
import pandas as pd
import numpy as np
import random

import torch
import warnings

from tqdm import tqdm

from ydata_profiling import ProfileReport

from torch.nn import BCEWithLogitsLoss
from transformers import RobertaTokenizerFast, \
RobertaModel, Trainer, TrainingArguments,EvalPrediction, TrainerCallback

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaClassificationHead
from torch.utils.data import DataLoader

from skmultilearn.model_selection import iterative_train_test_split

#import wandb # explore for later
# import os
# os.environ["WANDB_DISABLED"] = "true"
%matplotlib inline

In [2]:
print(torch.version.cuda)
print(torch.backends.cudnn.enabled)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

torch.set_float32_matmul_precision('high')

11.8
True
Using device: cuda


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Device name: NVIDIA GeForce RTX 3060 Laptop GPU


## Load the trained model

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define the path where your model is saved
model_path = 'roberta_trainer'

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
# Load the tokenizer (if you saved it)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
insults_test = pd.read_csv('test.csv')

insults_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [6]:
# instantiate a class that will handle the data
class Data_Processing_test():
    def __init__(self, tokenizer, id_column, text_column):
        
        # define the text column from the dataframe
        self.text_column = text_column.tolist()
                    
        # define the id column and transform it to list
        self.id_column = id_column.tolist()
            
# iter method to get each element at the time and tokenize it using bert        
    def __getitem__(self, index):
        comment_text = str(self.text_column[index])
        comment_text = " ".join(comment_text.split())
        
        inputs = tokenizer.encode_plus(comment_text,
                                       add_special_tokens = True,
                                       max_length= 512,
                                       padding = 'max_length',
                                       return_attention_mask = True,
                                       truncation = True,
                                       return_tensors='pt')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        id_ = self.id_column[index]
        return {'input_ids':input_ids[0], 'attention_mask':attention_mask[0], 
                'id_':id_}
  
    def __len__(self):
        return len(self.text_column) 

In [7]:
batch_size = 64
# create a class to process the traininga and test data
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                          padding = 'max_length',
                                          truncation=True, 
                                          max_length = 512)
test_data_pred =  Data_Processing_test(tokenizer,
                                       insults_test['id'], 
                                       insults_test['comment_text'])

# use the dataloaders class to load the data
dataloaders_dict = {'test': DataLoader(test_data_pred,
                                                 batch_size=batch_size, shuffle=True, num_workers=2)}



## Embeddings and Prediction Save

After 1 hour of training:\
Private Score: 0.9754\
Public score: 0.9744

In [8]:
def prediction_and_embeddings():
    prediction_data_frame_list = []
    embeddings_data_frame_list = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloaders_dict['test'], desc="Predicting")):  # Wrap the loop with tqdm
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # feed the sequences to the model, specifying the attention mask
            outputs = model(inputs, attention_mask=attention_mask, output_hidden_states=True)
            
            hidden_states = outputs.hidden_states
            last_hidden_states = hidden_states[-1].mean(dim=1).cpu().numpy()

            ids = np.array(batch['id_'])
            embeddings_df = pd.DataFrame(last_hidden_states, index=ids)
            embeddings_data_frame_list.append(embeddings_df)

            # Apply sigmoid to get probabilities
            sigmoid = torch.nn.Sigmoid()
            probs = sigmoid(torch.Tensor(outputs[0].detach().cpu().data.numpy()))
            
            # Convert probabilities to numpy array
            probs = np.array(probs)
            
            # Store predictions
            y_pred = np.zeros(probs.shape)
            y_pred = probs
            temp_data = pd.DataFrame(zip(batch['id_'], probs), columns=['id', 'target'])
            prediction_data_frame_list.append(temp_data)

    embeddings_file = 'test_embeddings.csv'
    predictions_file = 'test_predictions.csv'

    all_embeddings_df = pd.concat(embeddings_data_frame_list)
    all_embeddings_df.to_csv(embeddings_file, index_label='id')

    prediction_df = pd.concat(prediction_data_frame_list)
    prediction_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = pd.DataFrame(prediction_df.target.tolist(), index=prediction_df.index)
    prediction_df = prediction_df.drop(columns='target')
    prediction_df.to_csv(predictions_file, index=False)

prediction_and_embeddings()

Predicting: 100%|██████████| 2394/2394 [37:24<00:00,  1.07it/s]


In [None]:
torch.cuda.empty_cache()

# Pseudo-labelling
## Filter (-1) test labels

In [1]:
import pandas as pd

# Load the datasets
test = pd.read_csv("test.csv")          # Contains the text data
test_labels = pd.read_csv("test_labels.csv")  # Contains the real labels
predictions = pd.read_csv("submission.csv")   # Contains the predicted labels

# Step 1: Merge test data with test labels
merged_df = pd.merge(test, test_labels, on='id', how='left')

# Step 2: Filter rows where any label is -1
# This will keep rows where at least one label is -1
filtered_df = merged_df[(merged_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] == -1).any(axis=1)]

# Step 3: Replace labels with predictions from submission.csv
# Ensure predictions match the format and order of test_labels
# Convert predictions to binary (1 if prob > 0.5, else 0)
binary_predictions = predictions.copy()
binary_predictions.iloc[:, 1:] = (predictions.iloc[:, 1:] > 0.5).astype(int)
binary_predictions.to_csv('test_binary_predictions.csv', index=False)

# Merge the filtered dataframe with binary predictions to replace labels
final_merged_df = pd.merge(filtered_df[['id']], binary_predictions, on='id', how='left')

# Step 4: Save the final result to a new CSV file
final_merged_df.to_csv("final_filtered_predictions.csv", index=False)

print("Filtered and updated data saved to 'final_filtered_predictions.csv'.")

Filtered and updated data saved to 'final_filtered_predictions.csv'.


## Check amount of rows

In [2]:
negative_toxicity_count = test_labels[test_labels['toxic'] == -1].shape[0]
negative_toxicity_count

89186

In [3]:
final_merged_df.shape[0]

89186

## Final merged train

In [4]:
import pandas as pd

# Load datasets
test = pd.read_csv("test.csv")  # Contains text data
train = pd.read_csv("train.csv")  # Contains additional data
final_filtered_predictions = pd.read_csv("final_filtered_predictions.csv")  # Filtered predictions

# Step 1: Join final_filtered_predictions with test to include text
# Reset index so 'id' is a column for merging
test.reset_index(inplace=True)
final_filtered_predictions.reset_index(inplace=True)

# Join on 'id'
merged_with_text = pd.merge(final_filtered_predictions, test, on='id', how='left')

# Step 2: Clean column names if necessary
# Identify columns with suffixes '_x' and '_y'
columns_with_suffix = [col for col in merged_with_text.columns if col.endswith('_x') or col.endswith('_y')]

# Remove columns with '_y' suffixes as they are likely duplicates
columns_to_keep = [col for col in merged_with_text.columns if not col.endswith('_y')]
merged_with_text = merged_with_text[columns_to_keep]

# Rename columns from '_x' to correct names if necessary
column_rename = {
    'toxic_x': 'toxic',
    'severe_toxic_x': 'severe_toxic',
    'obscene_x': 'obscene',
    'threat_x': 'threat',
    'insult_x': 'insult',
    'identity_hate_x': 'identity_hate',
    'comment_text_x': 'comment_text'
}

# Apply renaming
merged_with_text.rename(columns=column_rename, inplace=True)

# Step 3: Join the result with train.csv
# Reset index to include 'id' in the final merge
train.reset_index(inplace=True)

# Concatenate merged_with_text and train
final_merged_with_train = pd.concat([merged_with_text, train], ignore_index=True)

# Ensure the final dataframe has the desired column order
final_merged_with_train = final_merged_with_train[['id' ,'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Save to CSV, including 'id' as a column
final_merged_with_train.to_csv("final_merged_with_train.csv", index=False)

final_merged_with_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,1.0,0.0,1.0,0.0,1.0,0.0
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
248752,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0.0,0.0,0.0,0.0,0.0,0.0
248753,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0.0,0.0,0.0,0.0,0.0,0.0
248754,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0.0,0.0,0.0,0.0,0.0,0.0
248755,fff125370e4aaaf3,And it looks like it was actually you who put ...,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
final_merged_with_train_idx = pd.Index(final_merged_with_train.index)
final_merged_with_train_dataset = final_merged_with_train.iloc[final_merged_with_train_idx].reset_index(drop=True)
final_merged_with_train_dataset['labels'] = final_merged_with_train_dataset[final_merged_with_train_dataset.columns[2:]].values.tolist()
final_merged_with_train_dataset = final_merged_with_train_dataset[['id','comment_text', 'labels']].reset_index(drop=True)
final_merged_with_train_dataset

Unnamed: 0,id,comment_text,labels
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0]"
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,00017563c3f7919a,":If you have a look back at the source, the in...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,00017695ad8997eb,I don't anonymously edit articles at all.,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...,...
248752,ffe987279560d7ff,""":::::And for the second time of asking, when ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
248753,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
248754,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
248755,fff125370e4aaaf3,And it looks like it was actually you who put ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


# 3. Ensemble Boosting (LightGBM)

## Load data

In [6]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import os
from skmultilearn.model_selection import iterative_train_test_split

# Load data
embeddings_df = pd.read_csv('test_embeddings.csv', index_col='id')
labels_df = pd.read_csv('test_binary_predictions.csv', index_col='id')
data_df = embeddings_df.join(labels_df)

## Train LightGBM models

In [7]:
# Folder to save the models
model_dir = "lightgbm_models"
os.makedirs(model_dir, exist_ok=True)

# Prepare features and labels
X = data_df.iloc[:, :-6].values  # Assuming the last 6 columns are the labels
y = data_df.iloc[:, -6:].values  # Assuming the last 6 columns are the labels

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'max_depth': 6,
    'min_data_in_leaf': 20,
    'min_gain_to_split': 0.1
}

predictions_dict = {}

# Loop through each label
for label_index, label_name in tqdm(enumerate(data_df.columns[-6:]), desc="Training models", total=6):
    print(f"\nTraining model for label: {label_name}")
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X, label=y[:, label_index])
    
    # Train the LightGBM model
    model = lgb.train(params, train_data, num_boost_round=100)
    
    # Get predictions
    preds = model.predict(X)

    predictions_dict[label_name] = preds
    
    # Check if y_val contains more than one class
    if len(set(y[:, label_index])) > 1:
        # Calculate AUC score
        auc = roc_auc_score(y[:, label_index], preds)
        print(f"AUC for {label_name}: {auc}")
    else:
        print(f"Skipping AUC calculation for {label_name} due to only one class present in y_val.")
    
    # Save the model
    model_filename = os.path.join(model_dir, f'model_{label_name}.txt')
    model.save_model(model_filename)

Training models:   0%|          | 0/6 [00:00<?, ?it/s]


Training model for label: toxic
[LightGBM] [Info] Number of positive: 20824, number of negative: 132340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.176367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.135959 -> initscore=-1.849268
[LightGBM] [Info] Start training from score -1.849268


Training models:  17%|█▋        | 1/6 [00:23<01:57, 23.59s/it]

AUC for toxic: 0.9903701134971092

Training model for label: severe_toxic
[LightGBM] [Info] Number of positive: 0, number of negative: 153164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.170185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


Training models:  33%|███▎      | 2/6 [00:33<01:01, 15.31s/it]

Skipping AUC calculation for severe_toxic due to only one class present in y_val.

Training model for label: obscene
[LightGBM] [Info] Number of positive: 11706, number of negative: 141458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.263327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076428 -> initscore=-2.491901
[LightGBM] [Info] Start training from score -2.491901


Training models:  50%|█████     | 3/6 [00:55<00:54, 18.33s/it]

AUC for obscene: 0.9933093931774737

Training model for label: threat
[LightGBM] [Info] Number of positive: 0, number of negative: 153164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.151471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


Training models:  67%|██████▋   | 4/6 [01:04<00:29, 14.81s/it]

Skipping AUC calculation for threat due to only one class present in y_val.

Training model for label: insult
[LightGBM] [Info] Number of positive: 11630, number of negative: 141534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.273497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075932 -> initscore=-2.498952
[LightGBM] [Info] Start training from score -2.498952


Training models:  83%|████████▎ | 5/6 [01:24<00:16, 16.67s/it]

AUC for insult: 0.9933218681227766

Training model for label: identity_hate
[LightGBM] [Info] Number of positive: 0, number of negative: 153164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.134619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 153164, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


Training models: 100%|██████████| 6/6 [01:33<00:00, 15.59s/it]

Skipping AUC calculation for identity_hate due to only one class present in y_val.





In [8]:
lightgbm_preds_df = pd.DataFrame(predictions_dict)
lightgbm_preds_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.062635,1.000000e-15,0.012078,1.000000e-15,0.006438,1.000000e-15
1,0.000981,1.000000e-15,0.000569,1.000000e-15,0.000567,1.000000e-15
2,0.000981,1.000000e-15,0.000569,1.000000e-15,0.000567,1.000000e-15
3,0.005639,1.000000e-15,0.001452,1.000000e-15,0.001838,1.000000e-15
4,0.001134,1.000000e-15,0.000545,1.000000e-15,0.000679,1.000000e-15
...,...,...,...,...,...,...
153159,0.056554,1.000000e-15,0.013366,1.000000e-15,0.011085,1.000000e-15
153160,0.001931,1.000000e-15,0.000569,1.000000e-15,0.000567,1.000000e-15
153161,0.130744,1.000000e-15,0.020561,1.000000e-15,0.018645,1.000000e-15
153162,0.000980,1.000000e-15,0.000569,1.000000e-15,0.000567,1.000000e-15


In [9]:
roberta_probs_df = pd.read_csv('test_predictions.csv', index_col='id')
roberta_probs_df

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
574bea768ea8e03d,0.749741,0.005284,0.014407,0.142617,0.068731,0.020472
45662fde56604c8f,0.001794,0.000759,0.000983,0.000738,0.001051,0.000936
ccd7cb7391710c75,0.001265,0.000886,0.000954,0.000792,0.001006,0.001025
e3ecf0eafce84d09,0.035507,0.000693,0.004648,0.000968,0.004609,0.001881
145bf896d81baa64,0.001449,0.000833,0.000956,0.000788,0.001040,0.000990
...,...,...,...,...,...,...
321e16ba75f3e401,0.686864,0.001684,0.113581,0.001837,0.249578,0.004101
66d4e7f853319e0c,0.005121,0.000608,0.001183,0.000717,0.001549,0.000949
54deea5a8261fcf4,0.602051,0.002996,0.309188,0.001465,0.332667,0.004991
2080b2eb59ecfe3d,0.002068,0.000730,0.001036,0.000723,0.001110,0.000893


In [10]:
print("RoBERTa index type:", type(roberta_probs_df.index))
print("RoBERTa index head:", roberta_probs_df.index[:5])

print("LightGBM index type:", type(lightgbm_preds_df.index))
print("LightGBM index head:", lightgbm_preds_df.index[:5])


RoBERTa index type: <class 'pandas.core.indexes.base.Index'>
RoBERTa index head: Index(['574bea768ea8e03d', '45662fde56604c8f', 'ccd7cb7391710c75',
       'e3ecf0eafce84d09', '145bf896d81baa64'],
      dtype='object', name='id')
LightGBM index type: <class 'pandas.core.indexes.range.RangeIndex'>
LightGBM index head: RangeIndex(start=0, stop=5, step=1)


In [11]:
lightgbm_preds_df.index = roberta_probs_df.index

In [12]:
print("RoBERTa index type:", type(roberta_probs_df.index))
print("RoBERTa index head:", roberta_probs_df.index[:5])

print("LightGBM index type:", type(lightgbm_preds_df.index))
print("LightGBM index head:", lightgbm_preds_df.index[:5])

RoBERTa index type: <class 'pandas.core.indexes.base.Index'>
RoBERTa index head: Index(['574bea768ea8e03d', '45662fde56604c8f', 'ccd7cb7391710c75',
       'e3ecf0eafce84d09', '145bf896d81baa64'],
      dtype='object', name='id')
LightGBM index type: <class 'pandas.core.indexes.base.Index'>
LightGBM index head: Index(['574bea768ea8e03d', '45662fde56604c8f', 'ccd7cb7391710c75',
       'e3ecf0eafce84d09', '145bf896d81baa64'],
      dtype='object', name='id')


In [13]:
lightgbm_preds_df.to_csv('lightgbm_predictions.csv')

In [14]:
# Make sure the indices match
assert roberta_probs_df.index.equals(lightgbm_preds_df.index), "Indices do not match"


## Combine Predictions from RoBERTa and LightGBM models

### The best for now: 
weight_roberta = 0.9\
weight_lightgbm = 0.1

In [15]:
# Define weights
weight_roberta = 0.9
weight_lightgbm = 0.1

# Normalize weights
total_weight = weight_roberta + weight_lightgbm
weight_roberta /= total_weight
weight_lightgbm /= total_weight

In [16]:
# Define a small value to identify near-zero predictions
small_value = 1e-15

# Create a mask for LightGBM predictions that are essentially zero
lightgbm_mask = lightgbm_preds_df < small_value

In [17]:
# Apply mask to LightGBM predictions
# Replace LightGBM predictions with NaN where they are essentially zero
lightgbm_preds_df_adjusted = lightgbm_preds_df.where(~lightgbm_mask, other=None)

# Combine predictions by weighted average
# For cells with NaN in LightGBM predictions, only use RoBERTa predictions
combined_preds_df = (roberta_probs_df * weight_roberta + 
                      lightgbm_preds_df_adjusted * weight_lightgbm).fillna(roberta_probs_df)
combined_preds_df

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
574bea768ea8e03d,0.681031,0.004755,0.014174,0.128355,0.062502,0.018425
45662fde56604c8f,0.001713,0.000683,0.000941,0.000665,0.001002,0.000843
ccd7cb7391710c75,0.001237,0.000798,0.000916,0.000713,0.000962,0.000922
e3ecf0eafce84d09,0.032520,0.000624,0.004328,0.000871,0.004332,0.001693
145bf896d81baa64,0.001417,0.000749,0.000914,0.000709,0.001004,0.000891
...,...,...,...,...,...,...
321e16ba75f3e401,0.623833,0.001515,0.103560,0.001654,0.225729,0.003691
66d4e7f853319e0c,0.004802,0.000547,0.001121,0.000646,0.001451,0.000854
54deea5a8261fcf4,0.554920,0.002697,0.280325,0.001319,0.301265,0.004492
2080b2eb59ecfe3d,0.001959,0.000657,0.000989,0.000650,0.001056,0.000804


In [18]:
combined_preds_df.to_csv('weighted_combined_predictions.csv')