# Preprocess the data

## Pre settings of the code

In [2]:
# Import all required libraries
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MultiLabelBinarizer

import pandas as pd
import numpy as np
import random

import torch
import warnings

from tqdm import tqdm

from ydata_profiling import ProfileReport

from torch.nn import BCEWithLogitsLoss
from transformers import RobertaTokenizerFast, \
RobertaModel, Trainer, TrainingArguments,EvalPrediction, TrainerCallback

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaClassificationHead
from torch.utils.data import DataLoader

from skmultilearn.model_selection import iterative_train_test_split

%matplotlib inline

In [3]:
# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

## Load the data

In [4]:
df = pd.read_excel('JobLevelData.xlsx')
df

Unnamed: 0,Title,Column 1,Column 2,Column 3,Column 4
0,Vice President / Director of Systems Engineering,Vice President,,,
1,Systems Engineer; Systems Architect,Manager,Individual Contributor/Staff,,
2,"Executive Director, Global IT Infrastructure /...",Director,Chief Officer,,
3,CTO/Executive Director of Technology Services,Director,Chief Officer,,
4,"Vice President, CIO",Vice President,,,
...,...,...,...,...,...
2235,Net Software Architect and Team Project Lead,Manager,,,
2236,Solutions Architect & Technical Lead,Manager,Individual Contributor/Staff,,
2237,"Manager, Salesforcecom Administration and Rele...",Manager,,,
2238,Innovation Automation Architect,Manager,,,


## Analyse the data

In [5]:
report = ProfileReport(df)
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Clean the data - drop the NaN values
df = df.dropna(subset=['Column 1'])

df = df.reset_index(drop=False)
df.rename(columns={'index': 'Id'}, inplace=True)

df

Unnamed: 0,Id,Title,Column 1,Column 2,Column 3,Column 4
0,0,Vice President / Director of Systems Engineering,Vice President,,,
1,1,Systems Engineer; Systems Architect,Manager,Individual Contributor/Staff,,
2,2,"Executive Director, Global IT Infrastructure /...",Director,Chief Officer,,
3,3,CTO/Executive Director of Technology Services,Director,Chief Officer,,
4,4,"Vice President, CIO",Vice President,,,
...,...,...,...,...,...,...
2225,2235,Net Software Architect and Team Project Lead,Manager,,,
2226,2236,Solutions Architect & Technical Lead,Manager,Individual Contributor/Staff,,
2227,2237,"Manager, Salesforcecom Administration and Rele...",Manager,,,
2228,2238,Innovation Automation Architect,Manager,,,


In [7]:
df['Labels'] = df[['Column 1', 'Column 2', 'Column 3', 'Column 4']].values.tolist()

# Remove NaN values from the lists
df['Labels'] = df['Labels'].apply(lambda x: [i for i in x if pd.notna(i)])

# Use MultiLabelBinarizer to encode the labels
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(df['Labels'])

# Create a new dataframe with the encoded labels
labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_)

df_encoded = pd.concat([df[['Id', 'Title']], labels_df], axis=1)

df_encoded

Unnamed: 0,Id,Title,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
0,0,Vice President / Director of Systems Engineering,0,0,0,0,0,1
1,1,Systems Engineer; Systems Architect,0,0,1,1,0,0
2,2,"Executive Director, Global IT Infrastructure /...",1,1,0,0,0,0
3,3,CTO/Executive Director of Technology Services,1,1,0,0,0,0
4,4,"Vice President, CIO",0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
2225,2235,Net Software Architect and Team Project Lead,0,0,0,1,0,0
2226,2236,Solutions Architect & Technical Lead,0,0,1,1,0,0
2227,2237,"Manager, Salesforcecom Administration and Rele...",0,0,0,1,0,0
2228,2238,Innovation Automation Architect,0,0,0,1,0,0


## Find the unique number of column combinations

In [8]:
unique_combinations = df[['Column 1', 'Column 2', 'Column 3', 'Column 4']].drop_duplicates()

unique_combinations['non_nan_count'] = unique_combinations[['Column 1', 'Column 2', 'Column 3', 'Column 4']].notna().sum(axis=1)

unique_combinations_sorted = unique_combinations.sort_values(by=['non_nan_count', 'Column 1', 'Column 2', 'Column 3', 'Column 4'], ascending=[True, True, True, True, True])

unique_combinations_sorted = unique_combinations_sorted.drop(columns=['non_nan_count'])

unique_combinations_sorted['Cluster'] = range(len(unique_combinations))

df_new = df.merge(unique_combinations_sorted, on=['Column 1', 'Column 2', 'Column 3', 'Column 4'], how='left')

unique_combinations_sorted

Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Cluster
5,Chief Officer,,,,0
1004,Director,,,,1
30,Individual Contributor/Staff,,,,2
1812,Manager,,,,3
1217,Owner,,,,4
0,Vice President,,,,5
2,Director,Chief Officer,,,6
2092,Director,Individual Contributor/Staff,,,7
749,Individual Contributor/Staff,Chief Officer,,,8
29,Manager,Chief Officer,,,9


In [9]:
report = ProfileReport(df_new)
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## Train, Dev split

In [10]:
# From tha anylysing the data, we can see that the data is imbalanced, so we will use 'iterative_train_test_split'

texts = df_encoded['Title'].tolist()

labels = df_encoded[['Chief Officer', 'Director', 'Individual Contributor/Staff',
             'Manager', 'Owner', 'Vice President']].values

labels = np.array(labels, dtype=int)

test_size = 0.2
row_ids = np.arange(len(labels))
train_idx, _, test_idx, _ = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=test_size)

train_dataset = df_encoded.iloc[train_idx.flatten()].reset_index(drop=True)
test_dataset = df_encoded.iloc[test_idx.flatten()].reset_index(drop=True)

train_dataset.head()

Unnamed: 0,Id,Title,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
0,2,"Executive Director, Global IT Infrastructure /...",1,1,0,0,0,0
1,4,"Vice President, CIO",0,0,0,0,0,1
2,5,"CIO, Cissp",1,0,0,0,0,0
3,7,Chief Technology Officer / Head Of Marketing,1,0,0,0,0,0
4,9,Chief Data Architect,1,0,0,0,0,0


In [11]:
train_dataset['Labels'] = train_dataset[train_dataset.columns[2:]].values.tolist()
train_dataset = train_dataset[['Id', 'Title', 'Labels']].reset_index(drop=True)
test_dataset['Labels'] = test_dataset[test_dataset.columns[2:]].values.tolist()
test_dataset = test_dataset[['Id', 'Title', 'Labels']].reset_index(drop=True)
test_dataset.to_csv('test_dataset.csv', index=False)
train_dataset

Unnamed: 0,Id,Title,Labels
0,1,Systems Engineer; Systems Architect,"[0, 0, 1, 1, 0, 0]"
1,2,"Executive Director, Global IT Infrastructure /...","[1, 1, 0, 0, 0, 0]"
2,4,"Vice President, CIO","[0, 0, 0, 0, 0, 1]"
3,5,"CIO, Cissp","[1, 0, 0, 0, 0, 0]"
4,7,Chief Technology Officer / Head Of Marketing,"[1, 0, 0, 0, 0, 0]"
...,...,...,...
1779,2235,Net Software Architect and Team Project Lead,"[0, 0, 0, 1, 0, 0]"
1780,2236,Solutions Architect & Technical Lead,"[0, 0, 1, 1, 0, 0]"
1781,2237,"Manager, Salesforcecom Administration and Rele...","[0, 0, 0, 1, 0, 0]"
1782,2238,Innovation Automation Architect,"[0, 0, 0, 1, 0, 0]"


## Handling the Data class

In [12]:
# Instantiate a class that will handle the data
class Data_Processing(object):
    def __init__(self, tokenizer, id_column, text_column, label_column):
        
        # Define the text column from the dataframe
        self.text_column = text_column.tolist()
    
        # Define the label column and transform it to list
        
        self.label_column = label_column
        
        # Define the id column and transform it to list
        self.id_column = id_column.tolist()
        
    
# Iter method to get each element at the time and tokenize it using bert        
    def __getitem__(self, index):
        title = str(self.text_column[index])
        title = " ".join(title.split())
        
        inputs = tokenizer.encode_plus(title,
                                       add_special_tokens = True,
                                       max_length= 512,
                                       padding = 'max_length',
                                       return_attention_mask = True,
                                       truncation = True,
                                       return_tensors='pt')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels_ = torch.tensor(self.label_column[index], dtype=torch.float)
            
        id_ = self.id_column[index]
        return {'input_ids':input_ids[0], 'attention_mask':attention_mask[0], 
                'labels':labels_, 'id_':id_}
  
    def __len__(self):
        return len(self.text_column)

## Tokenization

In [13]:
batch_size = 32
# Create a class to process the traininga and test data
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                          padding = 'max_length',
                                          truncation=True, 
                                          max_length = 512)
training_data = Data_Processing(tokenizer,
                                train_dataset['Id'],
                                train_dataset['Title'], 
                                train_dataset['Labels'])

test_data =  Data_Processing(tokenizer,
                             test_dataset['Id'],
                             test_dataset['Title'], 
                             test_dataset['Labels'])

# Use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=2),
                    'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=2)
                   }

dataset_sizes = {'train':len(training_data),
                 'val':len(test_data)
                }



In [14]:
# Check we are getting the right output

a = next(iter(dataloaders_dict['val']))
a

{'input_ids': tensor([[    0, 36327, 37920,  ...,     1,     1,     1],
         [    0,   104, 12015,  ...,     1,     1,     1],
         [    0, 37142,   270,  ...,     1,     1,     1],
         ...,
         [    0,   243,  1841,  ...,     1,     1,     1],
         [    0, 44426,  4827,  ...,     1,     1,     1],
         [    0, 33867, 41614,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 1., 0., 1.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0.],
         [0.,

# Creating the model

## RoBERTa Custom Class

In [15]:
class RobertaForMultiLabelSequenceClassification(RobertaPreTrainedModel):
    """
    We instantiate a class of RoBERTa adapted for a multilabel classification task. 
    This instance takes the pooled output of the RoBERTa based model and passes it through a
    classification head. We replace the traditional Cross Entropy loss with a BCE loss that generate probabilities
    for all the labels that we feed into the model.
    """

    def __init__(self, config, pos_weight=None):
        super(RobertaForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, global_attention_mask=None, 
                token_type_ids=None, position_ids=None, inputs_embeds=None, 
                labels=None):
        
        # Create global attention on sequence, and a global attention token on the `s` token
        # pass arguments to RoBERTa model
        outputs = self.roberta(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            position_ids = position_ids)
        
        sequence_output = outputs['last_hidden_state']
        
        # Pass the hidden states through the classifier to obtain the logits
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
            labels = labels.float()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.view(-1, self.num_labels))
            outputs = (loss,) + outputs
        
        
        return outputs

## Check the cuda and GPU

In [16]:
print('Torch cuda version: ', torch.version.cuda)
print('Torch cuda is enabled: ', torch.backends.cudnn.enabled)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

torch.set_float32_matmul_precision('high') # for less precision but faster computation, it is good practice to use 'high' precision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Torch cuda version:  11.8
Torch cuda is enabled:  True
Using device: cuda
Device name: NVIDIA GeForce RTX 3060 Laptop GPU


In [17]:
model = RobertaForMultiLabelSequenceClassification.from_pretrained("roberta-base",
                                                                   num_labels = 6,
                                                                   cache_dir='./roberta_model_cache',
                                                                   return_dict=True)
model.to(device)

model

Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForMultiLabelSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

## Metrics: f1, roc_auc, accuracy

In [18]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
    
def multi_label_metric(
    predictions, 
    references, 
    ):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_true = references
    y_pred[np.where(probs >= 0.5)] = 1
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1':f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metric(
        predictions=preds, 
        references=p.label_ids
    )
    return result

## Save best model Callback

In [19]:
class SaveBestModelCallback(TrainerCallback):
    def __init__(self, metric_name="roc_auc"):
        self.best_score = -float('inf')
        self.metric_name = metric_name

    def on_train_begin(self, args, state, control, **kwargs):
        assert args.eval_strategy != "no", "SaveBestModelCallback requires an evaluation strategy of steps or epoch"

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        metric_value = metrics.get(self.metric_name)
        if metric_value is None:
            print(f"Warning: Metric '{self.metric_name}' not found in evaluation metrics.")
            return
        
        if metric_value > self.best_score:
            print(f"** {self.metric_name} improved from {np.round(self.best_score, 4)} to {np.round(metric_value, 4)} **")
            self.best_score = metric_value
            control.should_save = True
        else:
            print(f"{self.metric_name} score {np.round(metric_value, 4)} (Prev. Best {np.round(self.best_score, 4)})")

## Training arguments

In [20]:
# Instantiate the training args class

training_args = TrainingArguments(
    output_dir = './roberta_trainer',
    disable_tqdm = False,
    run_name = 'roberta_multilabel_trainer_jigsaw_eval',
    warmup_steps = 1000,
    eval_strategy = "steps",
    eval_steps=500,
    dataloader_num_workers = 0,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps = 4,
    logging_dir='./roberta_logs',
    fp16 = True, # False for better results, but demands more GPU memory
    per_device_train_batch_size = 32,
    per_device_eval_batch_size= 16,
    gradient_accumulation_steps = 16,
    gradient_checkpointing=True,
    num_train_epochs = 60,
    save_strategy="no",
    save_total_limit=1,
)

## Trainer

In [21]:
# Instantiate the trainer class

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=test_data,
    compute_metrics = compute_metrics,
    #data_collator = Data_Processing(),
    callbacks=[SaveBestModelCallback(metric_name="roc_auc")]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Train

In [22]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


TrainOutput(global_step=180, training_loss=0.4260837627781762, metrics={'train_runtime': 3666.7329, 'train_samples_per_second': 29.209, 'train_steps_per_second': 0.049, 'total_flos': 2.432180976167117e+16, 'train_loss': 0.4260837627781762, 'epoch': 51.42857142857143})

In [23]:
trainer.save_model()

tokenizer.save_pretrained('./roberta_trainer')

('./roberta_trainer/tokenizer_config.json',
 './roberta_trainer/special_tokens_map.json',
 './roberta_trainer/vocab.json',
 './roberta_trainer/merges.txt',
 './roberta_trainer/added_tokens.json',
 './roberta_trainer/tokenizer.json')

In [24]:
trainer.evaluate()



{'eval_loss': 0.1325758844614029,
 'eval_f1': 0.9177489177489178,
 'eval_roc_auc': 0.9392004863961088,
 'eval_accuracy': 0.8921348314606742,
 'eval_runtime': 4.4212,
 'eval_samples_per_second': 100.651,
 'eval_steps_per_second': 6.333,
 'epoch': 51.42857142857143}

Number of labels 6: \
Number of epoch: 4, time:~4min, eval_roc_auc 47512591486696354 \
Number of epoch: 20, time:~20min, eval_roc_auc 0.5 \
Number of epoch: 60, time:~60min, eval_roc_auc 0.9392004863961088