# RobBERT-XMLC model for Explicit and Implicit skill-extraction

This code was created as part of the Master's thesis of Ninande Vermeer: "Using RobBERT for Implicit and Explicit Skill-Extraction from Dutch Job Descriptions". It can be used to reproduce the experiments of the project. The Sample and RO sample data cannot be shared. However, the dataset of Bhola et al. can be found in their official GitHub repository: https://github.com/WING-NUS/JD2Skills-BERT-XMLC.

In order to conduct the different experiments, change the "change this" variables in the Settings section.

## Packages and Modules

In [None]:
# Data and visualization
import pandas as pd
import pickle
import matplotlib 
import matplotlib.pyplot as plt
import boto3
import io

# Math
import numpy as np

# Time and Logging
import time
import logging

# BERT model (simpletransformers will install all required packages)
!pip install --upgrade torch
import torch
!pip install simpletransformers

In [None]:
'''
ADJUSTED MODULES OF SIMPLETRANSFORMERS AND USED PACKAGES
'''

from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import (
    BertModel,
    RobertaModel
)
from transformers.models.bert.modeling_bert import BertPreTrainedModel

from transformers.models.roberta.modeling_roberta import (
    ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)

import os
import random
import warnings
from multiprocessing import cpu_count

from transformers import (
    WEIGHTS_NAME,
    AlbertConfig,
    AlbertTokenizer,
    BertConfig,
    BertTokenizer,
    BertweetTokenizer,
    BigBirdConfig,
    BigBirdTokenizer,
    CamembertConfig,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertTokenizer,
    ElectraConfig,
    ElectraTokenizer,
    FlaubertConfig,
    FlaubertTokenizer,
    LongformerConfig,
    LongformerTokenizer,
    RobertaConfig,
    RobertaTokenizer,
    XLMConfig,
    XLMRobertaConfig,
    XLMRobertaTokenizer,
    XLMTokenizer,
    XLNetConfig,
    XLNetTokenizer,
)

from simpletransformers.classification import ClassificationModel
from simpletransformers.config.global_args import global_args
from simpletransformers.config.model_args import MultiLabelClassificationArgs
from simpletransformers.config.utils import sweep_config_to_sweep_values
from simpletransformers.custom_models.models import (
    AlbertForMultiLabelSequenceClassification,
    BertForMultiLabelSequenceClassification,
    BertweetForMultiLabelSequenceClassification,
    BigBirdForMultiLabelSequenceClassification,
    CamembertForMultiLabelSequenceClassification,
    DistilBertForMultiLabelSequenceClassification,
    ElectraForMultiLabelSequenceClassification,
    FlaubertForMultiLabelSequenceClassification,
    LongformerForMultiLabelSequenceClassification,
    RobertaForMultiLabelSequenceClassification,
    XLMForMultiLabelSequenceClassification,
    XLMRobertaForMultiLabelSequenceClassification,
    XLNetForMultiLabelSequenceClassification,
)

# 2-LAYERED BERT-XMLC
class BertForMultiLabelSequenceClassificationXMLC(BertPreTrainedModel):
    """
    BERT encoder with a two-layer classification head for multi-label
    sequence classification.

    The second encoder output (``outputs[1]``) goes through dropout, a
    hidden-size linear layer, a ReLU and a final linear layer that emits one
    logit per label.
    """

    def __init__(self, config, pos_weight=None):
        """
        Args:
            config: model configuration; ``num_labels``, ``hidden_size`` and
                ``hidden_dropout_prob`` are read from it.
            pos_weight (optional): forwarded unchanged to
                ``BCEWithLogitsLoss(pos_weight=...)`` when a loss is computed.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight

        # Creation order is kept as-is: it fixes the order of weight initialisation.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier_1 = nn.Linear(config.hidden_size, config.num_labels)
        self.relu = nn.ReLU()
        self.init_weights()

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
    ):
        """
        Run the encoder and the two-layer head.

        Returns:
            A tuple ``(loss), logits, (hidden_states), (attentions)``; the
            loss is only present when ``labels`` is given.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )

        # dropout -> linear -> ReLU -> linear
        hidden = self.classifier(self.dropout(encoder_outputs[1]))
        logits = self.classifier_1(self.relu(hidden))

        # Keep whatever the encoder returned after its first two outputs.
        result = (logits,) + encoder_outputs[2:]
        if labels is None:
            return result

        criterion = BCEWithLogitsLoss(pos_weight=self.pos_weight)
        loss = criterion(
            logits.view(-1, self.num_labels),
            labels.float().view(-1, self.num_labels),
        )
        return (loss,) + result

    def unfreeze(self, start_layer, end_layer):
        """
        Freeze the whole BERT encoder, then make the encoder layers
        ``start_layer`` .. ``end_layer`` (both inclusive) trainable again.

        The classification head is not touched.
        """

        def mark(root, flag):
            # Tag every sub-module and switch gradient tracking for all of
            # the parameters below `root`.
            for module in root.modules():
                module.trainable = flag
            for param in root.parameters():
                param.requires_grad = flag

        mark(self.bert, False)
        for layer_idx in range(start_layer, end_layer + 1):
            mark(self.bert.encoder.layer[layer_idx], True)

# 2-LAYERED ROBBERT-XMLC

class RobertaForMultiLabelSequenceClassificationXMLC(BertPreTrainedModel):
    """
    Two-layered Roberta model adapted for multi-label sequence classification.

    The second encoder output (``outputs[1]``) goes through dropout, a
    hidden-size linear layer, a ReLU and a final linear layer that emits one
    logit per label.
    """

    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "roberta"

    def __init__(self, config, pos_weight=None):
        """
        Args:
            config: model configuration; ``num_labels``, ``hidden_size`` and
                ``hidden_dropout_prob`` are read from it.
            pos_weight (optional): forwarded unchanged to
                ``BCEWithLogitsLoss(pos_weight=...)`` when a loss is computed.
        """
        super(RobertaForMultiLabelSequenceClassificationXMLC, self).__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier_1 = nn.Linear(config.hidden_size, config.num_labels)
        self.relu = nn.ReLU()
        self.init_weights()

    def forward(
            self,
            input_ids,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            labels=None,
        ):
        """
        Run the encoder and the two-layer head.

        Returns:
            A tuple ``(loss), logits, (hidden_states), (attentions)``; the
            loss is only present when ``labels`` is given.
        """
        # position_ids used to be accepted but silently dropped; it is now
        # forwarded (the default of None keeps the previous behaviour).
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        logits = self.relu(logits)
        logits = self.classifier_1(logits)

        outputs = (logits,) + outputs[2:]  # keep the remaining encoder outputs
        if labels is not None:
            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
            labels = labels.float()
            loss = loss_fct(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            outputs = (loss,) + outputs

        return outputs

    def unfreeze(self, start_layer, end_layer):
        """
        Freeze the whole Roberta encoder, then make the encoder layers
        ``start_layer`` .. ``end_layer`` (both inclusive) trainable again.

        The classification head is not touched.
        """

        def set_trainable(root, flag):
            # Tag every sub-module and switch gradient tracking for all of
            # the parameters below `root`.
            for module in root.modules():
                module.trainable = flag
            for param in root.parameters():
                param.requires_grad = flag

        # The encoder is stored as `self.roberta`; this class has no
        # `self.bert` attribute, so the previous code raised AttributeError.
        set_trainable(self.roberta, False)
        for i in range(start_layer, end_layer + 1):
            set_trainable(self.roberta.encoder.layer[i], True)
            
# 1-LAYERED ROBBERT-XMLC

class RobertaForMultiLabelSequenceClassificationXMLC1(BertPreTrainedModel):
    """
    Single-layered Roberta model adapted for multi-label sequence classification.

    The second encoder output (``outputs[1]``) goes through dropout and one
    linear layer that emits one logit per label.
    """

    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "roberta"

    def __init__(self, config, pos_weight=None):
        """
        Args:
            config: model configuration; ``num_labels``, ``hidden_size`` and
                ``hidden_dropout_prob`` are read from it.
            pos_weight (optional): forwarded unchanged to
                ``BCEWithLogitsLoss(pos_weight=...)`` when a loss is computed.
        """
        super(RobertaForMultiLabelSequenceClassificationXMLC1, self).__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(
            self,
            input_ids,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            labels=None,
        ):
        """
        Run the encoder and the single-layer head.

        Returns:
            A tuple ``(loss), logits, (hidden_states), (attentions)``; the
            loss is only present when ``labels`` is given.
        """
        # position_ids used to be accepted but silently dropped; it is now
        # forwarded (the default of None keeps the previous behaviour).
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # keep the remaining encoder outputs
        if labels is not None:
            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
            labels = labels.float()
            loss = loss_fct(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            outputs = (loss,) + outputs

        return outputs

    def unfreeze(self, start_layer, end_layer):
        """
        Freeze the whole Roberta encoder, then make the encoder layers
        ``start_layer`` .. ``end_layer`` (both inclusive) trainable again.

        The classification head is not touched.
        """

        def set_trainable(root, flag):
            # Tag every sub-module and switch gradient tracking for all of
            # the parameters below `root`.
            for module in root.modules():
                module.trainable = flag
            for param in root.parameters():
                param.requires_grad = flag

        # The encoder is stored as `self.roberta`; this class has no
        # `self.bert` attribute, so the previous code raised AttributeError.
        set_trainable(self.roberta, False)
        for i in range(start_layer, end_layer + 1):
            set_trainable(self.roberta.encoder.layer[i], True)
            
# 1-LAYERED ROBBERT-XMLC without dropout

class RobertaForMultiLabelSequenceClassificationXMLCBasic(BertPreTrainedModel):
    """
    Single-layered Roberta model without dropout layer, adapted for multi-label sequence classification.

    The second encoder output (``outputs[1]``) is fed directly into one
    linear layer that emits one logit per label.
    """

    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "roberta"

    def __init__(self, config, pos_weight=None):
        """
        Args:
            config: model configuration; ``num_labels`` and ``hidden_size``
                are read from it.
            pos_weight (optional): forwarded unchanged to
                ``BCEWithLogitsLoss(pos_weight=...)`` when a loss is computed.
        """
        super(RobertaForMultiLabelSequenceClassificationXMLCBasic, self).__init__(config)
        self.num_labels = config.num_labels
        self.pos_weight = pos_weight

        self.roberta = RobertaModel(config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(
            self,
            input_ids,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            labels=None,
        ):
        """
        Run the encoder and the linear head.

        Returns:
            A tuple ``(loss), logits, (hidden_states), (attentions)``; the
            loss is only present when ``labels`` is given.
        """
        # position_ids used to be accepted but silently dropped; it is now
        # forwarded (the default of None keeps the previous behaviour).
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        pooled_output = outputs[1]
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # keep the remaining encoder outputs
        if labels is not None:
            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
            labels = labels.float()
            loss = loss_fct(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            outputs = (loss,) + outputs

        return outputs

    def unfreeze(self, start_layer, end_layer):
        """
        Freeze the whole Roberta encoder, then make the encoder layers
        ``start_layer`` .. ``end_layer`` (both inclusive) trainable again.

        The classification head is not touched.
        """

        def set_trainable(root, flag):
            # Tag every sub-module and switch gradient tracking for all of
            # the parameters below `root`.
            for module in root.modules():
                module.trainable = flag
            for param in root.parameters():
                param.requires_grad = flag

        # The encoder is stored as `self.roberta`; this class has no
        # `self.bert` attribute, so the previous code raised AttributeError.
        set_trainable(self.roberta, False)
        for i in range(start_layer, end_layer + 1):
            set_trainable(self.roberta.encoder.layer[i], True)
            
#### Modified classification model
            
class MultiLabelClassificationModel(ClassificationModel):
    """
    Multi-label variant of ``ClassificationModel`` that additionally knows the
    XMLC model types defined in this file (``bert_xmlc``, ``roberta_xmlc``,
    ``roberta_xmlc1`` and ``roberta_xmlc_basic``).

    Every public method simply delegates to the parent implementation with
    ``multi_label=True`` as the default.
    """

    def __init__(
        self,
        model_type,
        model_name,
        num_labels=None,
        pos_weight=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a MultiLabelClassification model.
        Args:
            model_type: The type of model; must be one of the keys of MODEL_CLASSES below (e.g. bert, roberta, bert_xmlc, roberta_xmlc, roberta_xmlc1, roberta_xmlc_basic).
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            num_labels (optional): The number of labels or classes in the dataset.
            pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        Raises:
            KeyError: if model_type is not a key of MODEL_CLASSES.
            ValueError: if use_cuda is True but CUDA is unavailable.
        """  # noqa: ignore flake8"

        # model_type -> (config class, model class, tokenizer class)
        MODEL_CLASSES = {
            "albert": (
                AlbertConfig,
                AlbertForMultiLabelSequenceClassification,
                AlbertTokenizer,
            ),
            "bert": (
                BertConfig,
                BertForMultiLabelSequenceClassification,
                BertTokenizer,
            ),
            "bert_xmlc": (
                BertConfig,
                BertForMultiLabelSequenceClassificationXMLC,
                BertTokenizer,
            ),
            "bertweet": (
                RobertaConfig,
                BertweetForMultiLabelSequenceClassification,
                BertweetTokenizer,
            ),
            "bigbird": (
                BigBirdConfig,
                BigBirdForMultiLabelSequenceClassification,
                BigBirdTokenizer,
            ),
            "camembert": (
                CamembertConfig,
                CamembertForMultiLabelSequenceClassification,
                CamembertTokenizer,
            ),
            "distilbert": (
                DistilBertConfig,
                DistilBertForMultiLabelSequenceClassification,
                DistilBertTokenizer,
            ),
            "electra": (
                ElectraConfig,
                ElectraForMultiLabelSequenceClassification,
                ElectraTokenizer,
            ),
            "flaubert": (
                FlaubertConfig,
                FlaubertForMultiLabelSequenceClassification,
                FlaubertTokenizer,
            ),
            "longformer": (
                LongformerConfig,
                LongformerForMultiLabelSequenceClassification,
                LongformerTokenizer,
            ),
            "roberta": (
                RobertaConfig,
                RobertaForMultiLabelSequenceClassification,
                RobertaTokenizer,
            ),
            "roberta_xmlc": (
                RobertaConfig,
                RobertaForMultiLabelSequenceClassificationXMLC,
                RobertaTokenizer,
            ),
            "roberta_xmlc1": (
                RobertaConfig,
                RobertaForMultiLabelSequenceClassificationXMLC1,
                RobertaTokenizer,
            ),
            'roberta_xmlc_basic':(
                RobertaConfig,
                RobertaForMultiLabelSequenceClassificationXMLCBasic,
                RobertaTokenizer,
            ),
            "xlm": (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
            "xlmroberta": (
                XLMRobertaConfig,
                XLMRobertaForMultiLabelSequenceClassification,
                XLMRobertaTokenizer,
            ),
            "xlnet": (
                XLNetConfig,
                XLNetForMultiLabelSequenceClassification,
                XLNetTokenizer,
            ),
        }

        # Start from the args stored with the model (if any), then apply the
        # caller's overrides.
        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, MultiLabelClassificationArgs):
            self.args = args

        if self.args.thread_count:
            torch.set_num_threads(self.args.thread_count)

        if "sweep_config" in kwargs:
            self.is_sweeping = True
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)
        else:
            self.is_sweeping = False

        # Seed every random number generator used during training.
        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if not use_cuda:
            self.args.fp16 = False

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        if num_labels:
            self.config = config_class.from_pretrained(
                model_name, num_labels=num_labels, **self.args.config
            )
            self.num_labels = num_labels
        else:
            self.config = config_class.from_pretrained(model_name, **self.args.config)
            self.num_labels = self.config.num_labels
        self.pos_weight = pos_weight

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    " Make sure CUDA is available or set use_cuda=False."
                )
        else:
            self.device = "cpu"

        if not self.args.quantized_model:
            if self.pos_weight:
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    pos_weight=torch.Tensor(self.pos_weight).to(self.device),
                    **kwargs,
                )
            else:
                self.model = model_class.from_pretrained(
                    model_name, config=self.config, **kwargs
                )
        else:
            quantized_weights = torch.load(
                os.path.join(model_name, "pytorch_model.bin")
            )
            if self.pos_weight:
                # The XMLC model constructors in this file take `pos_weight`,
                # not `weight`; this now matches the non-quantized branch.
                self.model = model_class.from_pretrained(
                    None,
                    config=self.config,
                    state_dict=quantized_weights,
                    pos_weight=torch.Tensor(self.pos_weight).to(self.device),
                )
            else:
                self.model = model_class.from_pretrained(
                    None, config=self.config, state_dict=quantized_weights
                )

        if self.args.dynamic_quantize:
            self.model = torch.quantization.quantize_dynamic(
                self.model, {torch.nn.Linear}, dtype=torch.qint8
            )
        if self.args.quantized_model:
            self.model.load_state_dict(quantized_weights)
        if self.args.dynamic_quantize:
            self.args.quantized_model = True

        self.results = {}

        self.tokenizer = tokenizer_class.from_pretrained(
            model_name, do_lower_case=self.args.do_lower_case, **kwargs
        )

        if self.args.special_tokens_list:
            self.tokenizer.add_tokens(
                self.args.special_tokens_list, special_tokens=True
            )
            self.model.resize_token_embeddings(len(self.tokenizer))

        self.args.model_name = model_name
        self.args.model_type = model_type

        if self.args.wandb_project:
            # `wandb_available` is not defined among this file's visible
            # imports, so referencing it raised NameError. Probe for the
            # package locally instead.
            try:
                import wandb  # noqa: F401
            except ImportError:
                warnings.warn(
                    "wandb_project specified but wandb is not available. Wandb disabled."
                )
                self.args.wandb_project = None

        self.weight = None  # Not implemented for multilabel

    def _load_model_args(self, input_dir):
        """Create default multi-label args and load any saved args from ``input_dir``."""
        args = MultiLabelClassificationArgs()
        args.load(input_dir)
        return args

    def train_model(
        self,
        train_df,
        multi_label=True,
        eval_df=None,
        output_dir=None,
        show_running_loss=True,
        args=None,
        verbose=True,
        **kwargs,
    ):
        """Delegate to the parent ``train_model`` with multi-label defaults."""
        return super().train_model(
            train_df,
            multi_label=multi_label,
            eval_df=eval_df,
            output_dir=output_dir,
            show_running_loss=show_running_loss,
            # Pass on the caller's choice; this used to be hard-coded to True.
            verbose=verbose,
            args=args,
            **kwargs,
        )

    def eval_model(
        self,
        eval_df,
        multi_label=True,
        output_dir=None,
        verbose=False,
        silent=False,
        **kwargs,
    ):
        """Delegate to the parent ``eval_model`` with multi-label defaults."""
        return super().eval_model(
            eval_df,
            output_dir=output_dir,
            multi_label=multi_label,
            verbose=verbose,
            silent=silent,
            **kwargs,
        )

    def evaluate(
        self,
        eval_df,
        output_dir,
        multi_label=True,
        prefix="",
        verbose=True,
        silent=False,
        **kwargs,
    ):
        """Delegate to the parent ``evaluate`` with multi-label defaults."""
        return super().evaluate(
            eval_df,
            output_dir,
            multi_label=multi_label,
            prefix=prefix,
            verbose=verbose,
            silent=silent,
            **kwargs,
        )

    def load_and_cache_examples(
        self,
        examples,
        evaluate=False,
        no_cache=False,
        multi_label=True,
        verbose=True,
        silent=False,
    ):
        """Delegate to the parent ``load_and_cache_examples`` with multi-label defaults."""
        return super().load_and_cache_examples(
            examples,
            evaluate=evaluate,
            no_cache=no_cache,
            multi_label=multi_label,
            verbose=verbose,
            silent=silent,
        )

    def compute_metrics(
        self, preds, model_outputs, labels, eval_examples, multi_label=True, **kwargs
    ):
        """Delegate to the parent ``compute_metrics`` with multi-label defaults."""
        return super().compute_metrics(
            preds,
            model_outputs,
            labels,
            eval_examples,
            multi_label=multi_label,
            **kwargs,
        )

    def predict(self, to_predict, multi_label=True):
        """Delegate to the parent ``predict`` with multi-label defaults."""
        return super().predict(to_predict, multi_label=multi_label)

In [None]:
# Keep the "transformers" logger at WARNING while everything else logs at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

## Settings

Set the following settings to conduct experiments in the report:

| Experiment Number | MAX_TOKEN_COUNT | WITH_IMPLICIT_SKILLS | EXPERIMENT | MODEL_COMPLEXITY |
| --- | --- | --- | --- | --- | 
| 1 | 512 | True | sample20000 | advanced2 |
| 2 | 256 | True | newsample20000 | advanced2 |
| 3 | 256 | True | newsample20000 | advanced |
| 4 | 256 | True | newsample20000 | basic |
| 5 | 256 | False | newsample20000 | advanced2 |
| 6 | 256 | True | newsample20000 | advanced2Multilingual |
| 7 | 256 | - | bhola | advanced2Multilingual |
| 8 | 256 | - | bhola | advanced2 |

In [None]:
# MODEL
MAX_TOKEN_COUNT = 256 # Change this
EPOCHS = 5
BATCH_SIZE = 4
WARMUP_PROPORTION = 0.1
WEIGHT_DECAY = 0.01
LEARNING_RATE = 1e-4
MANUAL_SEED = 42

TRAIN_MODEL = True # Change this
WITH_IMPLICIT_SKILLS = True # Change this

# Label column to read and the tag used in output names.
COL_SKILLS, IMPLICITNESS = (
    ("totalskills", "InclImplicit")
    if WITH_IMPLICIT_SKILLS
    else ("skills", "ExclImplicit")
)

# Only a non-default epoch count shows up in the output names.
N_EPOCHS = '' if EPOCHS == 5 else '_Epochs' + str(EPOCHS)

# FILES
s3 = boto3.client('s3')
BUCKET = 'NAME_OF_BUCKET' # Change this
PATH_TO_FILE = 'PATH_TO_FILE' # Change this
EXPERIMENT = 'bhola' # Change this

MODEL_COMPLEXITY = 'advanced2' # Change this

if EXPERIMENT == 'bhola':
    FILE_TO_READ_TRAIN, FILE_TO_READ_VALID, FILE_TO_READ_TEST = (
        PATH_TO_FILE + 'bhola_dataset.train.pkl',
        PATH_TO_FILE + 'bhola_dataset.valid.pkl',
        PATH_TO_FILE + 'bhola_dataset.test.pkl',
    )
    # The implicit/explicit tag is left out of the output names for this dataset.
    IMPLICITNESS = ''
    # MODEL_COMPLEXITY -> (MODEL_TYPE, MODEL_NAME); anything else falls back to plain BERT.
    MODEL_TYPE, MODEL_NAME = {
        'advanced2Multilingual': ("bert_xmlc", "bert-base-multilingual-uncased"),
        'advanced2': ("bert_xmlc", "bert-base-uncased"),
    }.get(MODEL_COMPLEXITY, ("bert", "bert-base-uncased"))
else:
    _csv_prefix = PATH_TO_FILE + 'RobBERT_' + EXPERIMENT
    FILE_TO_READ_TRAIN = _csv_prefix + '_train.csv'
    FILE_TO_READ_VALID = _csv_prefix + '_valid.csv'
    FILE_TO_READ_TEST = _csv_prefix + '_test.csv'
    # MODEL_COMPLEXITY -> (MODEL_TYPE, MODEL_NAME); anything else falls back to plain RobBERT.
    MODEL_TYPE, MODEL_NAME = {
        'advanced2Multilingual': ("bert_xmlc", "bert-base-multilingual-uncased"),
        'advanced2': ("roberta_xmlc", "pdelobelle/robbert-v2-dutch-base"),
        'advanced': ("roberta_xmlc1", "pdelobelle/robbert-v2-dutch-base"),
        'basic': ("roberta_xmlc_basic", "pdelobelle/robbert-v2-dutch-base"),
    }.get(MODEL_COMPLEXITY, ("roberta", "pdelobelle/robbert-v2-dutch-base"))

# Shared stem of every output name for this run.
_run_tag = EXPERIMENT + MODEL_COMPLEXITY + IMPLICITNESS + N_EPOCHS
BEST_MODEL_DIR = 'outputs/model_' + _run_tag
OUTPUT_RESULTS_OVERVIEW = _run_tag + '_results_overview.tsv'
OUTPUT_RESULTS = _run_tag + '_results.tsv'

## The Dataset

### Import the data from the s3 bucket

In [None]:
# Download the raw bytes of the train / validation / test files from S3.
_payloads = []
for _key in (FILE_TO_READ_TRAIN, FILE_TO_READ_VALID, FILE_TO_READ_TEST):
    obj = s3.get_object(Bucket=BUCKET, Key=_key)
    _payloads.append(obj['Body'].read())

if EXPERIMENT == 'bhola':
    # NOTE(review): pickle.loads can execute arbitrary code while loading;
    # only point BUCKET / PATH_TO_FILE at data you trust.
    train_df, valid_df, test_df = (pickle.loads(_blob) for _blob in _payloads)

    print(len(train_df), len(valid_df), len(test_df))
else:
    train_df, valid_df, test_df = (
        pd.read_csv(io.BytesIO(_blob)) for _blob in _payloads
    )

    print(train_df.shape, valid_df.shape, test_df.shape)

# The raw bytes are no longer needed once parsed.
del _payloads

### Get complete dataset

In [None]:
if EXPERIMENT != 'bhola':
    # Stack the three splits into one frame with a fresh 0..n-1 index.
    _all_splits = [train_df, valid_df, test_df]
    dataset_df = pd.concat(_all_splits, ignore_index=True)
    print(dataset_df.shape)

### Preprocess the skills and dataset

In [None]:
##### SKILLS ##### 
def clean_skills(skills):
    skills = skills[1:-1]
    skills = list(skills.split(", "))
    return [skill[1:-1] for skill in skills]

def get_skills(dataset):
    """Return the set of all skill names found in the COL_SKILLS column of `dataset`."""
    return {
        skill
        for raw_skills in dataset[COL_SKILLS]
        for skill in clean_skills(raw_skills)
    }

def skills_set_to_dict(skills_set):
    """
    Map every skill in `skills_set` to 0.

    The keys are inserted in sorted order. Iterating a set of strings can give
    a different order in every interpreter session, and the key order of this
    dict is what fixes the position of each label in the label vectors built
    from it. Sorting makes that layout reproducible across runs.

    Args:
        skills_set: iterable of skill names.

    Returns:
        dict with one ``skill: 0`` entry per skill, keys in sorted order.
    """
    return dict.fromkeys(sorted(skills_set), 0)

def get_skills_dict(dataset):
    """Collect every skill in `dataset` and return them as a ``skill: 0`` dict."""
    return skills_set_to_dict(get_skills(dataset))

##### VACANCIES ##### 
def process_dataset_to_columns(df, skills_dict):
    """
    Build a two-column frame ("text", "labels") from `df`.

    "text" is the "job_description" value of each row. "labels" is a list
    with one entry per key of `skills_dict` (in the dict's key order), set to
    1 for every skill listed in the row's COL_SKILLS value that is a key of
    `skills_dict`; all other entries keep the value stored in `skills_dict`.
    """
    rows = []
    for text, raw_skills in zip(df["job_description"], df[COL_SKILLS]):
        # Fresh copy per row so rows do not share label state.
        flags = dict(skills_dict)
        for name in clean_skills(raw_skills):
            if name in flags:
                flags[name] = 1
        rows.append([text, list(flags.values())])

    return pd.DataFrame(rows, columns=["text", "labels"])

In [None]:
def SkillsExtractionDataset(total_df, train_df, val_df):
    ''' Collects all input information for the BERT model.

    The label vocabulary is taken from `total_df`; `train_df` and `val_df`
    are converted to ("text", "labels") frames against that vocabulary.
    Returns a dict with the keys num_of_labels, skills_dict, train_col_df
    and val_col_df.
    '''
    # Label vocabulary, without the empty-string entry if there is one
    label_vocab = get_skills_dict(total_df)
    label_vocab.pop('', None)

    label_count = len(label_vocab)

    # Training split
    started = time.time()
    print(f"Start with creating the training dataset at {time.ctime()}.")
    train_columns = process_dataset_to_columns(train_df, label_vocab)
    print(f"Creating the training dataset cost {(time.time() - started)/60} minutes.")

    # Validation split
    started = time.time()
    print(f"Start with creating the validation dataset at {time.ctime()}.")
    val_columns = process_dataset_to_columns(val_df, label_vocab)
    print(f"Creating the validation dataset cost {(time.time() - started)/60} minutes.")

    return {
        "num_of_labels": label_count,
        "skills_dict": label_vocab,
        "train_col_df": train_columns,
        "val_col_df": val_columns,
    }

def data_process_bhola(dataset):
    ''' Converts the bhola et al dataset into a text/labels DataFrame.

    Every item of the dataset is read by position: index 0 becomes the
    "text" column and index 1 becomes the "labels" column.
    '''
    texts = []
    labels = []
    for idx in range(len(dataset)):
        item = dataset[idx]
        texts.append(item[0])
        labels.append(item[1])

    return pd.DataFrame({"text": texts, "labels": labels})

# Build the model input (text/labels DataFrames and the number of labels)
# for the selected experiment.
if EXPERIMENT == 'bhola':
    # This data is only reshaped into a text/labels DataFrame
    train_col_df = data_process_bhola(train_df)
    val_col_df = data_process_bhola(valid_df)
    # Label count is hard-coded here (presumably the size of the Bhola et al. label set - TODO confirm)
    NUM_OF_LABELS = 2548
else:
    # The label space is derived from dataset_df; the splits are encoded against it
    dataset_dict = SkillsExtractionDataset(dataset_df, train_df, valid_df)
    print(dataset_dict.keys())
    NUM_OF_LABELS = dataset_dict['num_of_labels']
    train_col_df = dataset_dict['train_col_df']
    val_col_df = dataset_dict['val_col_df']

In [None]:
# Show the number of labels that will be passed to the model as num_labels
print(NUM_OF_LABELS)

## Training the model 

### Arguments

In [None]:
# Training configuration that is handed to the MultiLabelClassificationModel
model_args = MultiLabelClassificationArgs()
# Together with custom_parameter_groups below this is expected to restrict
# training to the listed classifier parameters - confirm against the
# simpletransformers documentation
model_args.train_custom_parameters_only = True
model_args.reprocess_input_data = True
model_args.evaluate_during_training = True
model_args.max_seq_length = MAX_TOKEN_COUNT
model_args.fp16 = False
model_args.num_train_epochs = EPOCHS
model_args.overwrite_output_dir = True
model_args.best_model_dir = BEST_MODEL_DIR
model_args.eval_batch_size = BATCH_SIZE
model_args.train_batch_size = BATCH_SIZE
model_args.manual_seed = MANUAL_SEED
model_args.warmup_ratio = WARMUP_PROPORTION
# Do not write intermediate checkpoints
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False

if MODEL_COMPLEXITY == 'advanced2Multilingual':
    model_args.do_lower_case = True

# The 'advanced2' variants get parameter groups for a second classifier
# layer ("classifier_1") next to the default "classifier" layer
classifier_layers = ["classifier"]
if MODEL_COMPLEXITY in ('advanced2Multilingual', 'advanced2'):
    classifier_layers.append("classifier_1")

# Per layer: one group for the weights and one group for the bias; only the
# bias group sets an explicit weight decay
model_args.custom_parameter_groups = []
for layer in classifier_layers:
    model_args.custom_parameter_groups.append(
        {
            "params": [layer + ".weight"],
            "lr": LEARNING_RATE,
        }
    )
    model_args.custom_parameter_groups.append(
        {
            "params": [layer + ".bias"],
            "lr": LEARNING_RATE,
            "weight_decay": WEIGHT_DECAY,
        }
    )

In [None]:
# Show the complete configuration for inspection before the model is created
print(model_args)

### Create the model

In [None]:
# Create a fresh model from MODEL_NAME; only done when a training run is requested
if TRAIN_MODEL:
    # Check whether a CUDA is available
    cuda_available = torch.cuda.is_available()

    # The model gets one output per label and the arguments configured above
    model = MultiLabelClassificationModel(
        MODEL_TYPE,
        MODEL_NAME,
        num_labels=NUM_OF_LABELS,
        use_cuda=cuda_available,
        args=model_args
    )

### Training and validating

In [None]:
# Train the model on the training set, using the validation set as eval_df
if TRAIN_MODEL:
    start_time_train = time.time()
    # Print a readable timestamp (like the dataset-creation messages) instead of raw epoch seconds
    print("Start of training:", time.ctime(start_time_train))

    # Train the model
    model.train_model(train_df = train_col_df, eval_df = val_col_df)

    print("Training took", (time.time()-start_time_train)/60, "minutes.")

In [None]:
# Evaluate the trained model on the validation set
if TRAIN_MODEL:
    start_time_val = time.time()
    # Print a readable timestamp (like the dataset-creation messages) instead of raw epoch seconds
    print("Start of validation:", time.ctime(start_time_val))

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(val_col_df)

    print("Validation took", (time.time()-start_time_val)/60, "minutes.")

In [None]:
# Show the outcome of the validation run (only available after training in this session)
if TRAIN_MODEL:
    print(result)
    print(model_outputs)
    # Only the number of wrong predictions is printed, not the items themselves
    print(len(wrong_predictions))

### Testing

In [None]:
# Load model if model is not yet loaded
# (no training in this session: the model is read from BEST_MODEL_DIR)
if not TRAIN_MODEL:
    model = MultiLabelClassificationModel(
        MODEL_TYPE, 
        BEST_MODEL_DIR,
        num_labels=NUM_OF_LABELS,
        # NOTE(review): the loaded model always runs without CUDA here - confirm this is intended
        use_cuda=False,
        args=model_args
    )

In [None]:
def get_ranked_df(i, test_df):
    ''' Ranks all skills for test item i by the model output.

    Returns a DataFrame with one row per skill and the columns
    "true labels", "pred labels", "pred probs" and "skill", sorted on
    "pred probs" from high to low.
    '''
    # Test item i
    item = test_df.iloc[i]
    description = item["job_description"]
    raw_skills = item[COL_SKILLS]

    # Model output for this single job description
    predictions, probabilities = model.predict([description])

    # True labels: start from the label template and mark the skills of this item
    label_by_skill = dataset_dict['skills_dict'].copy()
    for skill in clean_skills(raw_skills):
        if skill in label_by_skill:
            label_by_skill[skill] = 1

    item_df = pd.DataFrame(
        {
            "true labels": list(label_by_skill.values()),
            "pred labels": predictions[0],
            "pred probs": probabilities[0],
            "skill": list(label_by_skill),
        }
    )

    return item_df.sort_values("pred probs", ascending=False)

def get_ranked_df_bhola(i, test_df):
    ''' Ranks all labels for item i of the bhola et al test set by the model output.

    Every item of test_df holds the job description at position 0 and the
    true labels at position 1.

    Returns a DataFrame with one row per label and the columns
    "true labels", "pred labels" and "pred probs", sorted on "pred probs"
    from high to low.
    '''
    # Read item i directly. Converting the complete test set to a DataFrame
    # on every call made looping over all test items quadratic.
    item = test_df[i]
    text = item[0]
    true_labels = item[1]

    # Get the predicted labels and predicted probabilities
    pred_labels, pred_probs = model.predict([text])

    # Create DataFrame for this test item
    test_item_df = pd.DataFrame(
        {
            "true labels": true_labels,
            "pred labels": pred_labels[0],
            "pred probs": pred_probs[0],
        }
    )

    return test_item_df.sort_values("pred probs", ascending=False)

# Test if it worked
# (ranks the labels of the first test item with the function of the selected experiment)
if EXPERIMENT == 'bhola':
    ranked_test_item_df = get_ranked_df_bhola(0, test_df)
else:
    ranked_test_item_df = get_ranked_df(0, test_df)
    
print(ranked_test_item_df)

In [None]:
def get_retrieved_skills(M, ranked_df):
    ''' Returns the names of the first M skills of the ranked DataFrame as a list. '''
    top_skills = ranked_df['skill'].iloc[:M]
    return top_skills.tolist()

In [None]:
# Cut-off values m for Recall@m and nDCG@m
_METRIC_CUTOFFS = (5, 10, 30, 50, 100)


def _ranking_metrics(true_labels, cutoffs=_METRIC_CUTOFFS, epsilon=1.0e-4):
    ''' Calculates RR, Recall@m and nDCG@m for one ranked list of true labels.

    true_labels holds the true label (1 = relevant) of every skill in
    ranked order, best ranked skill first. epsilon is added to the
    denominator of the recall to avoid a zero-division.

    Returns a dict with the keys "rr", "recall@<m>" and "ndcg@<m>" for every
    m in cutoffs. The nDCG values are NaN when no skill is relevant, which
    lets callers recognise (and skip) such items.
    '''
    relevant = np.asarray(true_labels) == 1
    # Ranking positions (starting at 0) of the relevant skills
    hit_positions = np.flatnonzero(relevant)
    n_relevant = len(hit_positions)

    metrics = {}

    # RR: reciprocal of the (1-based) rank of the first relevant skill
    rr = 0
    if n_relevant:
        rr = 1 / float(hit_positions[0] + 1)
    metrics['rr'] = rr

    # Ideal DCG: all relevant skills on top of the ranking.
    # +2 instead of +1 since the ranking starts with 0.
    # It is computed once over all relevant skills and not truncated at m,
    # so nDCG@m stays below 1 when there are more than m relevant skills.
    idcg = np.sum([1.0 / np.log2(rank + 2) for rank in range(n_relevant)])

    for m in cutoffs:
        # Relevant skills within the first m positions; this also works
        # when the ranking holds fewer than m skills
        top_hits = hit_positions[hit_positions < m]

        metrics["recall@" + str(m)] = len(top_hits) / (n_relevant + epsilon)

        dcg = 0
        for position in top_hits:
            dcg = dcg + 1.0 / np.log2(position + 2)
        metrics["ndcg@" + str(m)] = dcg / idcg if n_relevant else float('nan')

    return metrics


def get_metrics_values(ranked_df):
    ''' Calculates the metrics of one ranked test item and collects its retrieved skills.

    ranked_df needs the columns "true labels" and "skill", ordered from the
    best to the worst ranked skill.

    Returns (metric_dict, skills_list): the dict of _ranking_metrics and,
    per cut-off m, the list of the first m skills.
    '''
    metric_dict = _ranking_metrics(ranked_df['true labels'].to_numpy())
    skills_list = [get_retrieved_skills(m, ranked_df) for m in _METRIC_CUTOFFS]

    return metric_dict, skills_list


def get_metrics_values_bhola(ranked_df):
    ''' Calculates the metrics of one ranked test item of the bhola et al dataset.

    ranked_df needs the column "true labels", ordered from the best to the
    worst ranked label. Returns the dict of _ranking_metrics.
    '''
    return _ranking_metrics(ranked_df['true labels'].to_numpy())

# Try the metric calculation on a single test item
if EXPERIMENT == 'bhola':
    # Uses the ranking that was created for the first test item above
    metric_dict = get_metrics_values_bhola(ranked_test_item_df)
    print(metric_dict)
else:
    # Note: this ranks test item 5, not the first test item
    metric_dict, skills_list = get_metrics_values(get_ranked_df(5, test_df))
    print(metric_dict)
    print(skills_list)

In [None]:
def get_metric_overview(test_df):
    ''' Calculates the metrics and retrieved skills of every test item.

    Returns (metric_lists, skills_all_items). metric_lists holds one list
    per metric with a value for every test item, in the order recall@5,
    recall@10, recall@30, recall@50, recall@100, ndcg@5, ndcg@10, ndcg@30,
    ndcg@50, ndcg@100, rr. skills_all_items holds the retrieved skills of
    every test item.
    '''
    metric_keys = [
        "recall@5", "recall@10", "recall@30", "recall@50", "recall@100",
        "ndcg@5", "ndcg@10", "ndcg@30", "ndcg@50", "ndcg@100",
        "rr",
    ]
    values_per_metric = {key: [] for key in metric_keys}
    skills_all_items = []

    for item_idx in range(len(test_df)):
        item_metrics, item_skills = get_metrics_values(get_ranked_df(item_idx, test_df))

        for key in metric_keys:
            values_per_metric[key].append(item_metrics[key])
        skills_all_items.append(item_skills)

    return [values_per_metric[key] for key in metric_keys], skills_all_items

def get_metric_overview_bhola(test_df):
    ''' Calculates the metrics of every test item of the bhola et al dataset.

    Returns one list per metric with a value for every test item, in the
    order recall@5, recall@10, recall@30, recall@50, recall@100, ndcg@5,
    ndcg@10, ndcg@30, ndcg@50, ndcg@100, rr.
    '''
    metric_keys = [
        "recall@5", "recall@10", "recall@30", "recall@50", "recall@100",
        "ndcg@5", "ndcg@10", "ndcg@30", "ndcg@50", "ndcg@100",
        "rr",
    ]
    values_per_metric = {key: [] for key in metric_keys}

    for item_idx in range(len(test_df)):
        item_metrics = get_metrics_values_bhola(get_ranked_df_bhola(item_idx, test_df))

        for key in metric_keys:
            values_per_metric[key].append(item_metrics[key])

    return [values_per_metric[key] for key in metric_keys]

# Calculate the metrics of every test item; the retrieved skills are only
# collected for the experiments that are not 'bhola'
if EXPERIMENT == 'bhola':
    metrics_overview = get_metric_overview_bhola(test_df)
else:
    metrics_overview, skills_all_items = get_metric_overview(test_df)

In [None]:
# Column names of the result files, in the same order as the lists in metrics_overview
metric_names_M = ['recall5', 'recall10', 'recall30', 'recall50', 'recall100',
                  'ndcg5', 'ndcg10', 'ndcg30', 'ndcg50', 'ndcg100', 'rr']

metric_overview_dict = {}
metric_and_skill_per_item = {}

# Do not take into account vacancies without any explicit skills
# (their nDCG is NaN, so nDCG@5 at position 5 of metrics_overview is used to find them)
ndcg_5 = metrics_overview[5]
ignore_indices = [item_idx for item_idx, value in enumerate(ndcg_5) if np.isnan(value)]
# Set for fast membership tests while filtering below
ignored = set(ignore_indices)

print(f"The values of {len(ignore_indices)} vacancies will not be taken into account")

for metric, values_all in zip(metric_names_M, metrics_overview):
    # Summary statistics only use the vacancies that are not ignored ...
    values = [value for item_idx, value in enumerate(values_all) if item_idx not in ignored]
    metric_overview_dict[metric] = [np.mean(values), np.median(values), np.min(values), np.max(values)]
    # ... while the per-item results keep the values of all vacancies
    metric_and_skill_per_item[metric] = values_all

print("The mean, median, min and max values per metric@m are: \n", metric_overview_dict)
# Create a dataframe to easily download the overview of the results
# (rows are, in order: mean, median, min, max)
df_results_overview = pd.DataFrame(metric_overview_dict)
# Download as TSV
df_results_overview.to_csv(OUTPUT_RESULTS_OVERVIEW, sep='\t', index=False)

# Create a dataframe to easily download the results
df_results = pd.DataFrame(metric_and_skill_per_item)
if EXPERIMENT != 'bhola':
    # Add skills
    df_results["retrievedskills"] = skills_all_items

# Download as TSV
df_results.to_csv(OUTPUT_RESULTS, sep='\t', index=False)