In [1]:
import torch
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
data_tain_path = './dataset/train_set.csv'
data_test_path = './dataset/test_set.csv'

In [4]:
data_train = pd.read_csv(data_tain_path)
data_test = pd.read_csv(data_test_path)

In [5]:
data_train

Unnamed: 0,Job_offer,Label
0,"Openjobmetis SpA ricerca, per importante azien...",Java Developer
1,"La persona prescelta, diplomata o laureata in ...",Software Engineer
2,Sei un informatico o matematico con la passion...,Software Engineer
3,"Ti occuperai della progettazione, realizzazion...",Programmer
4,Stiamo cercando uno\una sviluppatore\sviluppat...,Programmer
...,...,...
1747,"* Test JUnit\n * Git, La Ibs Srl è alla ri...",Java Developer
1748,"La ricorsa, inserita all'interno di un team, l...",Programmer
1749,Sviluppatore Java Junior da inserire in attivi...,Java Developer
1750,La risorsa sarà inserita nei team di risorse d...,Web Developer


In [6]:
data_train_formatted = []
for pd_data_row in data_train.iloc:
    row = {'sentence': pd_data_row['Job_offer'], 'label': pd_data_row['Label']}
    data_train_formatted.append(row)

data_test_formatted = []
for pd_data_row in data_test.iloc:
    row = {'sentence': pd_data_row['Job_offer'], 'label': pd_data_row['Label']}
    data_test_formatted.append(row)

In [13]:
data_train_labels = sorted(list(set([r["label"] for r in data_train_formatted])))
data_test_labels = sorted(list(set([r["label"] for r in data_test_formatted])))

In [14]:
data_train_labels

['Java Developer',
 'Programmer',
 'Software Engineer',
 'System Analyst',
 'Web Developer']

In [15]:
data_test_labels

['Java Developer',
 'Programmer',
 'Software Engineer',
 'System Analyst',
 'Web Developer']

In [17]:
id_to_label = data_train_labels
label_to_id = {l:i for i,l in enumerate(id_to_label)}

In [22]:
class JDataset():
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
        
    def create_collate_fn(self):
        def collate_fn(batch):
            batch_formatted = {}
            batch_formatted['sentence'] = [sample['sentence'] for sample in batch]
            batch_formatted['label'] = [sample['label'] for sample in batch]
            return batch_formatted
        return collate_fn

In [23]:
dataset_train = JDataset(data_train_formatted)
dataset_test = JDataset(data_test_formatted)

In [29]:
import torch.optim as optim

loss_function = torch.nn.CrossEntropyLoss()

In [27]:
from code_files.models.transformer_classifier import TClassifier
from code_files.utils.Trainer_nec import Trainer_nec

In [31]:
model = TClassifier(
    loss_fn = loss_function,
    hparams = {
        'transformer_name':"xlm-roberta-base",
        'id_to_label': data_train_labels
    },
    fine_tune_transformer = True
)

Downloading (…)lve/main/config.json: 100%|██████████| 615/615 [00:00<00:00, 136kB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:01<00:00, 4.89MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:01<00:00, 6.35MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.12G/1.12G [00:43<00:00, 25.8MB/s]
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceC

In [33]:
def print_summary(model, short = False):
    """prints the summary for a model

    Args:
        model (any): The torch model
        short (bool, optional): If the print must be synthetic. Defaults to False.
    """
    if not short:
        print(model)
        print('----------------------')
    p = sum(p.numel() for p in model.parameters())
    tp = sum(p.numel() for p in model.parameters() if p.requires_grad)
    ntp = p - tp
    print('parameters:', f'{p:,}')
    print('trainable parameters:', f'{tp:,}')
    print('non-trainable parameters:', f'{ntp:,}')

In [32]:
optimizer_pid = optim.SGD(model.parameters(), lr=0.0016, momentum=0.9)

In [35]:
print_summary(model, short = True)

parameters: 278,047,493
trainable parameters: 278,047,493
non-trainable parameters: 0
