In [1]:
import torch
import pandas as pd
import os, sys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reload imported project modules automatically so that edits to them are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
# Locations of the train/test CSV splits (relative paths).
# NOTE(review): `data_tain_path` looks like a typo for `data_train_path`; it is
# read again in the cell that loads the CSVs, so rename both places together.
data_tain_path = './dataset/train_set.csv'
data_test_path = './dataset/test_set.csv'
# Number of samples per batch, shared by the train and dev DataLoaders.
batch_size = 1

In [4]:
# Load both splits into DataFrames.
# NOTE(review): the recorded preview of `data_train` shows an 'Unnamed: 0'
# column, which suggests the CSVs were written together with their index;
# passing `index_col=0` would avoid it — confirm against the files.
data_train = pd.read_csv(data_tain_path)
data_test = pd.read_csv(data_test_path)

In [5]:
# Preview the training split; 'Job_offer' and 'Label' are the columns used downstream.
data_train

Unnamed: 0,Job_offer,Label
0,"Openjobmetis SpA ricerca, per importante azien...",Java Developer
1,"La persona prescelta, diplomata o laureata in ...",Software Engineer
2,Sei un informatico o matematico con la passion...,Software Engineer
3,"Ti occuperai della progettazione, realizzazion...",Programmer
4,Stiamo cercando uno\una sviluppatore\sviluppat...,Programmer
...,...,...
1747,"* Test JUnit\n * Git, La Ibs Srl è alla ri...",Java Developer
1748,"La ricorsa, inserita all'interno di un team, l...",Programmer
1749,Sviluppatore Java Junior da inserire in attivi...,Java Developer
1750,La risorsa sarà inserita nei team di risorse d...,Web Developer


In [6]:
def _frame_to_samples(frame):
    """Convert a job-offer DataFrame into a list of sample dicts.

    Args:
        frame (pd.DataFrame): Frame providing 'Job_offer' and 'Label' columns.

    Returns:
        list[dict]: One ``{'sentence': ..., 'label': ...}`` dict per row,
        in the frame's row order.
    """
    # Walk the two needed columns in lockstep instead of iterating
    # `frame.iloc`, which materialises a full Series for every row.
    return [
        {'sentence': sentence, 'label': label}
        for sentence, label in zip(frame['Job_offer'], frame['Label'])
    ]


# Both splits go through the same helper so their sample format cannot drift apart.
data_train_formatted = _frame_to_samples(data_train)
data_test_formatted = _frame_to_samples(data_test)

In [7]:
# Distinct labels of each split, sorted so that the ordering is deterministic.
data_train_labels = sorted({sample["label"] for sample in data_train_formatted})
data_test_labels = sorted({sample["label"] for sample in data_test_formatted})

In [8]:
# Show the sorted label set found in the training split.
data_train_labels

['Java Developer',
 'Programmer',
 'Software Engineer',
 'System Analyst',
 'Web Developer']

In [9]:
# Show the sorted label set found in the test split (for comparison with the training one).
data_test_labels

['Java Developer',
 'Programmer',
 'Software Engineer',
 'System Analyst',
 'Web Developer']

In [10]:
# The sorted training labels double as the index -> label table.
id_to_label = data_train_labels
# Inverse lookup: each label string maps to its position in `id_to_label`.
label_to_id = {label: index for index, label in enumerate(id_to_label)}

In [11]:
class JDataset:
    """Minimal indexable dataset wrapping a sequence of sample dicts.

    Each sample is expected to provide the keys 'sentence' and 'label',
    which are the ones the collate function reads.
    """

    def __init__(self, data):
        """Store the samples as-is (no copy is made).

        Args:
            data: Indexable, sized collection of sample dicts.
        """
        self.data = data

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def create_collate_fn(self):
        """Build the function that merges a list of samples into one batch.

        Returns:
            callable: ``collate_fn(batch)`` mapping a list of sample dicts to
            ``{'sentence': [...], 'label': [...]}``, preserving sample order.
        """
        def collate_fn(batch):
            # One list per field, gathered field by field in a fixed key order.
            return {
                field: [sample[field] for sample in batch]
                for field in ('sentence', 'label')
            }
        return collate_fn

In [12]:
# Wrap the formatted sample lists so they can be handed to a DataLoader.
dataset_train = JDataset(data_train_formatted)
dataset_test = JDataset(data_test_formatted)

In [13]:
from torch.utils.data import DataLoader

# Training batches: the sample order is reshuffled on every pass over the loader.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    collate_fn=dataset_train.create_collate_fn(),
    shuffle=True,
)

# Evaluation batches keep the dataset order. The collate function is taken from
# the test dataset itself (it was previously borrowed from `dataset_train`), so
# each loader stays self-contained even if collation becomes dataset-specific.
dataloader_dev = DataLoader(
    dataset_test,
    batch_size=batch_size,
    collate_fn=dataset_test.create_collate_fn(),
    shuffle=False,
)

In [14]:
import torch.optim as optim

# Cross-entropy loss; handed to the classifier as `loss_fn` when the model is built.
loss_function = torch.nn.CrossEntropyLoss()

In [15]:
from code_files.models.transformer_classifier import TClassifier
from code_files.utils.Trainer_nec import Trainer_nec

In [16]:
# Build the classifier on top of the "xlm-roberta-base" checkpoint.
model = TClassifier(
    hparams=dict(
        transformer_name="xlm-roberta-base",
        # Label list produced from the training split.
        id_to_label=data_train_labels,
    ),
    loss_fn=loss_function,
    fine_tune_transformer=True,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
def print_summary(model, short = False):
    """Print parameter counts for a model, optionally preceded by the model itself.

    Args:
        model (any): Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a torch module).
        short (bool, optional): When True, skip printing the model and show
            only the counts. Defaults to False.
    """
    if not short:
        print(model)
        print('----------------------')

    # Tally total and trainable element counts in a single pass.
    total = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    report = (
        ('parameters:', total),
        ('trainable parameters:', trainable),
        ('non-trainable parameters:', total - trainable),
    )
    for label, value in report:
        # Thousands separators keep large counts readable.
        print(label, f'{value:,}')

In [18]:
# SGD with momentum over every parameter the model exposes.
optimizer_pid = optim.SGD(model.parameters(), lr=0.0016, momentum=0.9)

In [19]:
# Parameter counts only; short=True skips printing the model itself.
print_summary(model, short = True)

parameters: 278,047,493
trainable parameters: 278,047,493
non-trainable parameters: 0


In [20]:
# Grab the first batch of the dev loader for inspection. `next(iter(...))`
# states the intent directly and raises StopIteration right here if the loader
# is empty, instead of leaving `e` undefined (or stale from an earlier run).
e = next(iter(dataloader_dev))

In [21]:
# Show the sampled batch: a dict with parallel 'sentence' and 'label' lists.
e

{'sentence': ["Siamo alla ricerca di figure di Full Stack Developer da inserire all'interno del nostro Team di sviluppo per l'evoluzione di sistemi complessi in ambito gestionale e la realizzazione di funzionalità innovative per i nostri clienti. Facendo riferimento al proprio Team Leader e seguendo un processo ben strutturato, la figura ricercata dovrà seguire la progettazione, lo sviluppo ed il test degli interventi assegnati e la manutenzione dei moduli della suite."],
 'label': ['Web Developer']}

In [22]:
# Start from an empty history; it is passed to the trainer as `saved_history`.
history = {}

In [23]:
trainer = Trainer_nec()

# NOTE(review): the output recorded for this cell ends in a CUDA
# OutOfMemoryError on a 4 GiB GPU even though batch_size is 1. The raw tensor
# dumps in that output are presumably debug prints inside the project code —
# confirm and remove them there.
history = trainer.train(
    model,
    optimizer_pid,
    dataloader_train,
    dataloader_dev,
    device=device,
    epochs=100,
    min_score=0.8,
    save_best=True,
    saved_history=history,
    save_path_name=os.path.join('checkpoints/transformer_classifier/', 'transformer.pth'),
)

torch.Size([1, 23])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0480,  0.0987,  0.0585,  ..., -0.1251,  0.0694, -0.0113],
         [ 0.0030, -0.0402, -0.0480,  ..., -0.0980, -0.0076, -0.1950],
         [-0.0964, -0.0428,  0.0032,  ..., -0.1178, -0.0658,  0.1699],
         ...,
         [-0.0534, -0.0095,  0.0047,  ...,  0.0093, -0.0347,  0.1265],
         [-0.1634, -0.0852,  0.2749,  ..., -0.3903, -0.2148,  0.1752],
         [-0.0117, -0.0361, -0.0540,  ..., -0.4711, -0.0908,  0.0541]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.4394e-02,  2.3937e-01,  1.4170e-01,  4.8144e-01,  3.1102e-02,
          3.4568e-01,  4.2155e-01, -4.2921e-01,  1.5022e-01, -1.5042e-01,
          1.0694e-01,  1.3605e-01,  3.7633e-01,  3.5358e-01, -2.2302e-01,
         -1.8314e-01,  2.0493e-01,  4.1246e-01, -4.4405e-02, -1.9544e-01,
         -2.8153e-01,  3.5291e-01, -6.7343e-01, -5.5139e-01, -2.2133e-01,
          5.8162e-01,  1.1278e

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 4.00 GiB total capacity; 3.34 GiB already allocated; 0 bytes free; 3.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF