Check CUDA

In [1]:
import torch
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load BERT

In [2]:
from transformers import BertTokenizer, BertModel
# Load the multilingual cased BERT checkpoint together with its tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
# Move the model to the device chosen in the first cell. The model must not be
# sent to CUDA unconditionally: that raises on a machine without a GPU and
# defeats the CPU fallback.
bert.to(device)

# Smoke test: encode two short strings and check the shape of the pooled output.
encoded_input = tokenizer(["test 1","test 2"], return_tensors='pt')
encoded_input.to(device)
with torch.no_grad():  # shape check only, no autograd graph needed
    output = bert(**encoded_input)
print(output["pooler_output"].shape)

torch.Size([2, 768])


Load dataset and transform data

In [3]:
import pandas as pd
import numpy as np

# Relative path of the pickled DataFrame used throughout the notebook.
DATA_PATH = r'../data/7587_corrige.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
dataFrame = pd.read_pickle(DATA_PATH)

In [4]:
# Build the (text, label) table used by every later cell:
#   1. keep only the job title, description and label columns,
#   2. renumber the rows, then treat empty strings as missing values,
#   3. drop every row with a missing value in any of the three columns,
#   4. join title and description (single space) into one "text" column.
subset = (
    dataFrame[['jobTitle', 'description', 'label']]
    .reset_index(drop=True)
    .replace('', np.nan)
    .dropna()
    .assign(text=lambda frame: frame['jobTitle'] + ' ' + frame['description'])
    [['text', 'label']]
)
subset

Unnamed: 0,text,label
2,Stagiaire ingénieur en intelligence artificiel...,4.0
3,Stagiaire en développement logiciel Développem...,2.0
4,Stagiaire en développement Web Création et évo...,1.0
5,Stagiaire en développement Web Portage d’une a...,0.0
6,Développeur Data / IA Développement d'applicat...,4.0
...,...,...
11281,Opérateur production Montage de transmission a...,1.0
11282,Opérateur production Montage de transmission a...,1.0
11283,Technicien réparation informatique Reparation ...,0.0
11284,Technicien réparation Reparation & maintenance...,0.0


Get text embeddings

In [5]:
n_shots = 10  # Number of samples per class in the support set


def gen_support_set(n_shots, tokenizer, dataset):
    """Draw a random few-shot support set and tokenize it.

    Args:
        n_shots: number of examples to keep for every distinct label.
        tokenizer: callable applied to one text at a time with
            ``return_tensors='pt', truncation=True``.
        dataset: DataFrame with a ``"text"`` and a ``"label"`` column.

    Returns:
        dict mapping each distinct label to a list of ``n_shots`` tokenizer
        outputs, each sent to the notebook-level ``device``.

    Note:
        The rows are picked by position after shuffling, so a label with fewer
        than ``n_shots`` rows makes the positional lookup go out of bounds.
    """
    labels = dataset["label"].unique()
    shuffled = dataset.sample(frac=1)

    support = {}
    for label in labels:
        rows_for_label = shuffled[shuffled["label"] == label]
        shots = []
        for position in range(n_shots):
            text = rows_for_label.iloc[position]["text"]
            encoded = tokenizer(text, return_tensors='pt', truncation=True)
            # The return value is ignored, so this relies on .to() updating
            # the encoding in place.
            encoded.to(device)
            shots.append(encoded)
        support[label] = shots
    return support
    
# Draw the support set once from the whole `subset`; later cells reuse this global.
support_set = gen_support_set(n_shots, tokenizer, subset)

In [6]:
def get_embeddings_support_set(support_set, bert):
    """Run every support example through ``bert`` and keep its pooled output.

    Args:
        support_set: dict mapping a label to a list of keyword-argument
            mappings (tokenizer outputs) accepted by ``bert``.
        bert: callable model whose result exposes ``"pooler_output"``.

    Returns:
        dict with the same keys, in the same order, where every list holds the
        ``"pooler_output"`` of the corresponding support example.
    """
    return {
        label: [bert(**encoded)["pooler_output"] for encoded in shots]
        for label, shots in support_set.items()
    }

# Embed the support set with the model's current weights.
# NOTE(review): predict() builds its own local embeddings and the training cell
# reassigns this name for every batch, so this value does not appear to be read
# by the cells shown here.
embeddings_support_set = get_embeddings_support_set(support_set, bert)

In [7]:
def predict(tokenizer, bert, instance, support_set):
    """Predict the label of ``instance`` by nearest support example.

    The text is embedded with ``bert`` (pooled output), compared by cosine
    similarity with every embedded support example, and the label owning the
    single most similar example is returned.

    Inference runs in evaluation mode and without gradient tracking: the
    training cell leaves the model in ``train()`` mode, where dropout would make
    predictions non-deterministic, and an autograd graph is useless here. The
    model's previous train/eval mode is restored before returning.

    Args:
        tokenizer: tokenizer called with ``return_tensors='pt', truncation=True``.
        bert: model whose output exposes ``"pooler_output"``.
        instance: the text to classify.
        support_set: dict label -> list of tokenized support examples, as
            produced by ``gen_support_set``.

    Returns:
        The key of ``support_set`` whose closest example is most similar to
        ``instance``.
    """
    was_training = bert.training
    bert.eval()
    try:
        with torch.no_grad():
            encoded_input = tokenizer(instance, return_tensors='pt', truncation=True)
            encoded_input.to(device)
            embedding = bert(**encoded_input)["pooler_output"]

            # Re-embed the support set so it reflects the model's current weights.
            embeddings_support_set = get_embeddings_support_set(support_set, bert)

            class_labels = list(embeddings_support_set.keys())
            best_per_class = []
            for key in class_labels:
                similarities_current_key = [
                    torch.mean(torch.nn.functional.cosine_similarity(embedding, item))
                    for item in embeddings_support_set[key]
                ]
                # Take the closest element of the support set for the class key to the input
                best_per_class.append(torch.max(torch.stack(similarities_current_key)))

            # Take the closest element of all classes and return its class label
            best_index = torch.argmax(torch.stack(best_per_class)).item()
            return class_labels[best_index]
    finally:
        # Leave the model in whatever mode the caller had it in.
        bert.train(was_training)

# Sanity check on the first row: predicted label first, then the true label.
first_row = subset.iloc[0]
print(predict(tokenizer, bert, first_row["text"], support_set))
print(first_row["label"])

1.0
4.0


In [8]:
def gen_batches(training_set, tokenizer, batch_size, drop_last=True):
    """Shuffle ``training_set`` and cut it into tokenized mini-batches.

    Args:
        training_set: DataFrame with a ``"text"`` and a ``"label"`` column.
        tokenizer: callable applied to a list of texts with
            ``return_tensors='pt', padding=True, truncation=True``.
        batch_size: number of rows per batch.
        drop_last: when True (default, the historical behaviour) a trailing
            batch smaller than ``batch_size`` is discarded; when False it is
            kept so every row is seen once per call.

    Returns:
        list of ``(inputs, labels)`` tuples. ``inputs`` is the tokenizer output
        for the batch texts and ``labels`` is the list of their labels, in the
        same order (plain Python scalars from ``Series.tolist()``).
    """
    shuffled_set = training_set.sample(frac=1)
    # Extract both columns once, after shuffling, so texts and labels stay
    # aligned by position and can simply be sliced per batch.
    texts = shuffled_set["text"].tolist()
    all_labels = shuffled_set["label"].tolist()

    nb_batches = len(texts) // batch_size
    if not drop_last and len(texts) % batch_size:
        nb_batches += 1  # keep the incomplete tail batch

    batches = []
    for i in range(nb_batches):
        start = i * batch_size
        end = start + batch_size
        inputs = tokenizer(texts[start:end], return_tensors='pt', padding=True, truncation=True)
        batches.append((inputs, all_labels[start:end]))

    return batches

Split the dataset

In [9]:
def split_train_test(dataset, ratio, random_state=None):
    """Randomly split ``dataset`` into a train part and a test part.

    Args:
        dataset: DataFrame to split.
        ratio: fraction of the rows (0..1) sampled into the test set.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced; ``None`` (default) keeps it random.

    Returns:
        ``(train_set, test_set)``: the test set is the sampled rows, the train
        set is ``dataset`` with the test set's index labels removed.

    Note:
        Rows are removed by index label, so this assumes ``dataset`` has a
        unique index (TODO confirm for other callers).
    """
    test_set = dataset.sample(frac=ratio, random_state=random_state)
    train_set = dataset.drop(index=test_set.index)
    return train_set, test_set

Fine-tune BERT

In [24]:
# Fine-tune on the first 100 rows only, split 80/20 into train and test.
# NOTE(review): `support_set` was sampled from the whole `subset`, so it may
# share rows with `train_set`/`test_set`; `test_set` is not used in this cell.
subset_trunc = subset.head(100)
train_set, test_set = split_train_test(subset_trunc, 0.2)
n_epochs = 10
# NOTE(review): AdamW is used with its default learning rate, which is much
# larger than the values usually used to fine-tune BERT; consider lowering it.
optimizer = torch.optim.AdamW(bert.parameters())
torch.cuda.empty_cache()

bert.zero_grad()

# Targets for cosine_embedding_loss: +1 pulls a pair together, -1 pushes it
# apart. Built once on the right device instead of once per comparison.
same_class_target = torch.tensor([1.0], device=device)
other_class_target = torch.tensor([-1.0], device=device)

try:
    for epoch in range(n_epochs):
        # Reshuffle and re-tokenize the training data at every epoch.
        batches = gen_batches(train_set, tokenizer, 16)
        print("Epoch: ", epoch, "/",n_epochs)
        b = 0
        epoch_mean_loss = 0
        for batch in batches:
            optimizer.zero_grad()
            bert.train()
            inputs, labels = batch
            print("Batch: ", b, "/",len(batches))
            b += 1
            inputs.to(device)
            bert_output = bert(**inputs)["pooler_output"]
            losses = []

            # Re-embed the support set with the current weights; gradients
            # flow through these embeddings as well as through the batch.
            embeddings_support_set = get_embeddings_support_set(support_set, bert)

            for i in range(len(bert_output)):
                input2 = torch.unsqueeze(bert_output[i],0)
                for j in embeddings_support_set.keys():
                    target = same_class_target if j == labels[i] else other_class_target
                    # Iterate over the stored shots themselves rather than
                    # range(n_shots), so the loop cannot drift from the
                    # support set's real size.
                    for support_embedding in embeddings_support_set[j]:
                        losses.append(torch.nn.functional.cosine_embedding_loss(support_embedding, input2, target))

            loss = torch.mean(torch.stack(losses))
            epoch_mean_loss += loss.item()

            loss.backward()
            # Clip AFTER backward(): clipping before it only sees the freshly
            # zeroed gradients and therefore has no effect.
            torch.nn.utils.clip_grad_norm_(bert.parameters(), 1.0)
            optimizer.step()

        epoch_mean_loss /= len(batches)
        print(f"loss: {epoch_mean_loss:.2f}")
finally:
    # Release cached GPU memory even when the run is interrupted.
    torch.cuda.empty_cache()

Epoch:  0 / 10
Batch:  0 / 5
Batch:  1 / 5
Batch:  2 / 5
Batch:  3 / 5
Batch:  4 / 5
loss: 0.75
Epoch:  1 / 10
Batch:  0 / 5


KeyboardInterrupt: 

In [23]:
# Repeat the single-row sanity check: predicted label first, then the true label.
first_row = subset.iloc[0]
print(predict(tokenizer, bert, first_row["text"], support_set))
print(first_row["label"])

0.0
4.0
