# Notebook to test how to use BERT and PyTorch with CUDA for FSL

Check CUDA

In [62]:
import torch
if torch.cuda.is_available():    
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Load BERT

In [63]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
bert.to(device)
bert.cuda()

encoded_input = tokenizer(["test 1","test 2"], return_tensors='pt')
encoded_input.to(device)
output = bert(**encoded_input)
print(output["pooler_output"].shape)

torch.Size([2, 768])


Load dataset and transform data

In [64]:
import pandas as pd
import numpy as np

dataFrame = pd.read_pickle(r'../data/7587_corrige.pkl')

In [65]:
subset = dataFrame[['jobTitle', 'description', 'label']].copy()

subset.reset_index(drop=True, inplace=True)
subset.replace('', np.nan, inplace=True)
subset.dropna(inplace=True)

subset['text'] = subset['jobTitle'] + ' ' + subset['description']
subset['label'] = np.where((subset["label"] < 3) | (subset["label"].isna()), 0, 1)
subset = subset[['text','label']]
subset

Unnamed: 0,text,label
2,Stagiaire ingénieur en intelligence artificiel...,1
3,Stagiaire en développement logiciel Développem...,0
4,Stagiaire en développement Web Création et évo...,0
5,Stagiaire en développement Web Portage d’une a...,0
6,Développeur Data / IA Développement d'applicat...,1
...,...,...
11281,Opérateur production Montage de transmission a...,0
11282,Opérateur production Montage de transmission a...,0
11283,Technicien réparation informatique Reparation ...,0
11284,Technicien réparation Reparation & maintenance...,0


Split between training and test set and truncate the dataset to simulate few-shot context

In [66]:
def split_train_test(dataset, n_samples_per_class):
    train_set = dataset.groupby('label').head(n_samples_per_class)
    test_set = dataset.drop(train_set.index)
    return train_set, test_set

train_set, test_set = split_train_test(subset, 10)

Create support set

In [67]:
n_shots = min(10,len(train_set)//2) # Number of samples per class in the support set

def gen_support_set(n_shots, tokenizer, dataset):   
    shuffled_dataset = dataset.sample(frac = 1)
    support_set = {}
    for t in [0,1]: # class 0 and class 1 (not related to AI and related to AI)
        current_target_dataset = shuffled_dataset[shuffled_dataset["label"] == t]
        support_set[t] = []
        for i in range(n_shots):
            encoded_input = tokenizer(current_target_dataset.iloc[i]["text"], return_tensors='pt', truncation=True)
            encoded_input.to(device)
            support_set[t].append(encoded_input)
    return support_set
    
support_set = gen_support_set(n_shots, tokenizer, train_set)

Get prototypes from the support set (one prototype per class)

In [68]:
def get_prototypes_support_set(support_set, bert):
    prototypes_support_set = {}
    for t in support_set.keys():
        embeddings_support_set = []
        for i in range(len(support_set[t])):
            output = bert(**(support_set[t][i]))["pooler_output"]
            embeddings_support_set.append(output)
        prototypes_support_set[t] = torch.mean(torch.stack(embeddings_support_set), axis=0)
    return prototypes_support_set

In [69]:
def predict(tokenizer, bert, instance, support_set):
    bert.eval()
    encoded_input = tokenizer(instance, return_tensors='pt', truncation=True)
    encoded_input.to(device)
    embedding = bert(**encoded_input)["pooler_output"]
    similarities = []
    
    prototypes_support_set = get_prototypes_support_set(support_set, bert)
    
    for key in prototypes_support_set.keys():
        similarity_current_key = torch.nn.functional.cosine_similarity(embedding, prototypes_support_set[key])
        similarities.append(similarity_current_key)
    return list(prototypes_support_set.keys())[torch.argmax(torch.stack(similarities))] # Take the closest element of all classes and return its class label

In [70]:
def gen_batches(training_set, tokenizer, batch_size):
    batches = []
    shuffled_set = training_set.sample(frac=1)

    nb_batches = len(shuffled_set) // batch_size
    
    k = 0
    len_shuffled_set = len(shuffled_set)
    unprocessed_data = shuffled_set["text"].tolist()
    
    for i in range(nb_batches):
        j = 0
        labels = []
        start = i * batch_size
        end = start + batch_size
        unprocessed_batch = unprocessed_data[start:end]
        inputs = tokenizer(unprocessed_batch, return_tensors='pt', padding=True, truncation=True)

        while(j<batch_size and k<len_shuffled_set):
            labels.append(shuffled_set.iloc[k]["label"])
            k += 1
            j += 1
        batches.append((inputs, labels))
            
    return batches

Freeze some weights

In [71]:
freeze_first_params_ratio = 0.7
nb_frozen_params = int(freeze_first_params_ratio * len(list(bert.named_parameters())))

for name, param in list(bert.named_parameters())[0:nb_frozen_params+1]: 
    param.requires_grad = False

Fine-tune BERT

In [72]:
n_epochs = 20
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-5)
torch.cuda.empty_cache()

bert.zero_grad()

try:
    bert.train()
    for epoch in range(n_epochs):
        batches = gen_batches(train_set, tokenizer, 16)
        print("Epoch: ", epoch, "/",n_epochs)
        #b = 0
        epoch_mean_loss = 0
        for batch in batches:
            optimizer.zero_grad()
            inputs, labels = batch
            #print("Batch: ", b, "/",len(batches))
            #b += 1
            predictions = []
            inputs.to(device)
            bert_output = bert(**inputs)["pooler_output"]
            losses = []           
            
            embeddings_support_set = get_prototypes_support_set(support_set, bert)
        
            for i in range(len(bert_output)):
                input2 = torch.unsqueeze(bert_output[i],0)
                input2.to(device)
                for j in embeddings_support_set.keys():
                    current_class_support_data = embeddings_support_set[j]
                    target = torch.tensor([1.0]) if j == labels[i] else torch.tensor([-1.0])
                    target = target.to(device)
                    losses.append(torch.nn.functional.cosine_embedding_loss(current_class_support_data, input2, target))
            loss = torch.mean(torch.stack(losses))
            epoch_mean_loss += loss.item()
                        
            loss.backward()
            optimizer.step()
        
        epoch_mean_loss /= len(batches)
        print(f"loss: {epoch_mean_loss:.2f}")
finally:
    torch.cuda.empty_cache()

Epoch:  0 / 20
loss: 0.50
Epoch:  1 / 20
loss: 0.50
Epoch:  2 / 20
loss: 0.50
Epoch:  3 / 20
loss: 0.49
Epoch:  4 / 20
loss: 0.50
Epoch:  5 / 20
loss: 0.49
Epoch:  6 / 20
loss: 0.49
Epoch:  7 / 20
loss: 0.48
Epoch:  8 / 20
loss: 0.47
Epoch:  9 / 20
loss: 0.45
Epoch:  10 / 20
loss: 0.45
Epoch:  11 / 20
loss: 0.42
Epoch:  12 / 20
loss: 0.38
Epoch:  13 / 20
loss: 0.36
Epoch:  14 / 20
loss: 0.26
Epoch:  15 / 20
loss: 0.28
Epoch:  16 / 20
loss: 0.31
Epoch:  17 / 20
loss: 0.19
Epoch:  18 / 20
loss: 0.22
Epoch:  19 / 20
loss: 0.17


In [73]:
pred1 = predict(tokenizer, bert,"Mon rôle chez DreamQuark, est de résoudre les problématiques des différents acteurs autour de la\nbanque et assurance (Churn, upsale, cross-sale etc.) à travers des techniques de Machine\nLearning/Deep learning et analyse statistique.\n\n● Contribution à l'amélioration de Brain, la plateforme d'Auto-ML de Dreamquark, en développant de nouvelles features à l'aide du framework Pytorch, Scikit-learn, Numpy, Pandas, FastApi, Docker, Kubernetes et CircleCi\n\n● Développement d'un package Time Series avec l'intégration de module automatique de preprocessing et module de training avec des réseaux de neurone TCN (Temporal Convolutional Network)\n\n● Développement d'un moteur de data-preparation scalable à l'horizontal compatible Pandas et Dask, s'inspirant de la philosophe Pandas et scikit-learn pipeline permettant de rendre reproductible les codes jupyter en production.\n\nStack Technique :\n\nPython, Pytorch, Scikit-learn, Numpy, Docker, Kubernetes, Circleci, Dask, FastApi, Dask, Azure, Circle\nCi, Prefect, Alembic, SqlAlchemy, Postgresql'",support_set)
pred2 = predict(tokenizer, bert,"• Utilisation de Flask et d’Elasticsearch afin de créer une API\nREST pour faire des recherches sur des régions de\nplanètes.\n\n• Conception d'une application web avec Vue.js et Quasar\nutilisant cette API, avec visualisation 3D des données.",support_set)

print("expected: ", [1,0], "predictions: ", [pred1,pred2])

expected:  [1, 0] predictions:  [1, 1]


Evaluation

In [74]:
from torcheval.metrics.functional import binary_f1_score

def eval(test_set, tokenizer, bert, support_set):   
    bert.eval()

    predictions = []
    expected = []

    batches = gen_batches(test_set, tokenizer, 16)
    # b = 0

    prototypes_support_set = get_prototypes_support_set(support_set, bert)

    for batch in batches:
        inputs, labels = batch
        # print("Batch: ", b, "/",len(batches))
        # b += 1
        inputs.to(device)
        bert_output = bert(**inputs)["pooler_output"]
            
        for i in range(len(bert_output)):
            embedding = torch.unsqueeze(bert_output[i],0)
            similarities = []
            for key in prototypes_support_set.keys():
                similarity_current_key = torch.nn.functional.cosine_similarity(embedding, prototypes_support_set[key])
                similarities.append(similarity_current_key)
            predictions.append(torch.tensor(list(prototypes_support_set.keys())[torch.argmax(torch.stack(similarities))])) # Take the closest element of all classes and return its class label
            expected.append(torch.tensor(labels[i]))

    predictions = torch.stack(predictions)
    expected = torch.stack(expected)

    return binary_f1_score(predictions, expected).item()

In [75]:
print(f"F1 score: {eval(test_set.head(100), tokenizer, bert, support_set):.2f}")

F1 score: 0.67
