Check CUDA

In [1]:
import tensorflow as tf;
print(tf.config.list_physical_devices('GPU'))

""" gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print("GPU detected")
    tf.config.experimental.set_memory_growth(gpus[0], True)
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
 """
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
tf.config.set_visible_devices([], 'GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Load BERT

In [2]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
bert = TFBertModel.from_pretrained("distilbert-base-multilingual-cased")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.2.attention.k_lin.bias', 'distilbert.transformer.layer.2.attention.q_lin.weight', 'distilbert.transformer.layer.2.output_layer_norm.weight', 'di

Test BERT loading

In [3]:
encoded_input = tokenizer(["text1","text2"], return_tensors='tf')
output = bert(encoded_input)
print(output["pooler_output"].shape)

(2, 768)


Load dataset and transform data

In [4]:
import pandas as pd
import numpy as np

dataFrame = pd.read_pickle(r'../data/7587_corrige.pkl')

In [5]:
subset = dataFrame[['jobTitle', 'description', 'label']].copy()

subset.reset_index(drop=True, inplace=True)
subset.replace('', np.nan, inplace=True)
subset.dropna(inplace=True)

subset['text'] = subset['jobTitle'] + ' ' + subset['description']
subset = subset[['text','label']]
subset

Unnamed: 0,text,label
2,Stagiaire ingénieur en intelligence artificiel...,4.0
3,Stagiaire en développement logiciel Développem...,2.0
4,Stagiaire en développement Web Création et évo...,1.0
5,Stagiaire en développement Web Portage d’une a...,0.0
6,Développeur Data / IA Développement d'applicat...,4.0
...,...,...
11281,Opérateur production Montage de transmission a...,1.0
11282,Opérateur production Montage de transmission a...,1.0
11283,Technicien réparation informatique Reparation ...,0.0
11284,Technicien réparation Reparation & maintenance...,0.0


Get text embeddings

In [6]:
from random import shuffle
import tensorflow as tf

n_shots = 2 # Number of samples per class in the support set

def gen_support_set(n_shots, tokenizer, bert, dataset):
    target_values = dataset["label"].unique()
    
    shuffled_dataset = dataset.sample(frac = 1)
    support_set = {}
    for t in range(len(target_values)):
        current_target_dataset = shuffled_dataset[shuffled_dataset["label"] == t]
        support_set[t] = []
        for i in range(n_shots):
            encoded_input = tokenizer(current_target_dataset.iloc[i]["text"], return_tensors='tf')
            output = bert(encoded_input)["pooler_output"]
            support_set[t].append(output)
    return support_set
    
support_set = gen_support_set(n_shots, tokenizer, bert, subset)
len(support_set)

5

In [7]:
def predict(tokenizer, bert, instance, support_set):
    encoded_input = tokenizer(instance, return_tensors='tf')
    embedding = bert(encoded_input)["pooler_output"]
    similarities = []
    for key in support_set.keys():
        similarities_current_key = []
        for item in support_set[key]:
            similarity = tf.keras.losses.cosine_similarity(embedding, item)
            similarities_current_key.append(-tf.reduce_mean(similarity).numpy())
        similarities.append(np.max(similarities_current_key))
    print(similarities)
    return list(support_set.keys())[np.argmax(similarities)]

print(predict(tokenizer, bert, subset.iloc[0]["text"], support_set))
print(subset.iloc[0]["label"])

[0.9931004, 0.99388397, 0.99519753, 0.994191, 0.99419695]
2
4.0


In [8]:
def gen_batches(training_set, tokenizer, batch_size):
    batches = []
    shuffled_set = training_set.sample(frac=1)

    nb_batches = len(shuffled_set) // batch_size
    
    k = 0
    len_shuffled_set = len(shuffled_set)
    unprocessed_data = shuffled_set["text"].tolist()
    
    for i in range(nb_batches):
        j = 0
        labels = []
        start = i * batch_size
        end = start + batch_size
        unprocessed_batch = unprocessed_data[start:end]
        inputs = tokenizer(unprocessed_batch, return_tensors='tf',padding=True, truncation=True)

        while(j<batch_size and k<len_shuffled_set):
            labels.append(shuffled_set.iloc[k]["label"])
            k += 1
            j += 1
        batches.append((inputs, labels))
            
    return batches

Split the dataset

In [9]:
def split_train_test(dataset, ratio):
    test_set = dataset.sample(frac = ratio)
    train_set = dataset.drop(test_set.index)
    return train_set, test_set

Fine-tune BERT

In [10]:
subset_trunc = subset.head(80)
train_set, test_set = split_train_test(subset_trunc, 0.2)
n_epochs = 5


for epoch in range(n_epochs):
    batches = gen_batches(train_set, tokenizer, 32)
    print("Epoch: ", epoch, "/",n_epochs)
    b = 0
    for batch in batches:
        inputs, labels = batch
        print("Batch: ", b, "/",len(batches))
        b += 1
        with tf.GradientTape() as tape:
            predictions = []
            bert_output = bert(inputs, training=True)
            predictions = bert_output["pooler_output"]
            losses = []
            for i in range(len(predictions)):
                for j in range(n_shots):
                    losses.append(tf.keras.losses.cosine_similarity(support_set[labels[j]], predictions[i]))
            loss = tf.reduce_mean(losses,axis=0)
        print(loss)
        gradients = tape.gradient(loss, bert.trainable_variables)
        tf.keras.optimizers.Adam(learning_rate=2e-5).apply_gradients(zip(gradients, bert.trainable_variables))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch:  0 / 5
Batch:  0 / 2
bert...
tf.Tensor(
[[-0.8865897]
 [-0.888097 ]], shape=(2, 1), dtype=float32)


KeyboardInterrupt: 

In [None]:
print(predict(tokenizer, bert, subset.iloc[0]["text"], support_set))
print(subset.iloc[0]["label"])

[0.96372575, 0.9763564, 0.9829797, 0.97101414, 0.9338231]
2
4.0
