# Notebook to test SetFit performance

Check CUDA

In [1]:
import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load dataset and transform data

In [2]:
import pandas as pd
import numpy as np

# Load the source DataFrame from a local pickle; later cells read its
# 'jobTitle', 'description' and 'label' columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
dataFrame = pd.read_pickle(r'../data/7587_corrige.pkl')

In [3]:
# Build the (text, label) frame in one chain so `dataFrame` is never mutated
# and re-running the cell always starts from the raw data.
subset = (
    dataFrame[['jobTitle', 'description', 'label']]
    .reset_index(drop=True)
    # Treat empty strings as missing so the dropna() below removes them too.
    .replace('', np.nan)
    .dropna()
    .assign(
        # Model input: job title followed by the description.
        text=lambda frame: frame['jobTitle'] + ' ' + frame['description'],
        # Binarise: labels below 3 become 0, everything else 1.
        # NOTE(review): the isna() clause cannot match here because dropna()
        # has already removed rows with a missing label.
        label=lambda frame: np.where(
            (frame["label"] < 3) | (frame["label"].isna()), 0, 1
        ),
    )
    .loc[:, ['text', 'label']]
)
subset

Unnamed: 0,text,label
2,Stagiaire ingénieur en intelligence artificiel...,1
3,Stagiaire en développement logiciel Développem...,0
4,Stagiaire en développement Web Création et évo...,0
5,Stagiaire en développement Web Portage d’une a...,0
6,Développeur Data / IA Développement d'applicat...,1
...,...,...
11281,Opérateur production Montage de transmission a...,0
11282,Opérateur production Montage de transmission a...,0
11283,Technicien réparation informatique Reparation ...,0
11284,Technicien réparation Reparation & maintenance...,0


Split the data into training and test sets, then truncate the training set to simulate a few-shot context

In [4]:
from datasets import Dataset

def split_train_test(dataset, ratio, random_state=None):
    """Randomly split a DataFrame into disjoint train and test frames.

    Args:
        dataset: pandas DataFrame to split.
        ratio: Fraction of rows (between 0 and 1) that goes to the test set.
        random_state: Optional seed (or random state) forwarded to pandas'
            sampling. Pass a fixed value to get a reproducible split; with
            the default None the split differs on every call.

    Returns:
        Tuple ``(train_set, test_set)``. The test rows come back in sampled
        order, the training rows keep their original order.
    """
    n_rows = len(dataset)
    # Sample row *positions* rather than index labels: dropping by label
    # would remove every row sharing a label when the index has duplicates,
    # silently shrinking the training set.
    test_positions = (
        pd.Series(np.arange(n_rows))
        .sample(frac=ratio, random_state=random_state)
        .to_numpy()
    )
    is_test = np.zeros(n_rows, dtype=bool)
    is_test[test_positions] = True

    test_set = dataset.iloc[test_positions]
    train_set = dataset.iloc[~is_test]
    return train_set, test_set

# Seed NumPy's global RNG before splitting: pandas' sampling falls back to it
# when no explicit random_state is given, so without a seed the train/test
# split (and every metric reported below) changes on each run.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Hold out 20% of the rows for testing.
train_set, test_set = split_train_test(subset, 0.2)

# Convert both frames to Hugging Face datasets.
train_ds = Dataset.from_pandas(train_set, split="train")
test_ds = Dataset.from_pandas(test_set, split="test")

In [5]:
from setfit import sample_dataset

# The whole held-out split is used for evaluation.
test_dataset = test_ds

# Few-shot simulation: keep up to 10 training examples per label.
train_dataset = sample_dataset(train_ds, num_samples=10, label_column="label")
print(train_dataset)

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 20
})


  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))


Load a pre-trained sentence-transformers model to be used by SetFit

In [6]:
from setfit import SetFitModel

# Load the pre-trained sentence-transformers body from the Hub as a SetFit model.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
# Move the model to the device selected in the "Check CUDA" cell instead of
# hard-coding "cuda", so the notebook also runs on CPU-only machines.
model.to(device)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


SetFitModel(model_body=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})

In [7]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score

# Create trainer

def compute_metrics(y_pred, y_test):
    """Score predictions against the reference labels.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels.

    Returns:
        dict with keys 'accuracy' and 'f1', holding the results of
        ``accuracy_score`` and ``f1_score`` respectively.
    """
    scorers = (('accuracy', accuracy_score), ('f1', f1_score))
    return {name: scorer(y_test, y_pred) for name, scorer in scorers}

# Training hyper-parameters. Tuple values are given as a pair: the first entry
# is for the transformer, the second for the classification head.
trainer_arguments = TrainingArguments(
    loss=CosineSimilarityLoss,
    batch_size=(8, 2),
    num_iterations=20,  # Number of text pairs to generate for contrastive learning
    num_epochs=(1, 10),  # Number of epochs (transformer, classification head)
)

# Tie together the model, the few-shot training set, the test set used for
# evaluation, and the metric function.
trainer = Trainer(
    model=model,
    args=trainer_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    metric=compute_metrics,
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Training and evaluation

In [8]:
# Train `model` on the few-shot `train_dataset` with the arguments given to the Trainer.
trainer.train()

***** Running training *****
  Num unique pairs = 800
  Batch size = 8
  Num epochs = 1
  Total optimization steps = 100


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{'embedding_loss': 0.2509, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'embedding_loss': 0.001, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.5}
{'embedding_loss': 0.0004, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 211.4763, 'train_samples_per_second': 3.783, 'train_steps_per_second': 0.473, 'epoch': 1.0}


Evaluate on the test set, then save/load the fine-tuned model

In [9]:
# Score the trained model on the Trainer's `eval_dataset` (the full test split)
# using `compute_metrics`; the resulting dict is displayed as the cell output.
metrics = trainer.evaluate()
metrics

***** Running evaluation *****


{'accuracy': 0.8044530493707648, 'f1': 0.6456140350877193}

In [10]:
# Use the public save_pretrained() API rather than the private _save_pretrained()
# hook; the public method creates the target directory when it does not exist.
model.save_pretrained(r'../models/setfit')

In [11]:
# Reload the fine-tuned model through the public from_pretrained() entry point
# (the private _from_pretrained() hook is an implementation detail).
model = SetFitModel.from_pretrained(r'../models/setfit')

A small test on unseen data

In [14]:
# Two hand-written French snippets, paired with the labels we expect for them.
unseen_texts = [
    "Mon rôle chez DreamQuark, est de résoudre les problématiques des différents acteurs autour de la\nbanque et assurance (Churn, upsale, cross-sale etc.) à travers des techniques de Machine\nLearning/Deep learning",
    "• Utilisation de Flask et d’Elasticsearch afin de créer une API\nREST pour faire des recherches sur des régions de\nplanètes.",
]
expected_labels = [1, 0]

preds = model(unseen_texts)
print("predictions: ", preds, "expected: ", expected_labels)

predictions:  tensor([1, 0], dtype=torch.int32) expected:  [1, 0]
