# Notebook to test SetFit performance

# Setup and data cleaning

## Change logs settings

In [1]:
from sentence_transformers.losses import CosineSimilarityLoss, BatchAllTripletLoss, BatchHardTripletLossDistanceFunction

import pandas as pd
import numpy as np

import logging
import warnings

# Optional log silencing, currently left disabled: uncomment the two lines below to hide
# INFO-level log records and DeprecationWarning messages, which were too noisy during the tests.
# logging.disable(logging.INFO)
# warnings.filterwarnings("ignore", category=DeprecationWarning) 

from datasets import disable_progress_bar
disable_progress_bar() # Disable the "Map" progress bar during the tests

## Load and clean the dataset

This dataset is not on the GitHub repository.
It is composed of work experiences fetched from LinkedIn, each labelled from 0 to 4 (0 if the experience is not related to AI, 4 if it is).

In [2]:
# Load the labelled experiences from the pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dataFrame = pd.read_pickle(r'../data/7587_corrige.pkl')

# Keep the three useful columns, turn empty strings into NaN, drop every row that
# contains a NaN, then merge title and description into a single "text" column.
# NOTE(review): dropna() looks at all columns, so rows whose label is NaN are removed
# here as well; the isna() branch in the binarization below can therefore never
# match. Confirm whether unlabelled rows were meant to be kept as class 0.
subset = (
    dataFrame[['jobTitle', 'description', 'label']]
    .reset_index(drop=True)
    .replace('', np.nan)
    .dropna()
    .assign(text=lambda frame: frame['jobTitle'] + ' ' + frame['description'])
    [['text', 'label']]
)

# Binary target: labels below 3 (or missing) become 0, labels 3 and above become 1.
subset_label_transform = subset.copy()
is_negative = (subset_label_transform["label"] < 3) | (subset_label_transform["label"].isna())
subset_label_transform['label'] = np.where(is_negative, 0, 1)
subset_label_transform

Unnamed: 0,text,label
2,Stagiaire ingénieur en intelligence artificiel...,1
3,Stagiaire en développement logiciel Développem...,0
4,Stagiaire en développement Web Création et évo...,0
5,Stagiaire en développement Web Portage d’une a...,0
6,Développeur Data / IA Développement d'applicat...,1
...,...,...
11281,Opérateur production Montage de transmission a...,0
11282,Opérateur production Montage de transmission a...,0
11283,Technicien réparation informatique Reparation ...,0
11284,Technicien réparation Reparation & maintenance...,0


## Split the dataset into two subsets: the training and test sets

In [3]:
from benchmark.utility import split_dataset
# Split the binarized dataset into the training and test sets used by every experiment below.
# The second argument (0.2) is presumably the fraction kept for the test set — TODO confirm in benchmark.utility.
# NOTE(review): no seed is passed here; check whether split_dataset is deterministic so that runs are reproducible.
train_set, test_set = split_dataset(subset_label_transform, 0.2)

## Run tests

In [4]:
from benchmark.utility import save_to_json
from benchmark.tests import n_shot_tests, input_length_tests, distance_tests, loss_tests, language_tests, model_tests, num_epochs_tests, constant_params_tests, data_augmentation_tests
from benchmark.train_eval_task import setfit_f1_score

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### N-shots

By default SetFit uses the oversampling strategy and the Cosine Similarity loss. For instance if we have 8 positive and 8 negative examples then we have:

|   | Y | Y | Y | Y | Y | Y | Y | Y | N | N | N | N | N | N | N | N |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Y | + | + | + | + | + | + | + | + | - | - | - | - | - | - | - | - |
| Y |   | + | + | + | + | + | + | + | - | - | - | - | - | - | - | - |
| Y |   |   | + | + | + | + | + | + | - | - | - | - | - | - | - | - |
| Y |   |   |   | + | + | + | + | + | - | - | - | - | - | - | - | - |
| Y |   |   |   |   | + | + | + | + | - | - | - | - | - | - | - | - |
| Y |   |   |   |   |   | + | + | + | - | - | - | - | - | - | - | - |
| Y |   |   |   |   |   |   | + | + | - | - | - | - | - | - | - | - |
| Y |   |   |   |   |   |   |   | + | - | - | - | - | - | - | - | - |
| N |   |   |   |   |   |   |   |   | + | + | + | + | + | + | + | + |
| N |   |   |   |   |   |   |   |   |   | + | + | + | + | + | + | + |
| N |   |   |   |   |   |   |   |   |   |   | + | + | + | + | + | + |
| N |   |   |   |   |   |   |   |   |   |   |   | + | + | + | + | + |
| N |   |   |   |   |   |   |   |   |   |   |   |   | + | + | + | + |
| N |   |   |   |   |   |   |   |   |   |   |   |   |   | + | + | + |
| N |   |   |   |   |   |   |   |   |   |   |   |   |   |   | + | + |
| N |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   | + |

- Positive (same-label) pairs: P = 2 * (8 + 7 + 6 + 5 + 4 + 3 + 2 + 1) = 72
- Negative (different-label) pairs: N = 8 * 8 = 64, plus 8 duplicated pairs added by oversampling = 72
- Total = 72 + 72 = 144

In [5]:
# N-shot experiment: sweep the "n_shot" values below with a fixed model and loss,
# then store the scores, timings and parameters under ../results/setfit/n_shot.
params = dict(
    n_shot=[1, 2, 4, 6, 10, 20, 40, 60, 100],
    n_iter=15,
    n_max_iter_per_shot=10,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    loss=CosineSimilarityLoss,
)

results, train_times, eval_times = n_shot_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/n_shot')

  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))


Step: 1 / 1 Estimated remaining time: ?


### Input length

In [None]:
# Input-length experiment: one run per [min, max] interval listed in "input_length_range"
# (the unit of the bounds is defined by input_length_tests), with everything else held constant.
#
# Other interval sets tried previously, kept for reference:
#   [[6,10],[10,15],[15,20],[20,30], [6,15], [15,30], [6,20], [10,30], [6,30]]
#   [[0,5],[5,10], [10,50], [50,100],[100,200],[200,350]]
#   [[0,9],[1,9],[2,9],[3,9],[4,9],[5,9],[6,9],[7,9],[8,9],[9,9]]
#   [[0,9],[9,100],[9,350],[100,350],[0,350]]
#   [[8,50],[8,100],[8,150],[8,200],[8,250],[8,300],[8,350]]
#   [[7,350],[8,350],[9,350],[10,350]]
#   [[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10]]
#   [[0,5],[0,10],[0,100],[6,100],[ 200,350]]
params = dict(
    input_length_range=[[0, 5], [5, 25], [25, 50], [50, 100], [100, 200], [200, 350]],
    n_shot=10,
    n_iter=100,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    loss=CosineSimilarityLoss,
)

results, train_times, eval_times = input_length_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/input_length')

### Distance

In [None]:
# Distance experiment: compare the two distance functions below, keyed by the
# display name used in the saved results.
# NOTE(review): SetFit documents its distance metric as being used only with triplet
# losses; since "loss" is CosineSimilarityLoss here, check in distance_tests that the
# two entries really lead to different training runs.
distance_functions = {
    "Cosine": BatchHardTripletLossDistanceFunction.cosine_distance,
    # The attribute really is spelled "eucledian" (not "euclidean") in sentence_transformers.
    "Euclidian": BatchHardTripletLossDistanceFunction.eucledian_distance,
}

params = dict(
    n_shot=10,
    n_iter=100,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    distance=distance_functions,
    loss=CosineSimilarityLoss,
)

results, train_times, eval_times = distance_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/distance')

### Loss (pair-wise or Triplet)

In [None]:
# Loss experiment: pair-wise loss (CosineSimilarityLoss) versus triplet loss
# (BatchAllTripletLoss), with the shot count, iteration count and model held constant.
params = dict(
    n_shot=10,
    n_iter=100,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    loss={
        "Pair-wise": CosineSimilarityLoss,
        "Triplet": BatchAllTripletLoss,
    },
)

results, train_times, eval_times = loss_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/loss')

### Language

In [None]:
# Language experiment: one run per language code in "lang", everything else fixed.
params = dict(
    n_shot=10,
    lang=['fr', 'en'],
    n_iter=100,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    loss=CosineSimilarityLoss,
)

results, train_times, eval_times = language_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/language')

### Embedding Model

In [None]:
# Embedding-model experiment: compare the sentence-embedding checkpoints listed in
# "model" (display name -> Hugging Face model id) with the other parameters fixed.
# Commented-out entries are candidates that are currently excluded from the run.
params = {
    "n_shot": 10,
    "n_iter": 100,
    "loss": CosineSimilarityLoss,
    "model": {
        # "instructor-large":"hkunlp/instructor-large",
        "GIST-small-Embedding-v0": "avsolatorio/GIST-small-Embedding-v0",
        "gte-tiny": "TaylorAI/gte-tiny",
        # "all-mpnet-base-v2-table":"deepset/all-mpnet-base-v2-table",
        "paraphrase-mpnet-base-v2": "sentence-transformers/paraphrase-mpnet-base-v2",
        # "all-mpnet-base-v2":"sentence-transformers/all-mpnet-base-v2",
    }
}

# The timings must be bound to `train_times` / `eval_times`, the names handed to
# save_to_json below. The previous version unpacked them into differently named
# variables, so the timings saved here were stale values left over from another cell
# (or a NameError on a fresh kernel).
results, train_times, eval_times = model_tests(params, train_set, test_set, few_shot_model_f1_function=setfit_f1_score)

save_to_json(results, train_times, eval_times, params,  r'../results/setfit/model')

### Number of epochs

In [None]:
# Epoch experiment: one run per tuple in "num_epochs" (the meaning of each tuple
# element is defined by num_epochs_tests), with everything else held constant.
#
# Other sweeps tried previously, kept for reference:
#   [(1,1),(2,1),(4,1),(8,1),(16,1),(32,1),(64,1)]
#   [(1,1),(1,2),(1,4),(1,8),(1,12),(1,16),(1,20),(1,25),(1,30)]
params = dict(
    n_shot=10,
    n_iter=100,
    loss=CosineSimilarityLoss,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    num_epochs=[(8, 1), (8, 2), (8, 4), (8, 8), (8, 10), (8, 20), (8, 30), (8, 40)],
)

results, train_times, eval_times = num_epochs_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)

save_to_json(results, train_times, eval_times, params, r'../results/setfit/num_epochs')

### Data sampling

Run multiple tests with different training sets but the same parameters

In [None]:
# Disabled cell, kept commented out for reference.
# NOTE(review): as originally written it unpacked two values from constant_params_tests
# and called save_to_json with four arguments, whereas the active cells of this notebook
# unpack three values and pass five. The commented code below has been aligned with that
# usage — confirm against benchmark.tests / benchmark.utility before re-enabling.
# params = {
# 	"n_shot": 10,
# 	"n_iter": 100,
# 	"loss": CosineSimilarityLoss,
# 	"model": "sentence-transformers/paraphrase-mpnet-base-v2",
# 	"input_length_range":[0,9],
# }

# results, train_times, eval_times = constant_params_tests(params, train_set, test_set, few_shot_model_f1_function=setfit_f1_score)

# save_to_json(results, train_times, eval_times, params,  r'../results/setfit/data_sampling')

### Data Augmentation

The strategies compared below are `swapping_inter`, `back_translation`, `synonym_replacement` and `crossover`, with `none` (no augmentation) as the reference; other techniques could be tried as well.

In [None]:
# Data-augmentation experiment: one run per strategy name listed below, with the
# augmentation ratio and the strategy-specific settings shared by all runs.
augmentation_strategies = ["none", "swapping_inter", "back_translation", "synonym_replacement", "crossover"]

params = dict(
    n_shot=10,
    n_iter=100,
    loss=CosineSimilarityLoss,
    model="sentence-transformers/paraphrase-mpnet-base-v2",
    data_augmentation_ratio=1.3,  # + 30 %
    data_augmentation_strategy=augmentation_strategies,
    strategy_params=dict(
        n_points_crossover=2,
        modification_rate=0.5,
    ),
)

results, train_times, eval_times = data_augmentation_tests(
    params,
    train_set,
    test_set,
    few_shot_model_f1_function=setfit_f1_score,
)
save_to_json(results, train_times, eval_times, params, r'../results/setfit/data_augmentation')

### Dataset label selection

Here, instead of treating NaN, 0, 1 and 2 as "not an AI experience" and 3 and 4 as "AI experience", we build two stricter training sets:

- likely labels: not AI = 0 or 1, AI = 3 or 4 (examples labelled NaN or 2 are dropped)
- sure labels: not AI = 0, AI = 4 (examples labelled NaN, 1, 2 or 3 are dropped)

In [None]:
# "Likely" labels: discard rows labelled 2 (and any row containing NaN), then
# binarize the rest: labels below 3 become 0, the others become 1.
subset_label_transform_likely_labels = (
    subset.loc[~subset["label"].isin([2])]
    .dropna()
    .assign(label=lambda frame: np.where(frame["label"] < 3, 0, 1))
)

# "Sure" labels: discard rows labelled 1, 2 or 3 (and any row containing NaN), then
# binarize the rest: label 0 stays 0, everything else becomes 1.
subset_label_transform_sure_labels = (
    subset.loc[~subset["label"].isin([1, 2, 3])]
    .dropna()
    .assign(label=lambda frame: np.where(frame["label"] == 0, 0, 1))
)

# Only the training halves of these new splits are used; evaluation keeps the full
# test set produced by the first split.
# NOTE(review): these training sets come from fresh splits of differently filtered
# frames, so unless split_dataset guarantees otherwise, rows of `test_set` may also
# end up in them (train/test leakage) — confirm.
train_set_likely_labels, _ = split_dataset(subset_label_transform_likely_labels, 0.2)
train_set_sure_labels, _ = split_dataset(subset_label_transform_sure_labels, 0.2)

In [None]:
# Training-set comparison: train with identical parameters on three training sets
# (all labels, "likely" labels, "sure" labels) and evaluate each one on the same test set.
params = {
    "n_shot": 10,
    "n_iter": 100,
    "loss": CosineSimilarityLoss,
    "model": "sentence-transformers/paraphrase-mpnet-base-v2",
}

tested_training_sets = {
    "all_labels": train_set,
    "likely_labels": train_set_likely_labels,
    "sure_labels": train_set_sure_labels,
}

results = {}
train_times = {}
eval_times = {}
progress_end = len(tested_training_sets)

# enumerate() drives the progress counter. It was previously initialised to 0 and
# never incremented, so every iteration printed the same "0 / N" message.
for progress, (training_set_key, training_set_data) in enumerate(tested_training_sets.items(), start=1):
    print("Test: ", progress, "/", progress_end)
    temp_results, temp_train_times, temp_eval_times = constant_params_tests(params, training_set_data, test_set, setfit_f1_score)
    # constant_params_tests stores its output under the "all" key; re-key it by training-set name.
    results[training_set_key] = temp_results["all"]
    train_times[training_set_key] = temp_train_times["all"]
    eval_times[training_set_key] = temp_eval_times["all"]

# Record which training sets were compared alongside the parameters in the saved JSON.
params["training_set"] = list(tested_training_sets.keys())
save_to_json(results, train_times, eval_times, params,  r'../results/setfit/training_set_labels_restriction')