Install the sentence-transformers library

In [None]:
!pip install -U sentence-transformers

Import all dependencies

In [None]:
import pickle

import boto3
import torch
from huggingface_hub import notebook_login
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from torch import device, cuda
from torch.utils.data import DataLoader

Log in to your Hugging Face account so the fine-tuned model can be pushed later

In [None]:
# Prompts for a Hugging Face access token inside the notebook. The credentials
# are needed by the `push_to_hub` call at the end of this notebook.
notebook_login()

If the data is stored in an S3 bucket, read the triplets data from that bucket

In [None]:
# S3 client and the bucket that holds the pickled triplet files.
s3 = boto3.client('s3')
bucket = 'sagemaker-studio-113002098422-fuoot0q3jmu'

# Object keys of the pickled triplets, one file per project.
eclipse_triplets_file_key = 'thesis/splits/complete_triplets/complete_eclipse_triplets.pkl'
firefox_triplets_file_key = 'thesis/splits/complete_triplets/complete_firefox_triplets.pkl'
netbeans_triplets_file_key = 'thesis/splits/complete_triplets/complete_netbeans_triplets.pkl'
openoffice_triplets_file_key = 'thesis/splits/complete_triplets/complete_openoffice_triplets.pkl'

# Request every object in the same order as before; the 'Body' entry of each
# response is what gets unpickled in the following cell.
(
    eclipse_triplets_obj,
    firefox_triplets_obj,
    netbeans_triplets_obj,
    openoffice_triplets_obj,
) = (
    s3.get_object(Bucket=bucket, Key=file_key)
    for file_key in (
        eclipse_triplets_file_key,
        firefox_triplets_file_key,
        netbeans_triplets_file_key,
        openoffice_triplets_file_key,
    )
)


In [None]:
# Unpickle each downloaded object directly from the 'Body' of its S3 response.
# NOTE: pickle can execute arbitrary code while loading, so only read files
# from a bucket you trust.
(
    eclipse_triplets,
    firefox_triplets,
    netbeans_triplets,
    openoffice_triplets,
) = (
    pickle.load(s3_response['Body'])
    for s3_response in (
        eclipse_triplets_obj,
        firefox_triplets_obj,
        netbeans_triplets_obj,
        openoffice_triplets_obj,
    )
)

If the triplets data is stored locally, simply read the data

In [None]:
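# pickle.load expects an open binary file object, not a path string:
#
# with open('eclipse_triplets_file_path', 'rb') as f:
#     eclipse_triplets = pickle.load(f)
# with open('firefox_triplets_file_path', 'rb') as f:
#     firefox_triplets = pickle.load(f)
# with open('netbeans_triplets_file_path', 'rb') as f:
#     netbeans_triplets = pickle.load(f)
# with open('openoffice_triplets_file_path', 'rb') as f:
#     openoffice_triplets = pickle.load(f)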

Combine the triplets from all projects into a single list

In [None]:
# Per-project triplet collections, in the order they are concatenated.
triplets_datasets = [eclipse_triplets, firefox_triplets, netbeans_triplets, openoffice_triplets]

# Flatten the per-project collections into one list of training triplets.
triplets = [
    triplet
    for triplet_dataset in triplets_datasets
    for triplet in triplet_dataset
]

Import model to be fine-tuned and send it to the GPU (if available)

In [None]:
# Name of the pretrained checkpoint used as the starting point for fine-tuning.
model_name = "all-mpnet-base-v2"
# Load the checkpoint by name.
# NOTE(review): presumably downloaded from the Hugging Face Hub when it is not cached locally — confirm.
model = SentenceTransformer(model_name)

In [None]:
# Select the training device: the GPU when CUDA is available, otherwise the CPU.
# The class is referenced as `torch.device` because this assignment rebinds the
# bare name `device` (imported from torch) to the resulting instance; calling the
# bare name would raise a TypeError the second time this cell is run.
device = torch.device('cuda' if cuda.is_available() else 'cpu')
device

In [None]:
# Move the model to the device selected above; the value returned by `.to`
# is the last expression of the cell and is therefore displayed as its output.
model.to(device)

Set hyperparameters and fine-tune the model

In [None]:
# Wrap the combined triplets in a sentence-transformers dataset tied to `model`.
train_dataset = SentencesDataset(triplets, model)

# Serve the dataset in shuffled mini-batches of 16 examples.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Triplet objective bound to `model`, with the margin set to 5.
# NOTE(review): assumes each training example holds its texts in
# (anchor, positive, negative) order — confirm against how the triplets were built.
train_loss = losses.TripletLoss(model=model, triplet_margin=5)

In [None]:
# Number of passes over the training data.
num_epochs = 3

# Share of all training steps (batches per epoch * epochs) spent warming up.
warmup_fraction = 0.15

# The factors are multiplied left to right, exactly as before, so the
# truncated integer result is unchanged.
warm_up_steps = int(warmup_fraction * len(train_dataloader) * num_epochs)
warm_up_steps

In [None]:
# Learning rate handed to the optimizer through `optimizer_params`.
learning_rate = 2e-7

# Single training objective: batches from the dataloader scored with the triplet loss.
train_objective = (train_dataloader, train_loss)

# Fine-tune the model with the hyperparameters defined in the cells above.
model.fit(
    train_objectives=[train_objective],
    epochs=num_epochs,
    warmup_steps=warm_up_steps,
    optimizer_params={"lr": learning_rate},
)

Push the fine-tuned model to a Hugging Face repository

In [None]:
# Placeholder: replace with the target repository before running this cell.
model_destination = "<your_huggingface_repository>/<model_name>"
# Upload the fine-tuned model to that repository.
# NOTE(review): presumably relies on the credentials stored by `notebook_login()` earlier in the notebook — confirm.
model.push_to_hub(model_destination)