# Install and import dependencies

Installs dependencies

In [None]:
!pip install accelerate bitsandbytes datasets loralib
!pip install sentence-transformers transformers[torch] tqdm s3fs

Imports necessary libraries

In [None]:
from datasets import Dataset
from huggingface_hub import notebook_login
from peft import get_peft_model, LoraConfig, PeftConfig, PeftModel
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import ContrastiveLoss, CosineSimilarityLoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from tqdm import tqdm
from zipfile import ZipFile
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import s3fs

# Create Global Variables

In [None]:
# S3 location of the zipped CSV with the original recipes; read with read_csv_from_zip_s3.
ORIGINAL_DATASET_S3_PATH = "s3://path-to-original-dataset-zip-file"
# S3 locations of the Parquet result files; each entry is read with read_parquet_from_s3.
RESULTS_S3_PATHS = [
    "s3://path-to-parquet-files-with-results-1",
    "s3://path-to-parquet-files-with-results-2"
]
# Local path the Hugging Face dataset is saved to and later loaded from.
DATASET_LOCAL_DISK_PATH = "dataset.hf"
# Identifier of the pretrained model used for the tokenizer and the base SentenceTransformer.
BASE_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
# Hugging Face Hub repository the fine-tuned model is pushed to and loaded from.
HF_MODEL_ID = "carlosalvarezg/all-mpnet-base-v2"

# Create Helper Functions

In [None]:
def read_csv_from_zip_s3(zip_file_path: str) -> pd.DataFrame:
    """
    Read a CSV file from a zip file stored in S3.

    Args:
    zip_file_path (str): Full S3 path to the zip file (e.g., 's3://bucket/path/file.zip')

    Returns:
    pandas.DataFrame: The contents of the CSV file, or None if anything goes
    wrong while opening or parsing it (the error is printed, not raised).
    """
    # Initialize S3 filesystem
    fs = s3fs.S3FileSystem()

    try:
        # Open the archive as a binary stream and let pandas decompress it.
        with fs.open(zip_file_path, 'rb') as zip_file:
            return pd.read_csv(zip_file, compression="zip")

    except Exception as e:
        # Best-effort read: report the problem and let the caller check for None.
        print(f"Error reading CSV from zip in S3: {str(e)}")
        return None

In [None]:
def read_parquet_from_s3(s3_path: str) -> pd.DataFrame:
    """
    Read a Parquet file from S3.

    Args:
    s3_path (str): Full S3 path to the Parquet file (e.g., 's3://bucket/path/file.parquet')

    Returns:
    pandas.DataFrame: The contents of the Parquet file, or None if reading
    fails (the error is printed, not raised).
    """
    # Initialize S3 filesystem
    fs = s3fs.S3FileSystem()

    try:
        # Read the Parquet file directly using s3fs
        df = pd.read_parquet(s3_path, filesystem=fs)

        print(f"Successfully read the Parquet file. Shape: {df.shape}")
        return df

    except Exception as e:
        # Best-effort read: report the problem and let the caller check for None.
        print(f"Error reading Parquet file from S3: {str(e)}")
        return None

# Create Unified Dataset

Uses the original recipes_data.csv.zip file and the dataset containing pairs of similar and different indices to create a unified dataset of similar and different titles and ingredients. The first column contains recipe titles, the second column contains the titles and ingredients of similar and different recipes, and the third column contains a label, which is 1 if the recipes in the current row are similar and 0 if they are different.

Read original recipes_data.csv.zip file from S3

In [None]:
# read_csv_from_zip_s3 prints the error and returns None on failure, so all_data is None if the read did not succeed.
all_data = read_csv_from_zip_s3(ORIGINAL_DATASET_S3_PATH)

Read results from PySpark script

In [None]:
# One entry per path in RESULTS_S3_PATHS, in the same order; an entry is None if that file could not be read.
samples = [read_parquet_from_s3(s3_path) for s3_path in RESULTS_S3_PATHS]

PySpark and Pandas process strings differently. This sometimes results in titles not being parsed correctly when they're written by PySpark into Parquet files and then read by Pandas. Therefore, we go through all the recipe titles in both the original dataset and the results and find any recipes in the results that are not in the original dataset. Then we add their indices to indices_to_remove. This usually filters out around 200 indices (out of 2.23 million).

In [None]:
def _normalize_title(title):
    """Remove double quotes and lowercase a title so both data sources compare equally."""
    return title.replace('"', "").lower()


# Number of still-unmatched occurrences of every normalized title in the original dataset.
title_counts = {}
for title in tqdm(all_data["title"], total=len(all_data)):
    key = _normalize_title(title)
    title_counts[key] = title_counts.get(key, 0) + 1

# indices_to_remove[i] holds the row positions of samples[i] whose title has no
# remaining counterpart in the original dataset.
indices_to_remove = [set() for _ in range(len(samples))]
for index1, sample in tqdm(enumerate(samples), total=len(samples)):
    for index2, title in enumerate(sample["title"]):
        key = _normalize_title(title)
        remaining = title_counts.get(key, 0)
        if remaining > 0:
            # Consume one occurrence; drop exhausted titles so the dict shrinks as we go.
            if remaining == 1:
                title_counts.pop(key)
            else:
                title_counts[key] = remaining - 1
        else:
            # Flag every unmatched occurrence. The earlier version stored a -1
            # sentinel here, which made later repeats of the same unknown title
            # look "known" and skip this branch.
            indices_to_remove[index1].add(index2)

In [None]:
# The counting dictionary is not read by any later cell shown here; drop the reference so it can be reclaimed.
del title_counts

Create a mapping from every title to every corresponding list of ingredients

In [None]:
# Maps each (un-normalized) title to the list of ingredient strings of all recipes with that title.
title_to_ingredients = {}
# Iterate over the two needed columns directly: DataFrame.iterrows() builds a
# Series object for every row, which is far slower on a dataset of this size.
for title, raw_ingredients in tqdm(
    zip(all_data["title"], all_data["ingredients"]), total=len(all_data)
):
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # If the dataset is not fully trusted, parse it with ast.literal_eval instead.
    ingredients = " ".join(eval(raw_ingredients))
    title_to_ingredients.setdefault(title, []).append(ingredients)

In [None]:
# The original DataFrame is not read by any later cell shown here; drop the reference so its memory can be reclaimed.
del all_data

Create two lists containing recipe tuples. Each tuple has two components: a string containing a recipe title, and another string containing a second recipe title followed by the ingredients used in that recipe. similar_title_pairs contains tuples whose recipes are similar to each other, and different_title_pairs contains tuples whose recipes are different from each other.

In [None]:
# Builds (anchor title, "other title + other ingredients") tuples for every
# sample: similar pairs go to similar_title_pairs, different ones to
# different_title_pairs. Rows listed in bad_indices are skipped both as
# anchors and as link targets.
similar_title_pairs = []
different_title_pairs = []
for bad_indices, sample in tqdm(zip(indices_to_remove, samples), total=len(samples)):
    titles = sample["title"].to_list()
    # Ingredient string for each row position; stays "" for rows without a match.
    ingredients = [""]*len(sample)
    for index, title in enumerate(titles):
        if title in title_to_ingredients:
            # Each stored ingredient string is consumed once, so recipes that
            # share a title each receive a different entry.
            ingredients[index] = title_to_ingredients[title].pop()
            if len(title_to_ingredients[title]) == 0:
                title_to_ingredients.pop(title)
        else:
            # bad_indices is the same set object stored in indices_to_remove,
            # so this also extends that list's entry.
            bad_indices.add(index)
    # NOTE(review): iterrows() yields index labels, which are used below as
    # positions into `titles`; this assumes the default 0..n-1 index — confirm.
    for index, row in sample.iterrows():
        if index in bad_indices:
            continue
        sim_and_diff_links_str = row["similar_and_different_links"]
        cur_title = titles[index]
        # Skips a 7-character prefix and reads up to and including the first "]".
        # Presumably the string looks like "(array([...]), array([...]))" — TODO confirm.
        # NOTE(review): eval() runs arbitrary expressions; only safe on trusted data.
        similar_links = eval(sim_and_diff_links_str[7:sim_and_diff_links_str.index("]", 1) + 1])
        cur_sim_pairs = [(cur_title, titles[sim_index] + " " + ingredients[sim_index]) for sim_index in similar_links if sim_index not in bad_indices]
        similar_title_pairs.extend(cur_sim_pairs)
        # Starts 8 characters after the first ", a" and drops the last two characters,
        # leaving the second bracketed list.
        different_links = eval(sim_and_diff_links_str[sim_and_diff_links_str.index(", a") + 8:-2])
        cur_diff_pairs = [(cur_title, titles[diff_index] + " " + ingredients[diff_index]) for diff_index in different_links if diff_index not in bad_indices]
        different_title_pairs.extend(cur_diff_pairs)

In [None]:
# These intermediates are not read by any later cell shown here; drop the references so they can be reclaimed.
del samples
del title_to_ingredients
del indices_to_remove

Create a Hugging Face Dataset using the data from similar_title_pairs and different_title_pairs

In [None]:
# Record how many similar / different pairs there are before the lists are dropped.
sim_length = len(similar_title_pairs)
diff_length = len(different_title_pairs)

# First tuple element of every pair, similar pairs first.
anchors = [first for first, second in similar_title_pairs]
anchors += [first for first, second in different_title_pairs]

# Second tuple element of every pair, in the same order as `anchors`.
pos_neg = [second for first, second in similar_title_pairs]
pos_neg += [second for first, second in different_title_pairs]

del similar_title_pairs, different_title_pairs

In [None]:
# Column-oriented data for the dataset; labels are 1 for the leading similar
# pairs and 0 for the different pairs that follow them.
dataset_dict = {}
dataset_dict["anchor"] = anchors
dataset_dict["positive/negative"] = pos_neg
dataset_dict["label"] = [1] * sim_length + [0] * diff_length
del anchors, pos_neg

In [None]:
# Build the Hugging Face Dataset from the column dictionary.
dataset = Dataset.from_dict(dataset_dict)
# The dictionary is not read again once the Dataset exists.
del dataset_dict

Save dataset to disk so it can be read later

In [None]:
# Write the dataset to DATASET_LOCAL_DISK_PATH; the training section loads it from the same path.
dataset.save_to_disk(DATASET_LOCAL_DISK_PATH)

# Prepare Training Variables
Creates all the variables necessary to train the model

Reads dataset from disk

In [None]:
# Load the dataset from DATASET_LOCAL_DISK_PATH, the path the dataset-creation section saves to.
dataset = Dataset.load_from_disk(DATASET_LOCAL_DISK_PATH)

If you are using CosineSimilarityLoss, you should only use positive examples. Therefore, we remove all examples where label == 0. We also rename the first two columns to sentence_A and sentence_B. If you are using ContrastiveLoss, these changes aren't necessary.

In [None]:
# Keep only the positive pairs (label == 1), then rename the two text columns
# to sentence_A / sentence_B.
dataset = (
    dataset
    .filter(lambda record: record["label"] == 1)
    .rename_columns({"anchor": "sentence_A", "positive/negative": "sentence_B"})
)

Split dataset into train data, validation data, and test data, which contain 49%, 21%, and 30% of the data, respectively

In [None]:
# Fixed seed so the train / validation / test membership is identical on every
# run; without it each run shuffles differently and results are not comparable.
SPLIT_SEED = 42

# First hold out 30% as the test set, then split the remaining 70% into
# 70% train / 30% validation (49% / 21% of the full dataset).
trainvalid_test = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_valid = trainvalid_test['train'].train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_dataset = train_valid['train']
valid_dataset = train_valid['test']
test_dataset = trainvalid_test['test']

In [None]:
# Show the three splits, one per line.
print(train_dataset, valid_dataset, test_dataset, sep="\n")

Gets tokenizer

In [None]:
# Tokenizer belonging to the base model; it is handed to the trainer further down.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

Creates the model and modifies it to use [LoRA](https://arxiv.org/abs/2106.09685). We do this by freezing the parameters in the original model, and adding new, low-rank matrices to a few of the weight layers in the model. We then fine-tune the parameters in these new low-rank matrices instead of the parameters in the original model.

In [None]:
# Load the base model onto the GPU.
# NOTE(review): the device is hard-coded to "cuda"; presumably this fails on a machine without a CUDA GPU — confirm.
model = SentenceTransformer(BASE_MODEL_ID, device="cuda")

In [None]:
# Freeze every existing weight; one-dimensional parameters are additionally
# converted to float32.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim == 1:
        weight.data = weight.data.to(torch.float32)

# Wrap the model with LoRA adapters on the modules matched by "dense".
config = LoraConfig(target_modules=["dense"])
model = get_peft_model(model, config)

Create a loss function. This can be either ContrastiveLoss or CosineSimilarityLoss

In [None]:
# CosineSimilarityLoss is used here; ContrastiveLoss is imported as the alternative described in the text above.
loss = CosineSimilarityLoss(model=model)

Login to Hugging Face using a token with write access. This allows us to push the model to Hugging Face Hub

In [None]:
# Log in to the Hugging Face Hub (token with write access, per the text above) before any push_to_hub call.
notebook_login()

# Create Trainer and Train Model

Creates training arguments and trainer for the model

In [None]:
# Training configuration: 200k optimizer steps with batches of 48, FP16 enabled,
# evaluation and logging every 20k steps, and pushing to HF_MODEL_ID enabled.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="all-mpnet-base-v2",
    # Optional training parameters:
    max_steps=200_000,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    fp16=True,  # Set to False if your GPU can't handle FP16
    bf16=False,  # Set to True if your GPU supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=20_000,
    prediction_loss_only=True,
    # NOTE(review): with save_strategy="no", the save_steps, save_total_limit and
    # hub_strategy="every_save" settings below presumably have no effect during
    # training — confirm whether intermediate checkpoints were meant to be saved/pushed.
    save_strategy="no",
    save_steps=20_000,
    logging_steps=20_000,
    logging_strategy="steps",
    logging_first_step=True,
    learning_rate=1e-6,
    save_total_limit=10,
    report_to=["none"],
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=HF_MODEL_ID,
    hub_private_repo=False,
)

In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    # Evaluate on a 1% subsample of the validation split to keep evaluation
    # cheap. The seed pins which rows are picked, so evaluation losses from
    # different runs are measured on the same examples.
    eval_dataset=valid_dataset.train_test_split(test_size=0.01, seed=42)["test"],
    tokenizer=tokenizer,
    loss=loss,
)

Trains model

In [None]:
# Run training with the arguments configured above (max_steps=200_000).
trainer.train()

Push final model to Hugging Face Hub

In [None]:
# Upload the trained model to the Hub repository named by HF_MODEL_ID.
model.push_to_hub(HF_MODEL_ID)

# Loading model example

Load model from Hugging Face Hub

In [None]:
# Recreate the base model on the GPU, then load the weights stored under
# HF_MODEL_ID on top of it.
base_model = SentenceTransformer(BASE_MODEL_ID, device="cuda")
model = PeftModel.from_pretrained(base_model, HF_MODEL_ID)