In [None]:
# environment setup
# use an RTX 4090 for fine-tuning models and an A100-80GB for ICL with LLaMA 3-70B models
!pip install transformers datasets accelerate scikit-learn evaluate wandb bitsandbytes peft --quiet
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation --quiet

## Finetuning LLMs

### Data curation and splitting 
Please skip this part if you want to use the split I already made, which is stored in the parquet files under the data folder.

In [None]:
# read raw data

import pandas as pd
import json


# Parse the raw JSON export; the records live under its top-level "data" key.
with open("data.json", "r") as raw_file:
    payload = json.load(raw_file)

data = payload["data"]
df = pd.DataFrame(data)

# Preview the first rows of the raw frame.
df.head()

In [None]:
# Rows per category, to see how the labels are distributed before splitting.
df["category"].value_counts()

In [None]:
# The same category appears under two spellings ("Off-topic" and "Off-Topic");
# fold the lowercase-t variant into "Off-Topic".
category_normalized = df["category"].str.replace("Off-topic", "Off-Topic")
df["category"] = category_normalized

# Re-check the counts now that the two spellings are merged.
df["category"].value_counts()

In [None]:
# given labels are not equally distributed, split by category with stratify

from sklearn.model_selection import train_test_split


# Both splits share the same size, seed and shuffling; only the input frame
# and its stratification column differ.
split_options = {"test_size": 0.15, "random_state": 42, "shuffle": True}

# First hold out 15% of all rows as the test set ...
train_and_val, test = train_test_split(
    df, stratify=df["category"], **split_options
)
# ... then hold out 15% of what is left as the validation set.
train, val = train_test_split(
    train_and_val, stratify=train_and_val["category"], **split_options
)

train.head()

In [None]:
# reset index and save to parquet

def _with_prompt_columns(split_df):
    """Return ``split_df`` with a fresh 0..n-1 index and prompt-style text columns.

    "question" values are prefixed with "Question: ", "answer" values with
    "Answer: ", and a new "combined" column holds both joined by a newline.
    """
    prepared = split_df.reset_index(drop=True)
    prepared["question"] = prepared["question"].apply(lambda x: f"Question: {x}")
    prepared["answer"] = prepared["answer"].apply(lambda x: f"Answer: {x}")
    prepared["combined"] = prepared["question"] + "\n" + prepared["answer"]
    return prepared


# NOTE: re-running this cell on already-prepared frames adds the prefixes a second time.
train = _with_prompt_columns(train)
val = _with_prompt_columns(val)
test = _with_prompt_columns(test)

# Persist the three splits so later sections can start from the parquet files.
train.to_parquet("train.parquet")
val.to_parquet("val.parquet")
test.to_parquet("test.parquet")

### Data Loading and tokenization for finetuning or ICL

In [None]:
# read train and val from parquet

import pandas as pd


# HF datasets expects the target column to be called "label", so rename
# "category" right after loading each split.
label_column_rename = {"category": "label"}

train = pd.read_parquet("train.parquet").rename(columns=label_column_rename)
val = pd.read_parquet("val.parquet").rename(columns=label_column_rename)
test = pd.read_parquet("test.parquet").rename(columns=label_column_rename)

# Label distribution of the training split.
train["label"].value_counts()

In [None]:
# get nan values for train and val and test

# Per-column count of missing values, printed for train, val and test in that order.
for split_frame in (train, val, test):
    print(split_frame.isna().sum())

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# casting to class labels and loading to HF datasets

from datasets import Dataset


# The training split defines the label vocabulary: class_encode_column turns the
# string labels into a ClassLabel feature, which val and test are then cast to
# so that all three splits share the same label <-> id mapping.
train_dataset = Dataset.from_pandas(train, split="train")
train_dataset = train_dataset.class_encode_column("label")
class_label_feature = train_dataset.features["label"]


def _to_labeled_dataset(frame, split_name):
    """Wrap ``frame`` as a Dataset whose "label" column uses the training ClassLabel."""
    dataset = Dataset.from_pandas(frame, split=split_name)
    return dataset.cast_column("label", class_label_feature)


val_dataset = _to_labeled_dataset(val, "val")
test_dataset = _to_labeled_dataset(test, "test")

In [None]:
# get label2id and id2label for the model
# Build the label <-> id maps from the ClassLabel's public `names` list (a name's
# position is its class id) instead of reaching into the private `_str2int`.
label_names = train_dataset.features["label"].names
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

id2label

In [None]:
# used only for finetuning with weighted crossentropy

import torch

# Map every training label to its class id, then measure each class's share.
label_id_df = train["label"].map(label2id)
class_frequencies = label_id_df.value_counts(normalize=True).sort_index()

# Inverse-frequency weights, ordered by class id and rescaled to sum to 1.
inverse_frequencies = (1 / class_frequencies).tolist()
class_weights = torch.tensor(inverse_frequencies)
class_weights = class_weights / class_weights.sum()
class_weights

### Model loading 

In [None]:
# define the model and training config and wandb config

# Weights & Biases settings, exported as environment variables through %env.
%env WANDB_WATCH=all
%env WANDB_SILENT=true
#%env WANDB_LOG_MODEL=end
%env WANDB_LOG_MODEL=false
%env WANDB_PROJECT=kapa_question_type_classification

# Run configuration read by the tokenizer, model and TrainingArguments cells.
config = {
    "model_name": "meta-llama/Meta-Llama-3-8B",
    "input_type": "question",   # dataset column to tokenize: "question", or "combined" (question + answer)
    "version": "1",    # run number; part of the output directory and the W&B run name
    "batch_size": 16,  # per-device batch size for both training and evaluation
    "train_epochs": 100,  # upper bound; training also uses an early-stopping callback
    "num_workers": 8,  # dataloader worker count
    "lr": 1e-4,  # learning rate
    #"dropout": 0.3
}





In [None]:
# tokenizer and preprocessing


from transformers import AutoTokenizer


# Llama checkpoints get an extra tokenizer option and need a pad token assigned
# (the EOS token is reused); every other model uses its tokenizer defaults.
# NOTE(review): "HF_token" looks like a placeholder for a real access token - confirm.
is_llama = "llama" in config["model_name"]

tokenizer_kwargs = {"token": "HF_token"}
if is_llama:
    tokenizer_kwargs["add_prefix_space"] = True

tokenizer = AutoTokenizer.from_pretrained(config["model_name"], **tokenizer_kwargs)

if is_llama:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    print("llama tokenizer")


def preprocess_function(examples):
    """Tokenize one batch of examples.

    The text column is chosen by ``config["input_type"]``; truncation is enabled
    and no padding is applied here.
    """
    text_column = config["input_type"]
    texts = examples[text_column]
    return tokenizer(texts, truncation=True)


# Tokenize the three splits in batched mode, in train / val / test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# define data collator with dynamic padding

from transformers import DataCollatorWithPadding

# Collator that pads each batch with the tokenizer loaded above; it is passed to the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# define metrics: f1, precision, recall (weighted mode) and f1 per label


import evaluate
import numpy as np


def compute_metrics(eval_preds):
    """Compute weighted F1/precision/recall plus one F1 score per class.

    Parameters:
    - eval_preds: a ``(logits, labels)`` pair. The predicted class of each row
      is the argmax of its logits over the last axis.

    Returns:
    - dict with "f1", "precision" and "recall" (weighted averages) and
      "f1_per_label", a dict mapping every label name in ``id2label`` to its F1.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    metric = evaluate.combine(["f1", "precision", "recall"])
    metrics = metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )

    # F1 score per label. The class ids are passed explicitly: without `labels=`
    # the returned array only covers ids that occur in the predictions or the
    # references, so one missing class would shift every later score onto the
    # wrong label name.
    class_ids = sorted(id2label)
    f1_metric = evaluate.load("f1")
    f1_per_label = f1_metric.compute(
        predictions=predictions,
        references=labels,
        average=None,
        labels=class_ids,
    )
    f1_per_label_dict = {
        id2label[class_id]: float(score)
        for class_id, score in zip(class_ids, f1_per_label["f1"])
    }

    return {
        "f1": metrics["f1"],
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1_per_label": f1_per_label_dict,
    }

In [None]:
# define the model

from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
import os
import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Size the classification head from the label mapping rather than a hard-coded 6,
# so the cell keeps working if the label set changes.
num_labels = len(id2label)

if "llama" in config["model_name"]:
    # Llama is loaded 4-bit quantized (NF4, double quantization, bfloat16 compute).
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        token="HF_token",
        quantization_config=quantization_config,
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    print("llama model")


else:
    # Any other checkpoint is loaded with its defaults, without quantization.
    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        token="HF_token",
    )

In [None]:
# define the lora config and use this cell only if LLAMA 3-8B is used

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter settings for sequence classification.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # wider alternative: ["q_proj", "k_proj", "v_proj", "o_proj"]
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

# Order matters: prepare the quantized model for k-bit training first, then wrap it with LoRA.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Keep the model config's pad token id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# model.config.use_cache = False
# model.config.pretraining_tp = 1

In [None]:
# use this cell only if dropout to be changed for Bert-like models
# import torch.nn as nn

# model.dropout = nn.Dropout(config["dropout"])

In [None]:
def count_trainable_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad=True`` are counted. The result is a
    plain Python int (0 when nothing is trainable).
    """
    # numel() is the tensor's own element count, so no numpy round-trip is needed.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# Show how many parameters of the current model will receive gradients.
count_trainable_parameters(model)

In [None]:
# training arguments

from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

# Stop training once the monitored metric has not improved for 4 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

# Evaluate, save and log once per epoch; the best checkpoint by weighted F1 is
# reloaded at the end of training.
# Only `eval_strategy` is passed: it is the current name of the deprecated
# `evaluation_strategy`, and supplying both only works on the transformers
# releases that still accept the old name alongside the new one.
args = TrainingArguments(
    f"{config['model_name']}-v-{config['version']}",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=config["lr"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    num_train_epochs=config["train_epochs"],
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    # fp16=True,
    # fp16_full_eval=True,
    bf16_full_eval=True,
    bf16=True,
    save_total_limit=1,
    gradient_checkpointing=True,
    optim="adamw_torch",
    report_to="wandb",
    lr_scheduler_type="cosine",
    max_grad_norm=0.3,
    warmup_ratio=0.01,
    logging_strategy="epoch",
    run_name=f"{config['model_name']}-v-{config['version']}",
    dataloader_num_workers=config["num_workers"],
    # gradient_accumulation_steps=2,
)

In [None]:
# define the trainer that performs the training with weighted crossentropy
# use only for finetuning with weighted crossentropy

from torch.nn import functional as F


class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Parameters:
    - class_weights: optional sequence or tensor with one weight per class id.
      When omitted, the loss is plain unweighted cross-entropy.
    All other arguments are forwarded to ``Trainer`` unchanged.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor accepts lists and existing tensors alike, without the
            # copy-construct warning torch.tensor() emits for tensor input.
            self.class_weights = torch.as_tensor(
                class_weights, dtype=torch.float32
            ).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """Run the model on ``inputs`` and return the (optionally weighted) loss.

        ``num_items_in_batch`` is accepted for Trainer versions that pass it and
        is not used. Returns ``(loss, outputs)`` when ``return_outputs`` is true,
        otherwise just ``loss``.
        """
        # DataCollatorWithPadding renames "label" to "labels" when it builds the
        # batch, so prefer that key and only fall back to the raw column name.
        label_key = "labels" if "labels" in inputs else "label"
        labels = inputs.pop(label_key).long()

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Put labels and weights where the logits are (the model may be spread
        # over devices by device_map="auto") and match the logits' dtype.
        labels = labels.to(logits.device)
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = F.cross_entropy(logits, labels, weight=weight)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Default trainer: unweighted loss, evaluated on the validation split, with the
# early-stopping callback and the padding collator.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=data_collator,
)
# Alternative: uncomment to train with class-weighted cross-entropy via CustomTrainer.
# trainer = CustomTrainer(
#     model,
#     args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_val,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks=[early_stopping],
#     data_collator=data_collator,
#     class_weights=class_weights,
# )

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Record the run configuration in W&B and close the run.
# (No model is saved in this cell.)
import wandb

wandb.config.update(config)
wandb.finish()

## Zero/Few Shot In-Context Learning (ICL)

In [None]:
# define the pipeline with LLaMA 3-70B

import transformers
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer


# 4-bit NF4 quantization with float16 compute/storage and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_storage=torch.float16,
)

# Checkpoint used for in-context learning; other candidates are kept as comments.
# model_id = "meta-llama/Meta-Llama-3-70B"
model_id = (
    "meta-llama/Meta-Llama-3.1-70B-Instruct"  # "meta-llama/Meta-Llama-3-70B-Instruct"
)

# NOTE: this rebinds the global `tokenizer` that the fine-tuning cells also use.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token="HF_token",  # add_prefix_space=True
)
# Reuse the EOS token id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Text-generation pipeline around the quantized model, with automatic device placement.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    model_kwargs={
        "quantization_config": bnb_config,
        "low_cpu_mem_usage": True,
        "attn_implementation": "sdpa",
    },
    device_map="auto",
    token="HF_token",
)

In [None]:
# define few-shot examples
# use in case of few shot ICL with random selection of few-shot examples
nr_of_shots = 1
few_shot_seed = 42  # fixed so the same few-shot examples are drawn on every run

# Draw `nr_of_shots` training questions per label. GroupBy.sample keeps the
# "label" column in the result, which groupby(...).apply(...) does not
# guarantee across pandas versions.
sampled_questions = (
    train.groupby("label")
    .sample(n=nr_of_shots, random_state=few_shot_seed)
    .sort_values("label", kind="mergesort")
    .reset_index(drop=True)
)

from collections import defaultdict

# label -> list of sampled questions for that label
label_question_dict = defaultdict(list)
for label, question in zip(sampled_questions["label"], sampled_questions["question"]):
    label_question_dict[label].append(question)
label_question_dict = dict(label_question_dict)

one_shot_examples = label_question_dict

# Render the examples as "Question: ... . Answer: <label>" blocks for the prompt.
one_shot_examples_string = ""
for label, questions in one_shot_examples.items():
    for question in questions:
        # Strip only the leading "Question: " prefix added during data preparation,
        # not later occurrences inside the question text.
        question = question.replace("Question: ", "", 1)
        one_shot_examples_string += f"Question: {question}. Answer: {label}\n\n"

In [None]:
# define a function that finds the top N best matching questions per label for a given query question and
# format these as a string to be used in the instruct prompt
# use in case of few shot ICL with a heuristic selection of few-shot examples

import numpy as np
import pandas as pd

# Precomputed question embeddings, one file per split. The embedding cell at the
# end of this notebook writes "val_question_bert_embeddings.npy".
# NOTE(review): nothing visible here writes the train file - confirm it exists before running.
val_embeddings = np.load("val_question_bert_embeddings.npy")
train_embeddings = np.load("train_question_bert_embeddings.npy")


def find_top_matches_per_label(val_embeddings, train_embeddings, train_df, top_n=2):
    """
    Finds the top N best matching questions per label in the training set for each query in the validation set.

    Similarity is the dot product between a query embedding and every training
    embedding (this equals cosine similarity when both sets are L2-normalised).

    Parameters:
    - val_embeddings (numpy.ndarray): The embeddings of the validation set questions, one row per query.
    - train_embeddings (numpy.ndarray): The embeddings of the training set questions, row-aligned with train_df.
    - train_df (pandas.DataFrame): The training DataFrame containing "question" and "label" columns.
    - top_n (int): The number of top matches per label to return for each validation query.

    Returns:
    - dict: A dictionary where each key is a validation query index and the value is a dictionary
            with labels (in sorted order) as keys and the top N matching questions, most similar
            first, as values.

    Raises:
    - ValueError: If train_embeddings and train_df do not have the same number of rows.
    """
    train_embeddings = np.asarray(train_embeddings)
    if train_embeddings.shape[0] != len(train_df):
        raise ValueError(
            "train_embeddings and train_df must have the same number of rows"
        )

    # Loop-invariant work, done once instead of once per query: the question
    # texts and, for every label, the row positions that carry it.
    questions = train_df["question"].to_numpy()
    positions_by_label = {
        label: np.sort(positions)
        for label, positions in train_df.groupby("label").indices.items()
    }
    ordered_labels = sorted(positions_by_label)
    keep = max(top_n, 0)

    results = {}
    for i, query_embedding in enumerate(val_embeddings):
        similarities = np.dot(train_embeddings, query_embedding)

        per_label = {}
        for label in ordered_labels:
            positions = positions_by_label[label]
            # A stable sort on the negated scores ranks by descending similarity
            # and keeps the earliest row first when scores tie.
            ranking = np.argsort(-similarities[positions], kind="stable")[:keep]
            best_matches = questions[positions[ranking]].tolist()
            if best_matches:
                per_label[label] = best_matches
        results[i] = per_label

    return results


# Example usage:
# Assuming you have your dataframes `val` and `train` and the corresponding embeddings `val_embeddings` and `train_embeddings`
top_n = 1  # how many nearest training questions to keep per label
results = find_top_matches_per_label(val_embeddings, train_embeddings, train, top_n)

# One few-shot prompt string per validation query, in query order.
# NOTE: this rebinds `one_shot_examples` to a list and leaves
# `one_shot_examples_string` holding the string built for the last query.
one_shot_examples = []

for i, one_shot_example in results.items():
    rendered_examples = []
    for label, questions in one_shot_example.items():
        for question in questions:
            # Drop the "Question: " prefix stored in the data before re-adding it.
            question = question.replace("Question: ", "")
            rendered_examples.append(f"Question: {question}. Answer: {label}\n\n")

    one_shot_examples_string = "".join(rendered_examples)
    one_shot_examples.append(one_shot_examples_string)

In [None]:
# perform Zero-Shot or Few-Shot Classification
# for zero shot remove from the prompt:
#   {context}
#   {task_info}
#   {few_shots}
# for few shot only use this cell if you rely on random selection of few-shot examples

# Placeholder text used in the instruction, bare and wrapped in curly braces.
target = "QUESTION TYPE"
target_with_brackets = f"{{{target}}}"

# All label names, and for each one its token ids without special tokens.
labels = list(label2id.keys())
label_token_lists = {
    label_name: tokenizer.encode(label_name, add_special_tokens=False)
    for label_name in labels
}


def match_label(generated_string_tokens, ground_truth, label_token_lists):
    """Tell whether the generated token ids name ``ground_truth`` and no other label.

    Args:
        generated_string_tokens (list): Token ids of the generated text.
        ground_truth: The label expected for this example.
        label_token_lists (dict): Maps each label to the list of its token ids.

    Returns:
        bool: True only when exactly one label's token list occurs contiguously
        in ``generated_string_tokens`` and that label equals ``ground_truth``;
        False otherwise (no match, several matches, or a different label).
    """

    def is_subsequence(subseq, seq):
        """Return True when ``subseq`` appears as a contiguous run inside ``seq``."""
        width = len(subseq)
        return any(
            seq[start : start + width] == subseq
            for start in range(len(seq) - width + 1)
        )

    # Every label whose token ids show up in the generated ids.
    matching_labels = [
        label
        for label, label_tokens in label_token_lists.items()
        if is_subsequence(label_tokens, generated_string_tokens)
    ]

    return len(matching_labels) == 1 and matching_labels[0] == ground_truth


# Opening sentence of the system prompt: frames the classification task.
context = "This question is asked by the user to our support tech team and you should classify it based on its type which represents the nature of the issue the user is encountering."

# Definitions of the six question types, one bullet per label.
task_info = """This question asked by the user is of the following types:
- `Discovery`: User is exploring how to do something or looking something up.
- `Troubleshooting`: User is asking about an error, often this is a stacktrace.
- `Code`: The user is inputing code and asking kapa to change, debug or explain something.
- `Comparison`: User is asking kapa to contrast two things i.e. Are X and Y the same, what is the difference betwenn X and Y
- `Advice`: The user is asking kapa what to do or what the best practice is.
- `Off-Topic`: Anything else, could be something unrelated, gibberish, abuse and so on.
"""

# Few-shot section of the prompt. It is interpolated once, when this cell runs,
# from whatever `one_shot_examples_string` holds at that moment.
few_shots = f"""Here are some examples of questions that are classified based on their type:

{one_shot_examples_string}
"""


def classify(question):
    """Ask the text-generation pipeline for the type of ``question``.

    Builds a system message from the module-level ``context``, ``task_info`` and
    ``few_shots`` strings plus the answer-format instruction, sends ``question``
    as the user message, and returns the generated text with leading whitespace
    stripped.
    """
    instructions = f"""
    {context}
    {task_info}
    {few_shots}
    
    Return the {target} {labels} with one word/label response without any additional text. Answer: {target_with_brackets}."""

    query = f"""
    Question: {question}. Answer: """

    messages = [
        {
            "role": "system",
            "content": instructions,
        },
        {"role": "user", "content": query},
    ]

    # Generation stops at either the tokenizer's EOS id or the "<|eot_id|>" token id.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    output = pipeline(
        messages,
        max_new_tokens=3,  # at most 3 new tokens; other values tried: 64, 128, 256, 4096
        pad_token_id=pipeline.tokenizer.pad_token_id,
        eos_token_id=terminators,
        do_sample=False,  # sampling disabled
        # top_k=1,
        temperature=0.0,  # NOTE(review): presumably unused while do_sample=False - confirm
        # top_p=0.9,  # 0.8, 0.9, 0.95
        return_full_text=False,  # return only the newly generated text
    )[0]["generated_text"].lstrip()
    return output


def final_output(question, ground_truth):
    """Classify ``question`` and return the predicted label id.

    When the generated text names exactly one known label, that label's id is
    returned, whether or not it equals ``ground_truth``, so per-class metrics
    reflect what the model actually answered. When no single label can be read
    from the output, the prediction counts as wrong and a random label other
    than ``ground_truth`` is returned.
    """
    output = classify(question)
    generated_string_tokens = tokenizer.encode(output, add_special_tokens=False)

    # match_label(tokens, candidate, ...) is True only when exactly one label
    # occurs in the output and that label is `candidate`, so at most one
    # candidate can pass this check.
    for candidate in labels:
        if match_label(generated_string_tokens, candidate, label_token_lists):
            return label2id[candidate]

    # Unparseable output: choose a random label from the ones excluding ground truth.
    remaining_labels = [l for l in labels if l != ground_truth]
    random_label = np.random.choice(remaining_labels)
    return label2id[random_label]

In [None]:
# use this cell for few shot with a heuristic selection of few-shot examples


# Placeholder text used in the instruction, bare and wrapped in curly braces.
target = "QUESTION TYPE"
target_with_brackets = f"{{{target}}}"

# All label names, and for each one its token ids without special tokens.
labels = list(label2id.keys())
label_token_lists = {
    label_name: tokenizer.encode(label_name, add_special_tokens=False)
    for label_name in labels
}


def match_label(generated_string_tokens, ground_truth, label_token_lists):
    """Tell whether the generated token ids name ``ground_truth`` and no other label.

    Args:
        generated_string_tokens (list): Token ids of the generated text.
        ground_truth: The label expected for this example.
        label_token_lists (dict): Maps each label to the list of its token ids.

    Returns:
        bool: True only when exactly one label's token list occurs contiguously
        in ``generated_string_tokens`` and that label equals ``ground_truth``;
        False otherwise (no match, several matches, or a different label).
    """

    def is_subsequence(subseq, seq):
        """Return True when ``subseq`` appears as a contiguous run inside ``seq``."""
        width = len(subseq)
        return any(
            seq[start : start + width] == subseq
            for start in range(len(seq) - width + 1)
        )

    # Every label whose token ids show up in the generated ids.
    matching_labels = [
        label
        for label, label_tokens in label_token_lists.items()
        if is_subsequence(label_tokens, generated_string_tokens)
    ]

    return len(matching_labels) == 1 and matching_labels[0] == ground_truth


# Opening sentence of the system prompt: frames the classification task.
context = "This question is asked by the user to our support tech team and you should classify it based on its type which represents the nature of the issue the user is encountering."

# Definitions of the six question types, one bullet per label.
task_info = """This question asked by the user is of the following types:
- `Discovery`: User is exploring how to do something or looking something up.
- `Troubleshooting`: User is asking about an error, often this is a stacktrace.
- `Code`: The user is inputing code and asking kapa to change, debug or explain something.
- `Comparison`: User is asking kapa to contrast two things i.e. Are X and Y the same, what is the difference betwenn X and Y
- `Advice`: The user is asking kapa what to do or what the best practice is.
- `Off-Topic`: Anything else, could be something unrelated, gibberish, abuse and so on.
"""


def classify(question, one_shot_examples_string):
    """Ask the text-generation pipeline for the type of ``question``.

    ``one_shot_examples_string`` is the few-shot example text for this particular
    question. The system message is built from the module-level ``context`` and
    ``task_info``, the few-shot section and the answer-format instruction;
    ``question`` is sent as the user message. Returns the generated text with
    leading whitespace stripped.
    """

    # Few-shot section, rebuilt on every call from the examples passed in.
    few_shots = f"""Here are some examples of questions that are classified based on their type:

    {one_shot_examples_string}
    """
    instructions = f"""
    {context}
    {task_info}
    {few_shots}
    
    Return the {target} {labels} with one word/label response without any additional text. Answer: {target_with_brackets}."""

    query = f"""
    Question: {question}. Answer: """

    messages = [
        {
            "role": "system",
            "content": instructions,
        },
        {"role": "user", "content": query},
    ]

    # Generation stops at either the tokenizer's EOS id or the "<|eot_id|>" token id.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    output = pipeline(
        messages,
        max_new_tokens=3,  # at most 3 new tokens; other values tried: 64, 128, 256, 4096
        pad_token_id=pipeline.tokenizer.pad_token_id,
        eos_token_id=terminators,
        do_sample=False,  # sampling disabled
        # top_k=1,
        temperature=0.0,  # NOTE(review): presumably unused while do_sample=False - confirm
        # top_p=0.9,  # 0.8, 0.9, 0.95
        return_full_text=False,  # return only the newly generated text
    )[0]["generated_text"].lstrip()
    return output


def final_output(question, ground_truth, one_shot_examples_string):
    """Classify ``question`` with its own few-shot examples and return the predicted label id.

    When the generated text names exactly one known label, that label's id is
    returned, whether or not it equals ``ground_truth``, so per-class metrics
    reflect what the model actually answered. When no single label can be read
    from the output, the prediction counts as wrong and a random label other
    than ``ground_truth`` is returned.
    """
    output = classify(question, one_shot_examples_string)
    generated_string_tokens = tokenizer.encode(output, add_special_tokens=False)

    # match_label(tokens, candidate, ...) is True only when exactly one label
    # occurs in the output and that label is `candidate`, so at most one
    # candidate can pass this check.
    for candidate in labels:
        if match_label(generated_string_tokens, candidate, label_token_lists):
            return label2id[candidate]

    # Unparseable output: choose a random label from the ones excluding ground truth.
    remaining_labels = [l for l in labels if l != ground_truth]
    random_label = np.random.choice(remaining_labels)
    return label2id[random_label]

In [None]:
# given val dataset, generate predictions

from tqdm import tqdm
import numpy as np

questions = val["question"].tolist()
ground_truths = val["label"].tolist()

# final_output falls back to np.random.choice when it cannot read a label from
# the model output; seed the global numpy RNG so the resulting metrics are the
# same on every run.
np.random.seed(42)

preds = []

for i in tqdm(range(len(questions))):
    # one_shot_examples_string = one_shot_examples[i]
    question = questions[i]
    ground_truth = ground_truths[i]
    # pred = final_output(question, ground_truth, one_shot_examples_string)
    pred = final_output(question, ground_truth)
    preds.append(pred)

In [None]:
# Map the ground-truth label names to their class ids so they can be compared with `preds`.
ground_truths_ids = [label2id[l] for l in ground_truths]

In [None]:
# given preds and ground truth, calculate f1 score weighted
import evaluate

# Weighted F1 / precision / recall over all predictions.
metric = evaluate.combine(["f1", "precision", "recall"])
metrics = metric.compute(
    predictions=preds, references=ground_truths_ids, average="weighted"
)

# F1 score per label. The class ids are passed explicitly: without `labels=` the
# returned array only covers ids that occur in the predictions or references, so
# one missing class would shift every later score onto the wrong label name.
class_ids = sorted(id2label)
f1_metric = evaluate.load("f1")
f1_per_label = f1_metric.compute(
    predictions=preds,
    references=ground_truths_ids,
    average=None,
    labels=class_ids,
)
f1_per_label_dict = {
    id2label[idx]: v for idx, v in zip(class_ids, f1_per_label["f1"])
}

metrics.update(f1_per_label_dict)
metrics

In [None]:
# Get Bert embeddings for train or val data and save them to disk

import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from tqdm import tqdm
from numpy.linalg import norm

# Load pre-trained BERT model and tokenizer
# Load pre-trained BERT model and tokenizer.
# They get their own names so this cell does not rebind the global
# `tokenizer` / `model` that the fine-tuning and ICL cells rely on.
model_name = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

# Make sure the model is in evaluation mode
bert_model.eval()

# Check if CUDA is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)


def get_embeddings(text):
    """Return the BERT embedding of the first ([CLS]) token for ``text``.

    ``text`` is tokenized with padding and truncation to at most 512 tokens and
    run through the model without gradient tracking. The result is a NumPy
    array taken from ``last_hidden_state[:, 0, :]``.
    """
    with torch.no_grad():
        inputs = bert_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = bert_model(**inputs)
        # Get the embeddings of [CLS] token, which is the first token
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return embeddings


def embed_questions(questions, desc="Generating embeddings"):
    """Embed each question, L2-normalise it, and stack the rows into one array."""
    embeddings_list = []
    for question in tqdm(questions, desc=desc):
        embedding = get_embeddings(question)
        embedding = embedding / norm(embedding, axis=1, keepdims=True)
        embeddings_list.append(embedding)
    return np.vstack(embeddings_list)


# Write one file per split, using the file names the few-shot retrieval cell
# loads. "val" runs last so `embeddings` ends up holding the validation
# embeddings, as before.
for split_name, split_dataset in (("train", train_dataset), ("val", val_dataset)):
    embeddings = embed_questions(
        split_dataset["question"], desc=f"Generating {split_name} embeddings"
    )
    output_path = f"{split_name}_question_bert_embeddings.npy"
    np.save(output_path, embeddings)
    print(f"Embeddings saved to '{output_path}'")