## Azure Embeddings

In [1]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
import time

In [2]:
AZURE_ENDPOINT = "https://oai-dxclz-dev-oaicat-01.openai.azure.com"
API_KEY = "b75c6627bded4f8dbe42825aaa5a1528"
API_VERSION = "2023-07-01-preview"
LLM_DEPLOYMENT_NAME = "GPT-4o"
LLM_MODEL_NAME = "gpt-4o"
EMBEDDINGS_MODEL_NAME = "text-embedding-3-large"
EMBEDDINGS_DEPLOYMENT_NAME = "embedding-large"

In [3]:
def create_embedding_model():
    return  AzureOpenAIEmbedding(
        model=EMBEDDINGS_MODEL_NAME,
        deployment_name=EMBEDDINGS_DEPLOYMENT_NAME,
        api_key=API_KEY,
        azure_endpoint=AZURE_ENDPOINT,
        api_version=API_VERSION,
    )

In [4]:
embed_model = create_embedding_model()

In [5]:
def get_embedding(text: str):
    return embed_model._get_query_embedding(text)

## Dataset Processing

In [6]:
from datasets import load_dataset
import numpy as np
import concurrent.futures
from tqdm.notebook import tqdm
import json

In [8]:
dataset = load_dataset("routellm/gpt4_judge_battles")
# dataset_embeddings = load_dataset("routellm/gpt4_judge_battles_embeddings")

In [12]:
# Save dataset to disk in json format, instead of the default parquet format
dataset["train"].to_json("data/gpt4_judge_battles.json")

Creating json from Arrow format:   0%|          | 0/110 [00:00<?, ?ba/s]

305427462

In [16]:
# Open the dataset file, and process it line by line
with open("data/gpt4_judge_battles.json") as f:
    data_array = []
    for i, line in enumerate(f):
        # if i == 2:
        #     break

        data = json.loads(line)

        winner = "model_a" if data["winner_model_a"] else "model_b"
        winner = "model_b" if data["winner_tie"] else winner # Bias the model towards using the WEAK model if there is a tie

        data["winner"] = winner

        data_array.append(data)


In [17]:
# Save data_array to disk in json format
with open("data/gpt4_judge_battles_prepared.json", "w") as f:
    json.dump(data_array, f)

In [111]:
def save_embeddings(embeddings, save_filename: str):
    embeddings_loaded = []

    with open(save_filename, "rb") as f:
        # Load the embeddings from the file, if it exists
        embeddings_loaded = np.load(f).tolist()

    # Open file and save to the end
    with open(save_filename, "wb") as f:
        # Append the new embeddings to the existing embeddings
        embeddings_loaded.extend(embeddings)
        embeddings = []
        embeddings_to_save = np.array(embeddings_loaded)
        embeddings_loaded = []
        np.save(f, embeddings_to_save)
    embeddings = []
    embeddings_loaded = []
    embeddings_to_save = np.array([])

In [107]:
# Batch prompts into groups of 2000 with a generator
def batch_prompts(prompts, batch_size=2000, start=0):
    for i in range(start, len(prompts), batch_size):
        yield prompts[i:i + batch_size]

In [110]:
# Generate embeddings for the dataset, using the column "prompt"

def generate_embeddings(dataset, batch_size=2000, start=0, save_filename: str = "data/gpt4_judge_battles_embeddings.npy"):
    print(f"Batch size: {batch_size}")

    # Generate embeddings for the dataset
    embeddings = []
    total_length = (len(dataset["train"]["prompt"]) - start) // (batch_size + 1)
    for i, batch in tqdm(enumerate(batch_prompts(dataset["train"]["prompt"], batch_size, start)), total=total_length):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            batch_embeddings = list(executor.map(get_embedding, batch))
        
        embeddings.extend(batch_embeddings)
        time.sleep(1)

        # Every 50 iterations, save the embeddings to disk
        if (i + 1) % 50 == 0:
            save_embeddings(embeddings, save_filename)
            embeddings = []
            
    save_embeddings(embeddings, save_filename)
    embeddings = []

In [None]:
NUM_THREADS = 12
BATCH_SIZE = NUM_THREADS * (2 ** 3)
START = 0
SAVE_FILENAME = "data/gpt4_judge_battles_embeddings_2.npy"

generate_embeddings(dataset, BATCH_SIZE, START, SAVE_FILENAME)

In [100]:
# Load embeddings from file
with open("data/gpt4_judge_battles_embeddings.npy", "rb") as f:
    embeddings = np.load(f, allow_pickle=False)

In [102]:
len(dataset["train"]["prompt"])

109101

In [101]:
len(embeddings)

109101

## Train Matrix Factorization Model

In [113]:
# Call a python script
import subprocess

subprocess.run(["python", "routellm/routers/matrix_factorization/train_matrix_factorization.py"])

CompletedProcess(args=['python', 'routellm/routers/matrix_factorization/train_matrix_factorization.py'], returncode=1)