In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Configuration: file locations and tunables used by the cells below
preprocessed_data_path = 'preprocessed_dataset.csv'  # Input CSV read into `df`
output_data_path = 'dataset_with_bertsim_softlabels.csv'  # CSV written after scores/soft labels are added
SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'  # Model name passed to SentenceTransformer(...)
# Scores are divided by TEMPERATURE before the softmax in generate_soft_labels.
# NOTE(review): cosine similarities lie in [-1, 1], so 10.0 squeezes them into
# [-0.1, 0.1] and the resulting soft labels are close to uniform — confirm this is intended.
TEMPERATURE = 10.0  # Temperature for softmax scaling
BATCH_SIZE = 32  # Number of texts passed to sentence_model.encode per call

In [None]:
# Response columns of the candidate models to score (10 models listed here).
# The text before '|' is used as the model name when the score columns are created.
response_columns = [
    'gpt-3.5-turbo-1106|model_response',
    'claude-instant-v1|model_response',
    'claude-v1|model_response',
    'claude-v2|model_response',
    'meta/llama-2-70b-chat|model_response',
    'mistralai/mixtral-8x7b-chat|model_response',
    'zero-one-ai/Yi-34B-Chat|model_response',
    'WizardLM/WizardLM-13B-V1.2|model_response',
    'meta/code-llama-instruct-34b-chat|model_response',
    'mistralai/mistral-7b-chat|model_response'

]
# Column whose text serves as the reference each model response is compared against
ground_truth_column = 'gpt-4-1106-preview|model_response'

In [None]:
# Load preprocessed dataset; it must contain `ground_truth_column` and every
# entry of `response_columns`, since both are indexed when preparing the data.
df = pd.read_csv(preprocessed_data_path)

In [3]:
import torch

# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Build the sentence transformer and place it on the selected device in one step
sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL).to(device)

In [None]:
def _encode_valid_texts(texts, sentence_model, batch_size, device):
    """Embed every non-empty string in `texts`, `batch_size` items per encode call.

    Returns:
        (indices, embeddings): `indices` lists the positions in `texts` that held a
        non-empty string; `embeddings` is the concatenation of the encoder outputs,
        so its k-th row belongs to `texts[indices[k]]`. `embeddings` is None when
        no entry is usable.
    """
    # None, NaN, empty strings and any other non-string value are treated as missing
    indices = [i for i, text in enumerate(texts) if isinstance(text, str) and text]
    if not indices:
        return indices, None

    chunks = []
    for start in range(0, len(indices), batch_size):
        batch = [texts[i] for i in indices[start:start + batch_size]]
        chunks.append(
            sentence_model.encode(batch, convert_to_tensor=True, device=device, show_progress_bar=False)
        )
    return indices, torch.cat(chunks, dim=0)


def compute_bertsim_scores(ground_truths, model_responses, sentence_model, batch_size, device):
    """Cosine similarity between each reference answer and each model response.

    Args:
        ground_truths: sequence with one reference text per row.
        model_responses: sequence of rows; each row holds one response per model.
            The number of models is taken from the first row.
        sentence_model: object exposing
            `encode(texts, convert_to_tensor=True, device=..., show_progress_bar=False)`
            that returns one embedding row per input text.
        batch_size: number of texts passed to `encode` per call.
        device: device forwarded to `encode`.

    Returns:
        np.ndarray of shape (num_rows, num_models). An entry is 0.0 when either the
        reference or the response for that row is missing (None, NaN, empty or not
        a string). Empty input yields an empty array instead of raising.
    """
    num_rows = len(ground_truths)
    num_models = len(model_responses[0]) if len(model_responses) > 0 else 0
    scores = np.zeros((num_rows, num_models), dtype=float)

    gt_indices, gt_embeddings = _encode_valid_texts(ground_truths, sentence_model, batch_size, device)
    if not gt_indices:
        # Nothing to compare against: every score stays at 0.0
        return scores
    # Row index -> position of that row's embedding inside gt_embeddings
    gt_position = {row_idx: pos for pos, row_idx in enumerate(gt_indices)}

    for model_idx in range(num_models):
        responses = [row[model_idx] for row in model_responses]
        resp_indices, resp_embeddings = _encode_valid_texts(responses, sentence_model, batch_size, device)

        # Only rows that have both a reference and a response get a real score
        shared = [(row_idx, pos) for pos, row_idx in enumerate(resp_indices) if row_idx in gt_position]
        if not shared:
            continue

        rows = [row_idx for row_idx, _ in shared]
        gt_selected = gt_embeddings[[gt_position[row_idx] for row_idx in rows]]
        resp_selected = resp_embeddings[[pos for _, pos in shared]]
        # One batched cosine per model instead of one tensor op per row
        similarities = torch.nn.functional.cosine_similarity(gt_selected, resp_selected, dim=1)
        scores[rows, model_idx] = similarities.detach().cpu().numpy()

    return scores

In [None]:
# Function to generate soft labels
def generate_soft_labels(bertsim_scores, temperature):
    """Turn per-row similarity scores into a probability distribution over models.

    Applies a temperature-scaled softmax along the last axis:
    softmax(bertsim_scores / temperature).

    Args:
        bertsim_scores: array-like of scores, one row per sample and one column
            per model (a single 1-D row is also accepted).
        temperature: positive scaling factor; larger values flatten the distribution.

    Returns:
        np.ndarray with the same shape as the input whose last axis sums to 1.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    scaled_scores = np.asarray(bertsim_scores, dtype=float) / temperature
    if scaled_scores.size == 0:
        # Nothing to normalise (and max over an empty axis would raise)
        return scaled_scores

    # Subtracting the row maximum leaves the softmax unchanged but keeps exp()
    # from overflowing; the largest term becomes exp(0) == 1, so the sum is never 0.
    shifted_scores = scaled_scores - np.max(scaled_scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

In [None]:
# Prepare data as plain Python lists for compute_bertsim_scores
# ground_truths: one reference text per dataframe row
ground_truths = df[ground_truth_column].tolist()
# model_responses: one list per row, with entries in `response_columns` order
model_responses = df[response_columns].values.tolist()

In [None]:
# Compute BERTSim scores: embeds every reference and response with sentence_model.
# The result is indexed [row, model], with models in `response_columns` order.
bertsim_scores = compute_bertsim_scores(ground_truths, model_responses, sentence_model, BATCH_SIZE, device)

In [None]:
# Generate soft labels: per-row softmax of the scores divided by TEMPERATURE
soft_labels = generate_soft_labels(bertsim_scores, TEMPERATURE)

In [None]:
# Attach one BERTSim column and one soft-label column per model to the dataframe.
# The model name is the part of each response column before the first '|'.
model_names = [column.partition('|')[0] for column in response_columns]
for column_idx in range(len(model_names)):
    name_prefix = model_names[column_idx]
    df[name_prefix + '|BERTSim'] = bertsim_scores[:, column_idx]
    df[name_prefix + '|soft_label'] = soft_labels[:, column_idx]

In [None]:
# Save updated dataset (original columns plus the BERTSim / soft_label columns)
# without writing the dataframe index as an extra column.
df.to_csv(output_data_path, index=False)

print(f"BERTSim scores and soft labels computed. Dataset saved as '{output_data_path}'.")

In [44]:
import pandas as pd

# Load the dataset written by the scoring step
df = pd.read_csv("dataset_with_bertsim_softlabels.csv")

# Remove rows whose 'prompt' value repeats, keeping the first occurrence,
# and renumber the index of the resulting DataFrame
deduped_df = df.drop_duplicates(subset=['prompt'], keep='first').reset_index(drop=True)

# Save the deduplicated dataset to its own file (the input file is left untouched)
deduped_df.to_csv("dataset_with_bertsim_softlabels_deduped.csv", index=False)

print(f"New deduplicated dataset has {len(deduped_df)} rows and is saved as dataset_with_bertsim_softlabels_deduped.csv")

New deduplicated dataset has 8725 rows and is saved as dataset_with_bertsim_softlabels_deduped.csv


In [56]:
import pandas as pd

import os

# Load the deduplicated dataset
df = pd.read_csv("dataset_with_bertsim_softlabels_deduped.csv")

# Columns to keep: prompt, routing/reference columns, and the scores, soft labels
# and raw responses of the three retained models
cols_to_keep = [
    "prompt",
    "oracle_model_to_route_to",
    "GroundTruth",
    "mistralai/mixtral-8x7b-chat|BERTSim",
    "mistralai/mixtral-8x7b-chat|soft_label",
    "zero-one-ai/Yi-34B-Chat|BERTSim",
    "zero-one-ai/Yi-34B-Chat|soft_label",
    "mistralai/mistral-7b-chat|BERTSim",
    "mistralai/mistral-7b-chat|soft_label",
    "mistralai/mixtral-8x7b-chat|model_response",
    "mistralai/mistral-7b-chat|model_response",
    "zero-one-ai/Yi-34B-Chat|model_response"
]

# Filter the DataFrame (raises KeyError naming any column that is absent)
filtered_df = df[cols_to_keep]

# pandas does not create missing parent directories, so make sure the output
# folder exists before writing; this keeps a fresh-checkout Run All working.
os.makedirs("Dataset_JSON", exist_ok=True)

# Save to CSV
filtered_df.to_csv("Dataset_JSON/filtered_bertsim_softlabels.csv", index=False)

# Save to JSON as a list of row objects, keeping non-ASCII characters readable
filtered_df.to_json("Dataset_JSON/filtered_bertsim_softlabels.json", orient="records", indent=2, force_ascii=False)

print("Filtered data saved to filtered_bertsim_softlabels.csv and filtered_bertsim_softlabels.json")

Filtered data saved to filtered_bertsim_softlabels.csv and filtered_bertsim_softlabels.json


### Calculating cost for each prompt

In [58]:
import json

# Load the filtered dataset: a list of row dicts keyed by column name
with open('Dataset_JSON/filtered_bertsim_softlabels.json', 'r', encoding="utf-8") as f:
    data = json.load(f)

# Define model info for each priced model:
#   "field"          - the row key holding that model's response text
#   "cost_per_token" - despite the name, the price per 1,000,000 tokens;
#                      the cost loop divides by 1e6 to get a per-token price.
# NOTE(review): currency and price source are not stated here — confirm.
model_info = {
    "mistralai/mixtral-8x7b-chat": {
        "field": "mistralai/mixtral-8x7b-chat|model_response",
        "cost_per_token": 0.60  # per 1M tokens
    },
    "mistralai/mistral-7b-chat": {
        "field": "mistralai/mistral-7b-chat|model_response",
        "cost_per_token": 0.2   # per 1M tokens
    },
    "zero-one-ai/Yi-34B-Chat": {
        "field": "zero-one-ai/Yi-34B-Chat|model_response",
        "cost_per_token": 0.8   # per 1M tokens
    }
}

# Simple whitespace tokenizer
def count_tokens(text):
    """Return the number of whitespace-separated words in `text` (0 for non-string input)."""
    return len(text.split()) if isinstance(text, str) else 0

# Add a "<model>|cost" entry to every row: (prompt tokens + response tokens)
# priced at the model's per-1M-token rate.
for row in data:
    # The prompt is identical for every model, so its tokens are counted once per row
    input_tokens = count_tokens(row.get("prompt", ""))
    for info in model_info.values():
        response_field = info["field"]
        output_tokens = count_tokens(row.get(response_field, ""))
        rate = info["cost_per_token"]
        # Input and output tokens are priced at the same rate; 1e6 converts per-1M to per-token
        cost = (input_tokens * rate / 1e6) + (output_tokens * rate / 1e6)
        row[response_field.replace("|model_response", "|cost")] = cost

# Save to new JSON
with open('Dataset_JSON/filtered_bertsim_softlabels_with_cost.json', 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Cost columns added and saved to Dataset_JSON/filtered_bertsim_softlabels_with_cost.json")

Cost columns added and saved to Dataset_JSON/filtered_bertsim_softlabels_with_cost.json


### Calculating Throughput

In [59]:
import json
from tqdm import tqdm

# You may want to use a real tokenizer for accurate token counts.
def count_tokens(text):
    """Approximate the token count of `text` as its number of whitespace-separated words.

    Non-string input (for example None loaded from a JSON null) counts as 0 tokens
    instead of raising AttributeError, which matches the guarded definition used
    in the cost cell that this one replaces in the namespace.
    """
    # Replace with your tokenizer if needed
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Model parameters, keyed by the row field holding each model's response.
# The throughput loop computes total_time = avg_time_to_first_token + output_tokens / avg_decode_rate,
# so the two values must use matching units.
# NOTE(review): presumably seconds and tokens per second — confirm, and record where the figures come from.
model_params = {
    "mistralai/mistral-7b-chat|model_response": {
        "avg_time_to_first_token": 0.18,
        "avg_decode_rate": 175
    },
    "zero-one-ai/Yi-34B-Chat|model_response": {
        "avg_time_to_first_token": 0.25,
        "avg_decode_rate": 108
    },
    "mistralai/mixtral-8x7b-chat|model_response": {
        "avg_time_to_first_token": 0.35,
        "avg_decode_rate": 54
    }
}

# Load the dataset that already carries the cost fields
with open('Dataset_JSON/filtered_bertsim_softlabels_with_cost.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Add a "<model>|throughput" entry to every row: response tokens divided by the
# estimated time from submission to completion. The prompt does not enter the
# formula, so it is no longer read or tokenised here.
for item in tqdm(data):
    for model_col, params in model_params.items():
        output_tokens = count_tokens(item.get(model_col, ""))
        # Time from query submission to completion
        total_time = params["avg_time_to_first_token"] + (output_tokens / params["avg_decode_rate"])
        # Guard against a zero total time so the division can never fail
        throughput = output_tokens / total_time if total_time > 0 else 0
        item[model_col.replace('|model_response', '|throughput')] = throughput

# Save back to JSON
with open('Dataset_JSON/final.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Done! Throughput columns added.")

100%|██████████| 8725/8725 [00:00<00:00, 47217.19it/s]


Done! Throughput columns added.


In [None]:
import pandas as pd

import json
import os

# Read the final dataset. The throughput step writes Dataset_JSON/final.json and
# no cell in this notebook produces final.csv, so fall back to the JSON when the
# CSV is absent instead of failing with FileNotFoundError.
final_csv_path = 'Dataset_JSON/final.csv'
final_json_path = 'Dataset_JSON/final.json'
if os.path.exists(final_csv_path):
    df = pd.read_csv(final_csv_path)
else:
    with open(final_json_path, 'r', encoding='utf-8') as f:
        df = pd.DataFrame(json.load(f))

# List of columns to remove
columns_to_remove = [
    'mistralai/mistral-7b-chat|soft_label',
    'zero-one-ai/Yi-34B-Chat|soft_label',
    'mistralai/mixtral-8x7b-chat|soft_label'
]

# Remove the columns (ignore errors if columns are missing)
df = df.drop(columns=columns_to_remove, errors='ignore')

# Save to new CSV
df.to_csv('final1.csv', index=False)