In [2]:
!pip install pymongo




In [3]:
from collections import defaultdict
import json, math
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
import requests
pd.options.display.float_format = '{:.2f}'.format
import pymongo

In [4]:
import pandas as pd

# Load the ratings data from the JSON files
ratings = pd.read_json('agent_ratings_V1.json')
toolratings = pd.read_json('toolratings_V0.json')
frameworkratings = pd.read_json('frameworkratings_V0.json')
modelratings = pd.read_json('modelratings_V0.json')

# Display the first few rows to verify the data is loaded correctly
ratings


Unnamed: 0,Prompt,Agent_A,Agent_B,Rating
0,"The prompts are:\n\n1. ""What is the best food ...",{'Agent name': 'langchain brave-search agent (...,{'Agent name': 'langchain google-serper search...,A is better
1,"The prompt is """" (an empty string).",{'Agent name': 'langchain brave-search agent (...,{'Agent name': 'langchain google-serper search...,A is better
2,"The prompt in the executed code is """".",{'Agent name': 'langchain brave-search agent (...,{'Agent name': 'langchain google-serper search...,A is better
3,"The prompt in the given executed code is """" (a...",{'Agent name': 'langchain brave-search agent (...,{'Agent name': 'langchain google-serper search...,B is better
4,Write sqlite query to get top 10 rows from the...,{'Agent name': 'sql agent plotter langchain (g...,{'Agent name': 'langchain ArXiv Article Fetche...,B is better
...,...,...,...,...
2508,"""what was AAPL stock yesterday""",{'Agent name': 'langchain alpha-vantage stock ...,{'Agent name': 'langchain Yahoo Finance News (...,A is better
2509,"""Analyze a dataset of daily temperature record...",{'Agent name': 'langchain Python REPL (gemini-...,{'Agent name': 'langchain Wolfram Alpha (gpt-4...,A is better
2510,"""Compare the year-to-date performance of Apple...",{'Agent name': 'langchain alpha-vantage stock ...,{'Agent name': 'langchain Yahoo Finance News (...,A is better
2511,"""Summarize the key insights and methodologies ...",{'Agent name': 'langchain ArXiv Article Fetche...,{'Agent name': 'llamaindex ArXiv Article Fetch...,A is better


In [5]:
fig = px.bar(ratings["Rating"].value_counts(),
             title="Counts of Battle Outcomes", text_auto=True, height=400)
fig.update_layout(xaxis_title="Battle Outcome", yaxis_title="Count",
                  showlegend=False)
fig

In [6]:
ratings_no_tie = ratings[~ratings["Rating"].str.contains("Tie")]

In [7]:
import plotly.express as px

# Extract agent names from both Agent_A and Agent_B
agent_names = ratings['Agent_A'].apply(lambda x: x['Agent name']).tolist() + \
              ratings['Agent_B'].apply(lambda x: x['Agent name']).tolist()

# Count the occurrences of each agent
agent_counts = pd.Series(agent_names).value_counts()

# Create the bar plot
fig = px.bar(agent_counts,
             x=agent_counts.index,
             y=agent_counts.values,
             title="Battle Count for Each Agent",
             text_auto=True)

fig.update_layout(xaxis_title="Agent",
                  yaxis_title="Battle Count",
                  height=400,
                  showlegend=False)

fig.show()

In [9]:
import plotly.express as px
import pandas as pd

def visualize_battle_count(battles, title, show_num_models=30):
    # Extract agent names and create a DataFrame
    battle_df = pd.DataFrame({
        'leftAgent': battles['Agent_A'].apply(lambda x: x['Agent name']),
        'rightAgent': battles['Agent_B'].apply(lambda x: x['Agent name']),
        'rating': battles['Rating']
    })

    # Create the pivot table
    ptbl = pd.pivot_table(battle_df, index="leftAgent", columns="rightAgent", aggfunc="size", fill_value=0)

    # Get all unique agents from both leftAgent and rightAgent columns
    all_agents = pd.Index(ptbl.index.union(ptbl.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    ptbl = ptbl.reindex(index=all_agents, columns=all_agents, fill_value=0)

    # Sum the pivot table and its transpose to get symmetric battle counts
    battle_counts = ptbl + ptbl.T

    # Fill any remaining NaN values with 0 (just in case)
    battle_counts.fillna(0, inplace=True)

    # Sort the agents by the sum of their counts and get the top N agents
    ordering = battle_counts.sum().sort_values(ascending=False).index
    ordering = ordering[:show_num_models]  # Limit to top N agents

    # Extract the ordered battle counts matrix
    ordered_battle_counts = battle_counts.loc[ordering, ordering]

    # Generate the heatmap
    fig = px.imshow(ordered_battle_counts,
                    title=title, text_auto=True)
    fig.update_layout(xaxis_title="Agent B",
                      yaxis_title="Agent A",
                      xaxis_side="top", height=800, width=800,
                      title_y=0.07, title_x=0.5,
                      font=dict(size=10))
    fig.update_traces(hovertemplate=
                      "Agent A: %{y}<br>Agent B: %{x}<br>Count: %{z}<extra></extra>")
    return fig

# Generate heatmap for Tie ratings
fig = visualize_battle_count(ratings[ratings['Rating'] == 'Tie'],
                             title="Tie Count for Each Combination of Agents", show_num_models=30)
fig.show()

In [10]:
visualize_battle_count(ratings_no_tie, "Battle Count for Each Combination of Models (without Ties)")

In [11]:


visualize_battle_count(ratings[ratings['Rating'].str.contains("Tie")], "Tie Count for Each Combination of Models")




# Preliminary Ranking - Basic Elo

In [12]:
from collections import defaultdict

def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    rating = defaultdict(lambda: INIT_RATING)

    for _, battle in battles.iterrows():
        model_a = battle['Agent_A']['Agent name']
        model_b = battle['Agent_B']['Agent name']
        winner = battle['Rating']

        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))

        if winner == "A is better":
            sa = 1
        elif winner == "B is better":
            sa = 0
        elif winner == "Tie" or winner == "Both are bad":
            sa = 0.5
        else:
            raise Exception(f"unexpected rating {winner}")

        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    return rating

# Example usage:
elo_ratings = compute_online_elo(ratings)

# Convert to DataFrame for easier viewing
elo_df = pd.DataFrame.from_dict(elo_ratings, orient='index', columns=['Elo Rating'])
elo_df = elo_df.sort_values('Elo Rating', ascending=False)
print(elo_df)

                                                    Elo Rating
langchain google-serper search agent (claude-3-...     1040.01
llamaindex brave-search agent (gpt-4o-2024-08-06)      1039.42
langchain google-serper search agent (open-mixt...     1038.13
langchain google-serper search agent (gpt-4-tur...     1037.83
langchain Wikipedia (claude-3-5-sonnet-20240620)       1037.70
...                                                        ...
llamaindex wikipedia (llama-3.1-405B-instruct)          977.21
langchain Wikipedia (llama-3.1-8B-instruct)             972.64
llamaindex ArXiv Article Fetcher (gpt-4o-2024-0...      972.53
langchain Wikipedia (gemini-1.5-flash-001)              971.67
openai general assistant (gpt-4o-2024-08-06)            947.47

[481 rows x 1 columns]


In [13]:
def preety_print_model_ratings(ratings):
    df = pd.DataFrame([
        [n, ratings[n]] for n in ratings.keys()
    ], columns=["Model", "Elo rating"]).sort_values("Elo rating", ascending=False).reset_index(drop=True)
    # df["Elo rating"] = (df["Elo rating"] + 0.5).astype(int)
    df.index = df.index + 1
    return df

online_elo_ratings = compute_online_elo(ratings)
print((online_elo_ratings))
preety_print_model_ratings(online_elo_ratings)

defaultdict(<function compute_online_elo.<locals>.<lambda> at 0x1644465f0>, {'langchain brave-search agent (gpt-4o-2024-08-06)': 995.2349919209466, 'langchain google-serper search agent (gpt-4o-2024-08-06)': 1031.1695812364785, 'sql agent plotter langchain (gpt-4o-2024-08-06)': 994.3393786867174, 'langchain ArXiv Article Fetcher (gpt-4o-2024-08-06)': 998.6021678600542, 'langchain alpha-vantage stock agent (gpt-4o-2024-08-06)': 1023.676228590289, 'langchain alpha-vantage stock agent (claude-3-5-sonnet-20240620)': 989.5085329184217, 'langchain alpha-vantage stock agent (gpt-4-0613)': 1001.9773198092618, 'langchain alpha-vantage stock agent (claude-3-opus-20240229)': 1003.5425937851317, 'langchain Google Jobs (gpt-4o-2024-08-06)': 1001.9772402495474, 'langchain Google Jobs (gpt-4-turbo-2024-04-09)': 999.9784723208489, 'langchain google-serper search agent (claude-3-5-sonnet-20240620)': 1030.224030973781, 'llamaindex Yelp Tool (claude-3-opus-20240229)': 998.0, 'langchain brave-search agent

Unnamed: 0,Model,Elo rating
1,langchain google-serper search agent (claude-3...,1040.01
2,llamaindex brave-search agent (gpt-4o-2024-08-06),1039.42
3,langchain google-serper search agent (open-mix...,1038.13
4,langchain google-serper search agent (gpt-4-tu...,1037.83
5,langchain Wikipedia (claude-3-5-sonnet-20240620),1037.70
...,...,...
477,llamaindex wikipedia (llama-3.1-405B-instruct),977.21
478,langchain Wikipedia (llama-3.1-8B-instruct),972.64
479,llamaindex ArXiv Article Fetcher (gpt-4o-2024-...,972.53
480,langchain Wikipedia (gemini-1.5-flash-001),971.67


In [14]:
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    df = pd.DataFrame([
        [n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()
    ], columns=["Model", column_names[0], column_names[1]]).sort_values(column_names[0], ascending=False).reset_index(drop=True)
    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
    df.index = df.index + 1
    return df

elo_mle_ratings_reverse = compute_online_elo(ratings.iloc[::-1])
preety_print_two_ratings(online_elo_ratings,
                         elo_mle_ratings_reverse,
                         column_names=["Elo rating", "Elo rating with reverse order"])

Unnamed: 0,Model,Elo rating,Elo rating with reverse order
1,langchain google-serper search agent (claude-3...,1040,1037
2,llamaindex brave-search agent (gpt-4o-2024-08-06),1039,1045
3,langchain google-serper search agent (open-mix...,1038,1037
4,langchain google-serper search agent (gpt-4-tu...,1038,1034
5,langchain Wikipedia (claude-3-5-sonnet-20240620),1038,1043
...,...,...,...
477,llamaindex wikipedia (llama-3.1-405B-instruct),977,977
478,langchain Wikipedia (llama-3.1-8B-instruct),973,978
479,llamaindex ArXiv Article Fetcher (gpt-4o-2024-...,973,971
480,langchain Wikipedia (gemini-1.5-flash-001),972,977


 ### Maximum Likelihood Estimation with [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model), (Agents)



In [15]:
import numpy as np
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from IPython.display import display

def compute_mle_elo_by_category(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    # Define new categories
    categories = {
        'Search Engines': { 'normal': '#c5cae9', 'hover': '#9fa8da' },
        'Simple Math': { 'normal': '#ffd3b6', 'hover': '#ffbfa0' },
        'Knowledge Bases': { 'normal': '#ffe0b2', 'hover': '#ffcc80' },
        'Math/CS Academic Search': { 'normal': '#ffccbc', 'hover': '#ffab91' },
        'Code Interpreter': { 'normal': '#a9cce3', 'hover': '#87bdd8' }
    }
    
    results = {}

    def category_overlap(cat_a, cat_b):
        """Check if there is any overlap between two category lists."""
        return len(set(cat_a).intersection(set(cat_b))) > 0

    for category in categories.keys():
        # Filter DataFrame to get all battles where Agent_A and Agent_B share any category with the current category
        category_df = df[
            df.apply(lambda x: category_overlap(x['Agent_A']['Category'], [category]) and
                               category_overlap(x['Agent_B']['Category'], [category]), axis=1)
        ]

        if len(category_df) == 0:
            continue  # Skip categories with no battles

        # Create a DataFrame with agent names and ratings for this category
        battle_df = pd.DataFrame({
            'leftAgent': category_df['Agent_A'].apply(lambda x: x['Agent name']),
            'rightAgent': category_df['Agent_B'].apply(lambda x: x['Agent name']),
            'rating': category_df['Rating']
        })

        # Create win/tie/loss pivot tables
        ptbl_a_win = pd.pivot_table(
            battle_df[battle_df["rating"] == "A is better"],
            index="leftAgent",
            columns="rightAgent",
            aggfunc="size",
            fill_value=0,
        )
        all_agents = pd.Index(ptbl_a_win.index.union(ptbl_a_win.columns))

        ptbl_a_win = ptbl_a_win.reindex(index=all_agents, columns=all_agents, fill_value=0)

        if sum(battle_df["rating"].isin(["Tie", "Both are bad"])) == 0:
            ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
        else:
            ptbl_tie = pd.pivot_table(
                battle_df[battle_df["rating"].isin(["Tie", "Both are bad"])],
                index="leftAgent",
                columns="rightAgent",
                aggfunc="size",
                fill_value=0,
            )
            ptbl_tie = ptbl_tie.reindex(index=all_agents, columns=all_agents, fill_value=0)
            ptbl_tie = ptbl_tie + ptbl_tie.T

        ptbl_b_win = pd.pivot_table(
            battle_df[battle_df["rating"] == "B is better"],
            index="leftAgent",
            columns="rightAgent",
            aggfunc="size",
            fill_value=0,
        )
        ptbl_b_win = ptbl_b_win.reindex(index=all_agents, columns=all_agents, fill_value=0)

        ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

        agents = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

        p = len(agents)
        X = np.zeros([p * (p - 1) * 2, p])
        Y = np.zeros(p * (p - 1) * 2)

        cur_row = 0
        sample_weights = []
        for m_a in ptbl_win.index:
            for m_b in ptbl_win.columns:
                if m_a == m_b:
                    continue
                if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                    continue
                X[cur_row, agents[m_a]] = +math.log(BASE)
                X[cur_row, agents[m_b]] = -math.log(BASE)
                Y[cur_row] = 1.0
                sample_weights.append(ptbl_win.loc[m_a, m_b])

                X[cur_row + 1, agents[m_a]] = math.log(BASE)
                X[cur_row + 1, agents[m_b]] = -math.log(BASE)
                Y[cur_row + 1] = 0.0
                sample_weights.append(ptbl_win.loc[m_b, m_a])
                cur_row += 2
        X = X[:cur_row]
        Y = Y[:cur_row]

        if len(X) > 0:
            lr = LogisticRegression(fit_intercept=False, penalty='l2', C=0.7, tol=1e-6)
            lr.fit(X, Y, sample_weight=sample_weights)
            elo_scores = SCALE * lr.coef_[0] + INIT_RATING
            results[category] = pd.Series(elo_scores, index=agents.index).sort_values(ascending=False)
        else:
            results[category] = pd.Series(dtype=float)  # Empty series for categories with no valid battles

    return results

# Function to write MLE ELO ratings to a text file
def write_elo_ratings_to_file(ratings_by_category, filename="elo_ratings_by_category.txt"):
    with open(filename, "w") as file:
        for category, ratings in ratings_by_category.items():
            file.write(f"ELO Ratings for {category} category:\n")
            for agent, rating in ratings.items():
                file.write(f"{agent}: {rating}\n")
            file.write("\n")

# Example usage:
mle_elo_ratings_by_category = compute_mle_elo_by_category(ratings)
write_elo_ratings_to_file(mle_elo_ratings_by_category)
mle_elo_ratings_by_category


{'Search Engines': langchain google-serper search agent (llama-3.1-405B-instruct)      1437.33
 langchain brave-search agent (gemini-1.5-pro-002)                   1358.39
 langchain google-serper search agent (gemini-1.5-pro-001)           1344.82
 langchain brave-search agent (llama-3.1-70B-instruct)               1331.80
 langchain You.com Search (gemini-1.5-pro-001)                       1199.40
 langchain brave-search agent (open-mixtral-8x7b)                    1185.75
 langchain You.com Search (gpt-4-turbo-2024-04-09)                   1174.85
 langchain google-serper search agent (open-mixtral-8x22b)           1172.74
 langchain brave-search agent (claude-3-opus-20240229)               1172.72
 langchain google-serper search agent (gpt-4-turbo-2024-04-09)       1159.87
 langchain brave-search agent (gemini-1.5-pro-001)                   1159.14
 langchain google-serper search agent (claude-3-5-sonnet-20240620)   1116.27
 langchain google-serper search agent (gpt-4o-2024-05-13) 

 ### Maximum Likelihood Estimation with [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) (Models)



In [16]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from IPython.display import display


def compute_mle_elo_models(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    ptbl_a_win = pd.pivot_table(
        df[df["rating"] == "A is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    all_models = pd.Index(ptbl_a_win.index.union(ptbl_a_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    print(ptbl_a_win.shape)

    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
    print(ptbl_a_win.shape)


    # Handle the case where there are no ties
    if sum(df["rating"].isin(["Tie", "Both are bad"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["rating"].isin(["Tie", "Both are bad"])],
            index="leftSubcomponent",
            columns="rightSubcomponent",
            aggfunc="size",
            fill_value=0,
        )
        # Get all unique models from both leftAgent and rightAgent columns
        all_models = pd.Index(ptbl_tie.index.union(ptbl_tie.columns))

        # Reindex the pivot table to be square, filling missing values with 0
        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
        ptbl_tie = ptbl_tie + ptbl_tie.T

    ptbl_b_win = pd.pivot_table(
        df[df["rating"] == "B is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    print(ptbl_b_win.shape)

    all_models = pd.Index(ptbl_b_win.index.union(ptbl_b_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
    print(ptbl_b_win.shape)

    # Combine pivot tables
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    # Visualize the combined pivot table
    # visualize_ptbl_win(ptbl_win)
    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # Skip if NaN
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

In [17]:
model_rankings = compute_mle_elo_models(modelratings)
preety_print_model_ratings(model_rankings)

(16, 16)
(16, 16)
(16, 16)
(16, 16)


Unnamed: 0,Model,Elo rating
1,open-mixtral-8x7b,1056.02
2,llama-3.1-70B-instruct,1049.77
3,open-mixtral-8x22b,1048.15
4,claude-3-5-sonnet-20240620,1037.36
5,claude-3-opus-20240229,1025.12
6,gpt-4o-2024-05-13,1020.9
7,gpt-4o-2024-08-06,1017.88
8,gpt-4o-mini-2024-07-18,1014.26
9,gpt-4-turbo-2024-04-09,1011.87
10,claude-3-haiku-20240307,997.59


 ### Maximum Likelihood Estimation with [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) (Tools)



In [18]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from IPython.display import display


def compute_mle_elo_tools(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    ptbl_a_win = pd.pivot_table(
        df[df["rating"] == "A is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    all_tools = pd.Index(ptbl_a_win.index.union(ptbl_a_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    print(ptbl_a_win.shape)

    ptbl_a_win = ptbl_a_win.reindex(index=all_tools, columns=all_tools, fill_value=0)
    print(ptbl_a_win.shape)



    # Handle the case where there are no ties
    if sum(df["rating"].isin(["Tie", "Both are bad"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["rating"].isin(["Tie", "Both are bad"])],
            index="leftSubcomponent",
            columns="rightSubcomponent",
            aggfunc="size",
            fill_value=0,
        )
        # Get all unique tools from both leftAgent and rightAgent columns
        all_tools = pd.Index(ptbl_tie.index.union(ptbl_tie.columns))

        # Reindex the pivot table to be square, filling missing values with 0
        ptbl_tie = ptbl_tie.reindex(index=all_tools, columns=all_tools, fill_value=0)
        ptbl_tie = ptbl_tie + ptbl_tie.T

    ptbl_b_win = pd.pivot_table(
        df[df["rating"] == "B is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    print(ptbl_b_win.shape)

    all_tools = pd.Index(ptbl_b_win.index.union(ptbl_b_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    ptbl_b_win = ptbl_b_win.reindex(index=all_tools, columns=all_tools, fill_value=0)
    print(ptbl_b_win.shape)

    # Combine pivot tables
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    # Visualize the combined pivot table
    # visualize_ptbl_win(ptbl_win)
    tools = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(tools)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # Skip if NaN
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, tools[m_a]] = +math.log(BASE)
            X[cur_row, tools[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            X[cur_row + 1, tools[m_a]] = math.log(BASE)
            X[cur_row + 1, tools[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=tools.index).sort_values(ascending=False)

In [19]:
tool_rankings = compute_mle_elo_tools(toolratings)
preety_print_model_ratings(tool_rankings)

(26, 33)
(35, 35)
(26, 29)
(32, 32)


Unnamed: 0,Model,Elo rating
1,tavily-search,1222.21
2,pandas,1188.21
3,brave-search,1154.33
4,asknews,1119.67
5,google-serper,1108.85
6,pubmed,1100.57
7,arxiv,1095.66
8,alpha-vantage,1095.19
9,wikipedia,1094.33
10,dall-e,1068.83


 ### Maximum Likelihood Estimation with [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) (Frameworks)


In [20]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from IPython.display import display


def compute_mle_elo_frameworks(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    ptbl_a_win = pd.pivot_table(
        df[df["rating"] == "A is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    all_frameworks = pd.Index(ptbl_a_win.index.union(ptbl_a_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    print(ptbl_a_win.shape)

    ptbl_a_win = ptbl_a_win.reindex(index=all_frameworks, columns=all_frameworks, fill_value=0)
    print(ptbl_a_win.shape)

    all_frameworks = pd.Index(ptbl_a_win.index.union(ptbl_a_win.columns))


    # Handle the case where there are no ties
    if sum(df["rating"].isin(["Tie", "Both are bad"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["rating"].isin(["Tie", "Both are bad"])],
            index="leftSubcomponent",
            columns="rightSubcomponent",
            aggfunc="size",
            fill_value=0,
        )
        # Get all unique tools from both leftAgent and rightAgent columns
        all_frameworks = pd.Index(ptbl_tie.index.union(ptbl_tie.columns))

        # Reindex the pivot table to be square, filling missing values with 0
        ptbl_tie = ptbl_tie.reindex(index=all_frameworks, columns=all_frameworks, fill_value=0)
        ptbl_tie = ptbl_tie + ptbl_tie.T

    ptbl_b_win = pd.pivot_table(
        df[df["rating"] == "B is better"],
        index="leftSubcomponent",
        columns="rightSubcomponent",
        aggfunc="size",
        fill_value=0,
    )
    print(ptbl_b_win.shape)

    all_frameworks = pd.Index(ptbl_b_win.index.union(ptbl_b_win.columns))

    # Reindex the pivot table to be square, filling missing values with 0
    ptbl_b_win = ptbl_b_win.reindex(index=all_frameworks, columns=all_frameworks, fill_value=0)
    print(ptbl_b_win.shape)

    # Combine pivot tables
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    # Visualize the combined pivot table
    # visualize_ptbl_win(ptbl_win)
    frameworks = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(frameworks)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # Skip if NaN
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, frameworks[m_a]] = +math.log(BASE)
            X[cur_row, frameworks[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            X[cur_row + 1, frameworks[m_a]] = math.log(BASE)
            X[cur_row + 1, frameworks[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=frameworks.index).sort_values(ascending=False)

In [21]:
framework_rankings = compute_mle_elo_frameworks(frameworkratings)
preety_print_model_ratings(framework_rankings)

(6, 6)
(6, 6)
(5, 6)
(6, 6)


Unnamed: 0,Model,Elo rating
1,langchain,1123.1
2,anthropic tool use,1117.37
3,llamaindex,1040.58
4,openai assistants,978.6
5,crewai,915.86
6,composio,824.48


# ELO Rating Combined

To improve the combined subcomponent ratings, we should create a larger feature matrix where each row represents a battle and each column represents the combination of models, tools, and frameworks used. This will allow us to account for confounding factors between frequently co-occurring components (e.g., a tool frequently paired with a stronger model) and ensure we make full use of the data, even in battles where models are the same but tools differ. This approach will yield more accurate estimates by considering all component combinations in a unified framework.









In [23]:
def extract_agent_info(agent_data):
    return {
        'name': agent_data['Agent name'],
        'frameworks': agent_data['Frameworks'],
        'models': agent_data['Models'],
        'tools': agent_data['Tools']
    }

# Extract information for Agent A and Agent B
ratings['Agent_A_info'] = ratings['Agent_A'].apply(extract_agent_info)
ratings['Agent_B_info'] = ratings['Agent_B'].apply(extract_agent_info)

# Prepare the ratings data for Elo calculation
def prepare_ratings_data(ratings_df):
    # Create a new DataFrame with the required structure
    prepared_df = pd.DataFrame({
        'leftAgent': ratings_df['Agent_A'].apply(lambda x: x['Agent name']),
        'rightAgent': ratings_df['Agent_B'].apply(lambda x: x['Agent name']),
        'tools_left': ratings_df['Agent_A_info'].apply(lambda x: x['tools']),
        'tools_right': ratings_df['Agent_B_info'].apply(lambda x: x['tools']),
        'models_left': ratings_df['Agent_A_info'].apply(lambda x: x['models']),
        'models_right': ratings_df['Agent_B_info'].apply(lambda x: x['models']),
        'frameworks_left': ratings_df['Agent_A_info'].apply(lambda x: x['frameworks']),
        'frameworks_right': ratings_df['Agent_B_info'].apply(lambda x: x['frameworks']),
        'rating': ratings_df['Rating']
    })

    # Instead of exploding, we'll create a single row for each battle
    prepared_df['tools_left'] = prepared_df['tools_left'].apply(lambda x: x[0] if x else None)
    prepared_df['tools_right'] = prepared_df['tools_right'].apply(lambda x: x[0] if x else None)
    prepared_df['models_left'] = prepared_df['models_left'].apply(lambda x: x[0] if x else None)
    prepared_df['models_right'] = prepared_df['models_right'].apply(lambda x: x[0] if x else None)
    prepared_df['frameworks_left'] = prepared_df['frameworks_left'].apply(lambda x: x[0] if x else None)
    prepared_df['frameworks_right'] = prepared_df['frameworks_right'].apply(lambda x: x[0] if x else None)

    return prepared_df

ratings_prepared = prepare_ratings_data(ratings)

# Now you can use ratings_prepared for your Elo calculations or other analyses

# Example: Print the first few rows of the prepared data
print(ratings_prepared.head())

# Example: Get unique agents
unique_agents = set(ratings_prepared['leftAgent'].unique()) | set(ratings_prepared['rightAgent'].unique())
print(f"Number of unique agents: {len(unique_agents)}")

                                          leftAgent  \
0  langchain brave-search agent (gpt-4o-2024-08-06)   
1  langchain brave-search agent (gpt-4o-2024-08-06)   
2  langchain brave-search agent (gpt-4o-2024-08-06)   
3  langchain brave-search agent (gpt-4o-2024-08-06)   
4   sql agent plotter langchain (gpt-4o-2024-08-06)   

                                          rightAgent    tools_left  \
0  langchain google-serper search agent (gpt-4o-2...  brave-search   
1  langchain google-serper search agent (gpt-4o-2...  brave-search   
2  langchain google-serper search agent (gpt-4o-2...  brave-search   
3  langchain google-serper search agent (gpt-4o-2...  brave-search   
4  langchain ArXiv Article Fetcher (gpt-4o-2024-0...           sql   

     tools_right        models_left       models_right frameworks_left  \
0  google-serper  gpt-4o-2024-08-06  gpt-4o-2024-08-06       langchain   
1  google-serper  gpt-4o-2024-08-06  gpt-4o-2024-08-06       langchain   
2  google-serper  gpt-4o-2

In [24]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import math

def compute_mle_elo_combined(ratings_df, SCALE=400, BASE=10, INIT_RATING=1000):
    # Create index for all unique components (tools, models, frameworks)
    all_tools = pd.Index(ratings_df['tools_left'].dropna().unique().tolist() + ratings_df['tools_right'].dropna().unique().tolist())
    all_models = pd.Index(ratings_df['models_left'].dropna().unique().tolist() + ratings_df['models_right'].dropna().unique().tolist())
    all_frameworks = pd.Index(ratings_df['frameworks_left'].dropna().unique().tolist() + ratings_df['frameworks_right'].dropna().unique().tolist())

    all_components = pd.Index(all_tools.tolist() + all_models.tolist() + all_frameworks.tolist())

    p = len(all_components)
    X = np.zeros([ratings_df.shape[0] * 2, p])  # 2 rows per battle
    Y = np.zeros(ratings_df.shape[0] * 2)
    sample_weights = []
    cur_row = 0

    for i, row in ratings_df.iterrows():
        # Get the indices of the subcomponents
        left_tool_idx = all_components.get_loc(row['tools_left']) if pd.notna(row['tools_left']) else None
        right_tool_idx = all_components.get_loc(row['tools_right']) if pd.notna(row['tools_right']) else None
        left_model_idx = all_components.get_loc(row['models_left']) if pd.notna(row['models_left']) else None
        right_model_idx = all_components.get_loc(row['models_right']) if pd.notna(row['models_right']) else None
        left_framework_idx = all_components.get_loc(row['frameworks_left']) if pd.notna(row['frameworks_left']) else None
        right_framework_idx = all_components.get_loc(row['frameworks_right']) if pd.notna(row['frameworks_right']) else None

        # Check for tool category overlap
        left_tool_categories = set(row['tool_categories_left'])
        right_tool_categories = set(row['tool_categories_right'])
        tool_category_overlap = len(left_tool_categories.intersection(right_tool_categories)) > 0

        # Set the design matrix for the left and right subcomponents
        if left_tool_idx is not None and right_tool_idx is not None and tool_category_overlap:
            X[cur_row, left_tool_idx] = +math.log(BASE)
            X[cur_row, right_tool_idx] = -math.log(BASE)
        if left_model_idx is not None and right_model_idx is not None:
            X[cur_row, left_model_idx] = +math.log(BASE)
            X[cur_row, right_model_idx] = -math.log(BASE)
        if left_framework_idx is not None and right_framework_idx is not None:
            X[cur_row, left_framework_idx] = +math.log(BASE)
            X[cur_row, right_framework_idx] = -math.log(BASE)

        # Define the outcome for this row
        Y[cur_row] = 1.0 if row['rating'] == 'A is better' else 0.0
        sample_weights.append(1)
        cur_row += 1

        # Reverse for the next row
        if left_tool_idx is not None and right_tool_idx is not None and tool_category_overlap:
            X[cur_row, left_tool_idx] = -math.log(BASE)
            X[cur_row, right_tool_idx] = +math.log(BASE)
        if left_model_idx is not None and right_model_idx is not None:
            X[cur_row, left_model_idx] = -math.log(BASE)
            X[cur_row, right_model_idx] = +math.log(BASE)
        if left_framework_idx is not None and right_framework_idx is not None:
            X[cur_row, left_framework_idx] = -math.log(BASE)
            X[cur_row, right_framework_idx] = +math.log(BASE)

        # Define the outcome for the reversed row
        Y[cur_row] = 0.0 if row['rating'] == 'A is better' else 1.0
        sample_weights.append(1)
        cur_row += 1

    X = X[:cur_row]
    Y = Y[:cur_row]

    # Logistic Regression
    lr = LogisticRegression(fit_intercept=False, tol=1e-6, penalty='l2', C=1.0, solver='liblinear')
    lr.fit(X, Y, sample_weight=sample_weights)

    # Scale the coefficients to get Elo ratings
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Split back the scores
    tool_elo = pd.Series(elo_scores[:len(all_tools)], index=all_tools)
    model_elo = pd.Series(elo_scores[len(all_tools):len(all_tools) + len(all_models)], index=all_models)
    framework_elo = pd.Series(elo_scores[len(all_tools) + len(all_models):], index=all_frameworks)

    return tool_elo.sort_values(ascending=False), model_elo.sort_values(ascending=False), framework_elo.sort_values(ascending=False)

# Prepare the ratings data for Elo calculation
def prepare_ratings_data(ratings_df):
    prepared_df = pd.DataFrame({
        'leftAgent': ratings_df['Agent_A'].apply(lambda x: x['Agent name']),
        'rightAgent': ratings_df['Agent_B'].apply(lambda x: x['Agent name']),
        'tools_left': ratings_df['Agent_A_info'].apply(lambda x: x['tools'][0] if x['tools'] else None),
        'tools_right': ratings_df['Agent_B_info'].apply(lambda x: x['tools'][0] if x['tools'] else None),
        'models_left': ratings_df['Agent_A_info'].apply(lambda x: x['models'][0] if x['models'] else None),
        'models_right': ratings_df['Agent_B_info'].apply(lambda x: x['models'][0] if x['models'] else None),
        'frameworks_left': ratings_df['Agent_A_info'].apply(lambda x: x['frameworks'][0] if x['frameworks'] else None),
        'frameworks_right': ratings_df['Agent_B_info'].apply(lambda x: x['frameworks'][0] if x['frameworks'] else None),
        'tool_categories_left': ratings_df['Agent_A'].apply(lambda x: x['Tool Categories']),
        'tool_categories_right': ratings_df['Agent_B'].apply(lambda x: x['Tool Categories']),
        'rating': ratings_df['Rating']
    })
    return prepared_df

# Prepare the data and calculate the Elo ratings
ratings_prepared = prepare_ratings_data(ratings)
tool_elo, model_elo, framework_elo = compute_mle_elo_combined(ratings_prepared)

# Output the results
print("Tool Elo Ratings:\n", tool_elo)
print("Model Elo Ratings:\n", model_elo)
print("Framework Elo Ratings:\n", framework_elo)

Tool Elo Ratings:
 riza-code-interpreter   1228.03
riza-code-interpreter   1228.03
dall-e                  1132.32
dall-e                  1132.32
shell                   1027.20
                          ...  
asknews                  806.86
google-lens              798.16
google-lens              798.16
file-search              788.80
file-search              788.80
Length: 67, dtype: float64
Model Elo Ratings:
 llama-3.1-70B-instruct       1045.42
llama-3.1-70B-instruct       1045.42
open-mixtral-8x7b            1041.94
open-mixtral-8x7b            1041.94
gemini-1.5-flash-002         1038.01
gemini-1.5-flash-002         1038.01
gpt-4-turbo-2024-04-09       1030.83
gpt-4-turbo-2024-04-09       1030.83
open-mixtral-8x22b           1028.28
open-mixtral-8x22b           1028.28
gpt-4o-2024-05-13            1024.76
gpt-4o-2024-05-13            1024.76
gpt-4o-2024-08-06            1023.52
gpt-4o-2024-08-06            1023.52
claude-3-opus-20240229       1023.26
claude-3-opus-20240229     

In [25]:
import pandas as pd

def save_elo_ratings_as_text(tool_elo, model_elo, framework_elo, tool_file='tool_elo_ratings.txt', model_file='model_elo_ratings.txt', framework_file='framework_elo_ratings.txt'):
    # Save tool ratings to a separate text file
    with open(tool_file, 'w') as f:
        f.write(tool_elo.to_string(header=True))
    print(f"Tool Elo ratings saved to {tool_file}")

    # Save model ratings to a separate text file
    with open(model_file, 'w') as f:
        f.write(model_elo.to_string(header=True))
    print(f"Model Elo ratings saved to {model_file}")

    # Save framework ratings to a separate text file
    with open(framework_file, 'w') as f:
        f.write(framework_elo.to_string(header=True))
    print(f"Framework Elo ratings saved to {framework_file}")

# Example usage:
save_elo_ratings_as_text(tool_elo, model_elo, framework_elo, 
                         tool_file='tool_elo_ratings.txt', 
                         model_file='model_elo_ratings.txt', 
                         framework_file='framework_elo_ratings.txt')


Tool Elo ratings saved to tool_elo_ratings.txt
Model Elo ratings saved to model_elo_ratings.txt
Framework Elo ratings saved to framework_elo_ratings.txt


In [26]:
def save_tool_ratings_by_category(tool_elo, ratings_df, output_file='tool_ratings_by_category.txt'):
    # Define the specific categories
    categories = [
        'Search Engines',
        'Simple Math',
        'Knowledge Bases',
        'Math/CS Academic Search',
        'Code Interpreter'
    ]

    # Create a dictionary to map tools to their categories
    tool_to_categories = {}

    # Loop through the DataFrame and map tools to their categories
    for i, row in ratings_df.iterrows():
        for side in ['left', 'right']:
            tool = row[f'tools_{side}']
            tool_categories = row[f'tool_categories_{side}']
            if pd.notna(tool):
                if tool not in tool_to_categories:
                    tool_to_categories[tool] = set()
                tool_to_categories[tool].update(cat for cat in tool_categories if cat in categories)

    # Write the tool Elo ratings by category to a text file
    with open(output_file, 'w') as f:
        for category in categories:
            f.write(f"Category: {category}\n")
            for tool, tool_categories in tool_to_categories.items():
                if category in tool_categories:
                    if tool in tool_elo.index:
                        rating = tool_elo.loc[tool]
                        if isinstance(rating, pd.Series):
                            rating = rating.iloc[0]  # Take the first value if it's a Series
                        if pd.notna(rating):
                            f.write(f"{tool}: {rating:.2f}\n")
                        else:
                            f.write(f"{tool}: No rating available\n")
                    else:
                        f.write(f"{tool}: No rating available\n")
            f.write("\n")
    print(f"Tool ratings by category saved to {output_file}")

# Save the tool ratings by category in a text file
save_tool_ratings_by_category(tool_elo, ratings_prepared, output_file='tool_ratings_by_category.txt')

Tool ratings by category saved to tool_ratings_by_category.txt
