In [1]:
import pandas as pd

# Read the CSV files
df_stan = pd.read_csv('../results/results_stan.csv')
df_nico = pd.read_csv('../results/results_nico.csv')

# Rename stan columns before the merge: player_1_won20240324081115 to player_1_won
df_stan.rename(columns={' player_1_won20240324081115': ' player_1_won'}, inplace=True)

# Merge the dataframes in a df DataFrame
df = pd.concat([df_stan, df_nico])

# Strip all whitespaces from the column names
df.columns = df.columns.str.strip()

df_stan.columns = df_stan.columns.str.strip()

print(f"df shape: {df.shape}")
print("stan df shape: ", df_stan.shape)

# Print the DataFrame
df.head()

df shape: (342, 11)
stan df shape:  (111, 11)


Unnamed: 0,id,player_1_model,player_1_temperature,player_2_model,player_2_temperature,player_1_won,openai:gpt-3.5-turbo-0125,0.0,openai:gpt-3.5-turbo-0125.1,0.0.1,False
0,20240324081116,mistral:mistral-small-latest,0.0,openai:gpt-3.5-turbo-0125,0.0,True,,,,,
1,20240324081132,mistral:mistral-small-latest,0.0,openai:gpt-4-0125-preview,0.0,True,,,,,
2,20240324081140,mistral:mistral-large-latest,0.0,openai:gpt-3.5-turbo-0125,0.0,False,,,,,
3,20240324081140,openai:gpt-4-0125-preview,0.0,mistral:mistral-small-latest,0.0,False,,,,,
4,20240324081154,mistral:mistral-medium-latest,0.0,mistral:mistral-medium-latest,0.0,False,,,,,


In [28]:
# Unique values for player_1_model
model_names

array([' mistral:mistral-small-latest', ' mistral:mistral-large-latest',
       ' openai:gpt-4-0125-preview', ' mistral:mistral-medium-latest',
       ' openai:gpt-4', ' openai:gpt-3.5-turbo-0125',
       ' openai:gpt-4-1106-preview', ' openai:gpt-4-turbo-preview'],
      dtype=object)

In [2]:
# Check the type of player_1_won
print(f"player_1_won type: {df['player_1_won'].dtype}")

for index, row in df.iterrows():
    print(type(row["player_1_won"]))
    break

# Check the value types
print(f"player_1_won value types: {df['player_1_won'].value_counts()}")

player_1_won type: object
<class 'str'>
player_1_won value types: player_1_won
False    175
True     167
Name: count, dtype: int64


In [21]:
# Get the number of wins for each model
player_nb_wins = {player_id: 0 for player_id in pd.concat([df['player_1_model'], df['player_2_model']]).unique()}

# Go over the rows of the DataFrame
for index, row in df.iterrows():
    if row["player_1_won"] == "True":
        player_nb_wins[row["player_1_model"]] += 1
    else:
        player_nb_wins[row["player_2_model"]] += 1

print("Number of wins for each model:")
print(player_nb_wins)

Number of wins for each model:
{' mistral:mistral-small-latest': 42, ' mistral:mistral-large-latest': 41, ' openai:gpt-4-0125-preview': 62, ' mistral:mistral-medium-latest': 37, ' openai:gpt-4': 76, ' openai:gpt-3.5-turbo-0125': 55, ' openai:gpt-4-1106-preview': 28, ' openai:gpt-4-turbo-preview': 1}


## Get the relative scores

In [3]:
# Get the win rate of each model

# Get a list of each model name in player_1_model and player_2_model
model_names = pd.concat([df['player_1_model'], df['player_2_model']]).unique()

# Go over the rows of the DataFrame
model_wins = {model_name: 0 for model_name in model_names}

for index, row in df.iterrows():
    if row["player_1_won"] == "True":
        model_wins[row["player_1_model"]] += 1
    else:
        model_wins[row["player_2_model"]] += 1

print("Number of wins for each model:")
print(model_wins)

Number of wins for each model:
{' mistral:mistral-small-latest': 42, ' mistral:mistral-large-latest': 41, ' openai:gpt-4-0125-preview': 62, ' mistral:mistral-medium-latest': 37, ' openai:gpt-4': 76, ' openai:gpt-3.5-turbo-0125': 55, ' openai:gpt-4-1106-preview': 28, ' openai:gpt-4-turbo-preview': 1}


In [4]:
# Get the number of fight for each model
model_fights = {model_name: 0 for model_name in model_names}

for index, row in df.iterrows():
    model_fights[row["player_1_model"]] += 1
    model_fights[row["player_2_model"]] += 1

print("Number of fights for each model:")
print(model_fights)

Number of fights for each model:
{' mistral:mistral-small-latest': 132, ' mistral:mistral-large-latest': 112, ' openai:gpt-4-0125-preview': 85, ' mistral:mistral-medium-latest': 128, ' openai:gpt-4': 111, ' openai:gpt-3.5-turbo-0125': 78, ' openai:gpt-4-1106-preview': 37, ' openai:gpt-4-turbo-preview': 1}


In [5]:
# Get the win rate of each model
model_win_rate = {model_name: model_wins[model_name] / model_fights[model_name] for model_name in model_names}

# Make it a nice dataframe for visualization
df_win_rate = pd.DataFrame(model_win_rate.items(), columns=["model_name", "win_rate"])

# Order it by desceding win rate
df_win_rate = df_win_rate.sort_values("win_rate", ascending=False)

df_win_rate.head(n=len(model_names))

Unnamed: 0,model_name,win_rate
7,openai:gpt-4-turbo-preview,1.0
6,openai:gpt-4-1106-preview,0.756757
2,openai:gpt-4-0125-preview,0.729412
5,openai:gpt-3.5-turbo-0125,0.705128
4,openai:gpt-4,0.684685
1,mistral:mistral-large-latest,0.366071
0,mistral:mistral-small-latest,0.318182
3,mistral:mistral-medium-latest,0.289062


## Calculate the ELO rating for each model

In [2]:
# Get the model 
model_names = pd.concat([df['player_1_model'], df['player_2_model']]).unique()

# Initialize player ratings
player_ratings = {player_id: 1500 for player_id in model_names}

def elo_expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

def elo_update(winner_rating, loser_rating, k=32):
    expected_score_winner = elo_expected_score(winner_rating, loser_rating)
    expected_score_loser = 1 - expected_score_winner
    new_winner_rating = winner_rating + k * (1 - expected_score_winner)
    new_loser_rating = loser_rating + k * (0 - expected_score_loser)
    return new_winner_rating, new_loser_rating

# Iterate through matches to update ELO ratings
for index, row in df.iterrows():
    player1, player2, player1_won = row['player_1_model'], row['player_2_model'], row['player_1_won']

    # If it's a match against itself, skip
    if player1 == player2:
        print(f"Skipping match between {player1} and {player2}")
        continue
    
    if player1_won == " True":
        winner, loser = player1, player2
    else:
        winner, loser = player2, player1
    
    new_winner_rating, new_loser_rating = elo_update(player_ratings[winner], player_ratings[loser])
    
    player_ratings[winner] = new_winner_rating
    player_ratings[loser] = new_loser_rating

# Print updated ratings
print(player_ratings)

# Make it a DataFrame so we can have a nice display
ratings_df = pd.DataFrame(player_ratings.items(), columns=['Model', 'Rating'])

# Sort the DataFrame by rating
ratings_df = ratings_df.sort_values(by='Rating', ascending=False)

# Display the ratings
ratings_df.head(n=len(model_names))

Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
Skipping match between  openai:gpt-4-0125-preview and  openai:gpt-4-0125-preview
Skipping match between  openai:gpt-4-0125-preview and  openai:gpt-4-0125-preview
Skipping match between  openai:gpt-4 and  openai:gpt-4
Skipping match between  openai:gpt-3.5-turbo-0125 and  openai:gpt-3.5-turbo-0125
Skipping match between  openai:gpt-4-0125-preview and  openai:gpt-4-0125-preview
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
Skipping match between  openai:gpt-4 and  openai:gpt-4
Skipping match between  mistral:mistral-small-latest and  mistral:mistral-small-latest
Skipping match between  mistral:mi

Unnamed: 0,Model,Rating
5,openai:gpt-3.5-turbo-0125,1776.109628
0,mistral:mistral-small-latest,1586.156742
6,openai:gpt-4-1106-preview,1584.781485
4,openai:gpt-4,1517.200818
7,openai:gpt-4-turbo-preview,1509.277531
2,openai:gpt-4-0125-preview,1438.923126
3,mistral:mistral-medium-latest,1356.186879
1,mistral:mistral-large-latest,1231.36379


In [4]:
mdwn = ratings_df.to_markdown()

In [6]:
print(mdwn)

|    | Model                         |   Rating |
|---:|:------------------------------|---------:|
|  5 | openai:gpt-3.5-turbo-0125     |  1776.11 |
|  0 | mistral:mistral-small-latest  |  1586.16 |
|  6 | openai:gpt-4-1106-preview     |  1584.78 |
|  4 | openai:gpt-4                  |  1517.2  |
|  7 | openai:gpt-4-turbo-preview    |  1509.28 |
|  2 | openai:gpt-4-0125-preview     |  1438.92 |
|  3 | mistral:mistral-medium-latest |  1356.19 |
|  1 | mistral:mistral-large-latest  |  1231.36 |


In [11]:
# Get the model names
model_names = pd.concat([df_stan['player_1_model'], df_stan['player_2_model']]).unique()
row_size = 0

player_ratings = {}

# Initialize player ratings
player_ratings = {player_id: 1500 for player_id in model_names}

def elo_expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

def elo_update(winner_rating, loser_rating, k=32):
    expected_score_winner = elo_expected_score(winner_rating, loser_rating)
    expected_score_loser = 1 - expected_score_winner
    new_winner_rating = winner_rating + k * (1 - expected_score_winner)
    new_loser_rating = loser_rating + k * (0 - expected_score_loser)
    return new_winner_rating, new_loser_rating

# Iterate through matches to update ELO ratings
for index, row in df_stan.iterrows():
    player1, player2, player1_won = row['player_1_model'], row['player_2_model'], row['player_1_won']

    print(f"-{player1_won}-")

    # If it's a match against itself, skip
    if player1 == player2:
        print(f"Skipping match between {player1} and {player2}")
        continue

    row_size += 1
    
    if player1_won == " True":
        print("player1_won")
        winner, loser = player1, player2
    else:
        winner, loser = player2, player1
    
    new_winner_rating, new_loser_rating = elo_update(player_ratings[winner], player_ratings[loser])
    
    player_ratings[winner] = new_winner_rating
    player_ratings[loser] = new_loser_rating

# Print updated ratings
print(player_ratings)

# Make it a DataFrame so we can have a nice display
ratings_df_stan = pd.DataFrame(player_ratings.items(), columns=['Model', 'Rating'])

# Sort the DataFrame by rating
ratings_df_stan = ratings_df_stan.sort_values(by='Rating', ascending=False)

# Display the ratings
ratings_df_stan.head(n=len(model_names))

- True-
player1_won
- True-
player1_won
- False-
- False-
- False-
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
- False-
- False-
- False-
- False-
- False-
- True-
player1_won
- False-
- False-
- True-
player1_won
- False-
Skipping match between  mistral:mistral-medium-latest and  mistral:mistral-medium-latest
- False-
- False-
- True-
player1_won
- False-
- False-
- True-
player1_won
- True-
player1_won
- True-
player1_won
- False-
- True-
player1_won
- True-
player1_won
- True-
player1_won
- True-
Skipping match between  openai:gpt-4-0125-preview and  openai:gpt-4-0125-preview
- True-
Skipping match between  openai:gpt-4-0125-preview and  openai:gpt-4-0125-preview
- False-
- True-
player1_won
- True-
player1_won
- False-
- False-
- False-
- False-
Skipping match between  openai:gpt-4 and  openai:gpt-4
- True-
player1_won
- True-
player1_won
- False-
- True-
player1_won
- True-
player1_won
- True-
player1_won
- False-
- True-
player1_won
- 

Unnamed: 0,Model,Rating
5,openai:gpt-3.5-turbo-0125,1756.591332
4,openai:gpt-4,1565.103125
0,mistral:mistral-small-latest,1551.234582
2,openai:gpt-4-0125-preview,1500.505211
1,mistral:mistral-large-latest,1344.39076
3,mistral:mistral-medium-latest,1282.174991


In [6]:
# Get the win rate of mistral-small
mistral_small_win_rate = df_win_rate[df_win_rate["model_name"] == " mistral:mistral-small-latest"]["win_rate"].values[0]

NameError: name 'df_win_rate' is not defined

In [29]:
df_stan.head()

Unnamed: 0,id,player_1_model,player_1_temperature,player_2_model,player_2_temperature,player_1_won,openai:gpt-3.5-turbo-0125,0.0,openai:gpt-3.5-turbo-0125.1,0.0.1,False
0,20240324081116,mistral:mistral-small-latest,0.0,openai:gpt-3.5-turbo-0125,0.0,True,,,,,
1,20240324081132,mistral:mistral-small-latest,0.0,openai:gpt-4-0125-preview,0.0,True,,,,,
2,20240324081140,mistral:mistral-large-latest,0.0,openai:gpt-3.5-turbo-0125,0.0,False,,,,,
3,20240324081140,openai:gpt-4-0125-preview,0.0,mistral:mistral-small-latest,0.0,False,,,,,
4,20240324081154,mistral:mistral-medium-latest,0.0,mistral:mistral-medium-latest,0.0,False,,,,,
