In [24]:
import pandas as pd
import random
from copy import deepcopy

In [3]:
df = pd.read_csv('main.tsv', sep='\t')

In [11]:
models_list = ['SD3', 'SDXL', 'SDXL_Turbo', 'Kandinsky', 'PixArt_Sigma', 'Playground', 'IF', 'Openjourney', 'Hunyuan-DiT', 'SD_V1.5']

In [13]:
def sample_model_pair(models_list):
    return random.sample(models_list, 2)

# Adding the sampled pairs to the dataframe
df[['model_1', 'model_2']] = df.apply(lambda row: pd.Series(sample_model_pair(models_list)), axis=1)


In [17]:
df['model_1'].value_counts() + df['model_2'].value_counts()

Hunyuan-DiT     701
IF              685
Kandinsky       699
Openjourney     708
PixArt_Sigma    662
Playground      637
SD3             637
SDXL            646
SDXL_Turbo      661
SD_V1.5         704
dtype: int64

In [18]:
weights = [0.4, 0.4, 0.1, 0.1]  # Weights for values 0, 1, 2, 3, 4
values = [0, 1, 2, 3]

df['assessor_vitya'] = random.choices(values, weights=weights, k=len(df))

In [20]:
df['assessor_vitya'].value_counts()

1    1382
0    1333
2     332
3     323
Name: assessor_vitya, dtype: int64

In [21]:
df = pd.read_csv('main.tsv', sep='\t')

initial_elo = 1200
elo_ratings = {model: initial_elo for model in set(df['model_1']).union(df['model_2'])}

# Function to calculate expected score
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

# Function to update ELO ratings
def update_elo(rating_a, rating_b, score_a, k=32):
    expected_a = expected_score(rating_a, rating_b)
    new_rating_a = rating_a + k * (score_a - expected_a)
    new_rating_b = rating_b + k * ((1 - score_a) - (1 - expected_a))
    return new_rating_a, new_rating_b


for index, row in df.iterrows():
    model_1 = row['model_1']
    model_2 = row['model_2']
    outcome = row['assessor_vitya']
    
    if outcome == 0:  # model_1 wins
        score_1 = 1
        score_2 = 0
    elif outcome == 1:  # model_2 wins
        score_1 = 0
        score_2 = 1
    elif outcome == 2:  # tie
        score_1 = 0.5
        score_2 = 0.5
    elif outcome == 3:  # both models are bad
        continue  # No change in ratings for this outcome
    
    rating_1 = elo_ratings[model_1]
    rating_2 = elo_ratings[model_2]
    
    new_rating_1, new_rating_2 = update_elo(rating_1, rating_2, score_1)
    
    elo_ratings[model_1] = new_rating_1
    elo_ratings[model_2] = new_rating_2

    # Convert ratings_changes to dataframe
    changes_df = pd.DataFrame.from_dict(ratings_changes, orient='index').fillna(0).T

    # Calculate sandwich robust standard errors
    robust_cov = cov_hc0(changes_df)
    robust_std_errors = np.sqrt(np.diag(robust_cov))

    # Calculate confidence intervals
    z_score = 1.96  # For 95% confidence interval
    confidence_intervals = {
        model: (elo_ratings[model] - z_score * robust_std_errors[i], elo_ratings[model] + z_score * robust_std_errors[i])
        for i, model in enumerate(elo_ratings)
    }

    with open(output_file, 'w') as f:
        for model, rating in elo_ratings.items():
            ci = confidence_intervals[model]
            f.write(f"{model}: {rating} (95% CI: {ci[0]:.2f} - {ci[1]:.2f})\n")

# Convert the ELO ratings dictionary to a dataframe for display
elo_df = pd.DataFrame(list(elo_ratings.items()), columns=['Model', 'ELO Rating'])
elo_df = elo_df.sort_values(by='ELO Rating', ascending=False).reset_index(drop=True)

elo_txt_path = "elo_ratings.txt"
with open(elo_txt_path, 'w') as f:
    for model, rating in elo_ratings.items():
        f.write(f"{model}: {rating}\n")


In [59]:
import pandas as pd
import random
import argparse
import numpy as np
from statsmodels.stats.sandwich_covariance import cov_hc0

def expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

def update_elo(rating_a, rating_b, score_a, k=32):
    expected_a = expected_score(rating_a, rating_b)
    new_rating_a = rating_a + k * (score_a - expected_a)
    new_rating_b = rating_b + k * ((1 - score_a) - (1 - expected_a))
    return new_rating_a, new_rating_b

elo_ratings = {model: initial_elo for model in set(df['model_1']).union(df['model_2'])}
label_column='assessor_vitya'
output_file = 'ELO_test.txt'
ratings_changes = {model: [] for model in elo_ratings.keys()}

for index, row in df.iterrows():
    model_1 = row['model_1']
    model_2 = row['model_2']
    outcome = row[label_column]

    if outcome == 0:  # model_1 wins
        score_1 = 1
        score_2 = 0
    elif outcome == 1:  # model_2 wins
        score_1 = 0
        score_2 = 1
    elif outcome == 2:  # tie
        score_1 = 0.5
        score_2 = 0.5
    elif outcome == 3:  # both models are bad
        continue  # No change in ratings for this outcome

    rating_1 = elo_ratings[model_1]
    rating_2 = elo_ratings[model_2]

    new_rating_1, new_rating_2 = update_elo(rating_1, rating_2, score_1)

    ratings_changes[model_1].append(new_rating_1 - rating_1)
    ratings_changes[model_2].append(new_rating_2 - rating_2)

    elo_ratings[model_1] = new_rating_1
    elo_ratings[model_2] = new_rating_2

# Calculate standard deviation of rating changes
ratings_std = {model: np.std(changes) for model, changes in ratings_changes.items()}

# Calculate confidence intervals
z_score = 1.96  # For 95% confidence interval
confidence_intervals = {
    model: (elo_ratings[model] - z_score * ratings_std[model], elo_ratings[model] + z_score * ratings_std[model])
    for model in elo_ratings
}


with open(output_file, 'w') as f:
    for model, rating in elo_ratings.items():
        ci = confidence_intervals[model]
        f.write(f"{model}: {rating} (95% CI: {ci[0]:.2f} - {ci[1]:.2f})\n")




In [57]:
!pip install statsmodels

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting statsmodels
  Downloading statsmodels-0.14.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;4

In [None]:
df.drop(c)

In [47]:
# https://github.com/bjlkeng/Bradley-Terry-Model/blob/master/update_model.py

import numpy as np
import os
import pandas as pd
import time

from datetime import datetime
from collections import Counter

DUMMY_PLAYER = 'DUMMY PLAYER'

def extract_game_data(df):
    df1 = deepcopy(df[df['assessor_vitya'] < 2])
    df1['Player A'] = df1['model_1']
    df1['Player B'] = df1['model_2']
    df1['Wins A'] = df1['assessor_vitya'] == 0
    df1['Wins B'] = df1['assessor_vitya'] == 1

    # assert all(c in df.columns for c in ['Date', 'Player A', 'Player B', 'Wins A', 'Wins B']), \
    #     'Expecting columns Date, Player A, Player B, Wins A, Wins B'

   # df['Date'] = df['Date'].astype(datetime)
    df1['Wins A'] = df1['Wins A'].astype(int)
    df1['Wins B'] = df1['Wins B'].astype(int)

    df1 = df1.drop(columns = df.columns)

    return df1



def add_dummy_games(game_data, alpha=1):
    ''' Regularizes the estimate by adding games against a dummy player.

        :param alpha: regularization parameter, number dummy wins/loses to add
    '''
    players = sorted(list(set(game_data['Player A']) | set(game_data['Player B'])))

    # Add dummy games
    dummy_data = [[p, DUMMY_PLAYER, alpha, alpha] for p in players]
    df = pd.DataFrame(dummy_data, columns=game_data.columns)
    df = pd.concat([game_data, df])
    df

    return df


def compute_rank_scores(game_data, max_iters=1000, error_tol=1e-3):
    ''' Computes Bradley-Terry using iterative algorithm

        See: https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model
    '''
    # Do some aggregations for convenience
    # Total wins per player
    winsA = game_data.groupby('Player A').agg(sum)['Wins A'].reset_index()
    winsA = winsA[winsA['Wins A'] > 0]
    winsA.columns = ['Player', 'Wins']
    winsB = game_data.groupby('Player B').agg(sum)['Wins B'].reset_index()
    winsB = winsB[winsB['Wins B'] > 0]
    winsB.columns = ['Player', 'Wins']
    wins = pd.concat([winsA, winsB]).groupby('Player').agg(sum)['Wins']

    # Total games played between pairs
    num_games = Counter()
    for index, row in game_data.iterrows():
        key = tuple(sorted([row['Player A'], row['Player B']]))
        total = sum([row['Wins A'], row['Wins B']])
        num_games[key] += total

    # Iteratively update 'ranks' scores
    players = sorted(list(set(game_data['Player A']) | set(game_data['Player B'])))
    ranks = pd.Series(np.ones(len(players)) / len(players), index=players)
    for iters in range(max_iters):
        oldranks = ranks.copy()
        for player in ranks.index:
            denom = np.sum(num_games[tuple(sorted([player, p]))]
                           / (ranks[p] + ranks[player])
                           for p in ranks.index if p != player)
            ranks[player] = 1.0 * wins[player] / denom

        ranks /= sum(ranks)

        if np.sum((ranks - oldranks).abs()) < error_tol:
            break

    if np.sum((ranks - oldranks).abs()) < error_tol:
        print(f" * Converged after {iters} iterations.")
    else:
        print(f" * Max iterations reached ({max_iters} iters).")

    del ranks[DUMMY_PLAYER]

    # Scale logarithm of score to be between 1 and 1000
    ranks = ranks.sort_values(ascending=False) \
                 .apply(lambda x: np.log1p(1000 * x) / np.log1p(1000) * 1000) \
                 .astype(int) \
                 .clip(1)

    return ranks

df = pd.read_csv('main.tsv', sep='\t')

df1 = extract_game_data(df)
games = add_dummy_games(df1)
ranks = compute_rank_scores(games)

BT_txt_path = "BT_ratings.txt"
with open(BT_txt_path, 'w') as f:
    for model, rating in ranks.items():
        f.write(f"{model}: {rating}\n")

In [54]:
for model_name, rating in ranks.items():
    print(model_name, rating)

Kandinsky 666
Hunyuan-DiT 662
SD3 661
IF 660
SDXL 659
Openjourney 654
PixArt_Sigma 646
SDXL_Turbo 644
SD_V1.5 643
Playground 642


 * Converged after 5 iterations.


  winsA = game_data.groupby('Player A').agg(sum)['Wins A'].reset_index()
  winsB = game_data.groupby('Player B').agg(sum)['Wins B'].reset_index()
  denom = np.sum(num_games[tuple(sorted([player, p]))]


In [51]:
ranks

Kandinsky       666
Hunyuan-DiT     662
SD3             661
IF              660
SDXL            659
Openjourney     654
PixArt_Sigma    646
SDXL_Turbo      644
SD_V1.5         643
Playground      642
dtype: int64