In [1]:
import pandas as pd
import numpy as np
import sys

sys.path.append("..")

from votenrank import Leaderboard
from votenrank.iia_exp import compute_iia
from votenrank.data_processing import preprocess_glue, preprocess_sglue, preprocess_value

import warnings

# Lift pandas' display limits so DataFrames render without truncated cell
# text, hidden columns, or hidden rows.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Install a blanket "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings("ignore")

## GLUE

In [2]:
# Read the GLUE leaderboard CSV, then hand it to preprocess_glue, which returns
# the table previewed below and a second value kept as `glue_weights`.
# NOTE(review): head=30 presumably limits the table to the first 30 leaderboard
# rows — confirm against preprocess_glue.
glue_raw = pd.read_csv("../tables/leaderboards - GLUE.csv")
glue, glue_weights = preprocess_glue(glue_raw, head=30)
glue.head()

Unnamed: 0_level_0,CoLA,SST-2,MNLI-m,MNLI-mm,QNLI,RTE,WNLI,AX,MRPC_n1,MRPC_n2,STS-B_n1,STS-B_n2,QQP_n1,QQP_n2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ERNIE,75.5,97.8,92.3,91.7,97.3,92.6,95.9,51.7,93.9,91.8,93.0,92.6,75.2,90.9
StructBERT + CLEVER,75.3,97.7,91.7,91.5,97.4,92.5,95.2,49.1,93.9,91.9,93.5,93.1,75.6,90.8
DeBERTa / TuringNLRv4,71.5,97.5,91.9,91.6,99.2,93.2,94.5,53.2,94.0,92.0,92.9,92.6,76.2,90.8
DeBERTa + CLEVER,73.4,97.5,92.1,91.7,96.5,92.8,96.6,35.2,92.8,90.4,93.2,92.9,76.3,90.8
MacALBERT + DKM,74.8,97.0,91.3,91.1,97.8,92.0,94.5,52.6,94.5,92.6,92.8,92.6,74.7,90.6


In [3]:
# For every aggregation method: print its name, run compute_iia on the GLUE
# rows that contain no missing values, and print whatever it returns.
# NOTE(review): the saved outputs vary from repetition to repetition, so the
# experiment looks stochastic — consider fixing a seed once the RNG used by
# compute_iia is confirmed.
for method in ("borda", "copeland", "dowdall", "mean", "minimax", "plurality"):
    print(f"{method}:")
    iia_result = compute_iia(method, glue.dropna(), glue_weights, num_repetitions=50)
    print(iia_result, end="\n\n")

borda:


  0%|          | 0/50 [00:00<?, ?it/s]

(6.64, [6, 10, 5, 5, 6, 4, 13, 9, 3, 8, 4, 7, 8, 7, 9, 12, 6, 7, 5, 4, 2, 6, 3, 9, 3, 9, 4, 10, 4, 6, 9, 10, 6, 4, 4, 8, 5, 3, 7, 13, 10, 9, 7, 7, 7, 10, 4, 7, 4, 4])

copeland:


  0%|          | 0/50 [00:00<?, ?it/s]

(3.14, [5, 2, 3, 0, 3, 3, 3, 5, 4, 3, 3, 1, 3, 2, 4, 3, 2, 0, 2, 4, 4, 0, 1, 4, 5, 2, 4, 4, 6, 5, 5, 4, 2, 4, 2, 0, 4, 3, 2, 3, 2, 3, 4, 2, 1, 4, 6, 5, 6, 5])

dowdall:


  0%|          | 0/50 [00:00<?, ?it/s]

(8.68, [9, 8, 10, 7, 15, 6, 11, 9, 10, 10, 6, 7, 8, 7, 10, 6, 8, 9, 10, 10, 11, 10, 10, 11, 9, 14, 4, 7, 8, 8, 9, 7, 9, 11, 10, 13, 5, 9, 7, 6, 11, 6, 8, 8, 10, 5, 9, 3, 7, 13])

mean:


  0%|          | 0/50 [00:00<?, ?it/s]

(0.0, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

minimax:


  0%|          | 0/50 [00:00<?, ?it/s]

(2.66, [3, 7, 3, 1, 2, 2, 4, 4, 2, 3, 4, 1, 1, 3, 3, 4, 4, 2, 0, 2, 2, 1, 3, 2, 5, 4, 1, 1, 3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 4, 0, 1, 2, 2, 5, 2, 1, 5, 5, 5, 3])

plurality:


  0%|          | 0/50 [00:00<?, ?it/s]

(4.94, [5, 6, 4, 5, 5, 4, 7, 3, 4, 7, 5, 7, 5, 6, 5, 4, 2, 4, 7, 3, 6, 6, 7, 4, 5, 4, 5, 4, 4, 8, 5, 6, 5, 3, 6, 5, 4, 6, 4, 4, 3, 6, 5, 4, 4, 5, 4, 5, 6, 6])



## SuperGLUE

In [4]:
# Read the SuperGLUE leaderboard CSV and preprocess it into `sglue` (previewed
# below) plus the value kept as `sglue_weights`, which the following cell
# passes to compute_iia.
# preprocess_sglue is already imported in the notebook's first cell, so the
# duplicate import that used to sit in this cell has been dropped.
sglue, sglue_weights = preprocess_sglue(pd.read_csv("../tables/leaderboards - SuperGLUE.csv"))
sglue.head()

Unnamed: 0_level_0,BoolQ,COPA,RTE,WiC,WSC,AX-b,CB_n1,CB_n2,MultiRC_n1,MultiRC_n2,ReCoRD_n1,ReCoRD_n2,AX-g_n1,AX-g_n2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ERNIE 3.0,91.0,97.4,92.6,77.4,97.3,68.6,98.6,99.2,88.6,63.2,94.7,94.2,92.7,94.7
"T5 + UDG, Single Model (Google Brain)",91.4,98.0,93.0,77.9,96.6,69.1,95.8,97.6,88.3,63.0,94.2,93.5,92.7,91.9
DeBERTa / TuringNLRv4,90.4,98.4,93.2,77.5,95.9,66.7,95.7,97.6,88.2,63.7,94.5,94.1,93.3,93.8
SuperGLUE Human Baselines,89.0,100.0,93.6,80.0,100.0,76.6,95.8,98.9,81.8,51.9,91.7,91.3,99.3,99.7
T5,91.2,94.8,92.5,76.9,93.8,65.6,93.9,96.8,88.1,63.3,94.1,93.4,92.7,91.9


In [5]:
# For every aggregation method: print its name, run compute_iia on the
# SuperGLUE rows that contain no missing values, and print whatever it returns.
# NOTE(review): the saved outputs vary from repetition to repetition, so the
# experiment looks stochastic — consider fixing a seed once the RNG used by
# compute_iia is confirmed.
for method in ("borda", "copeland", "dowdall", "mean", "minimax", "plurality"):
    print(f"{method}:")
    iia_result = compute_iia(method, sglue.dropna(), sglue_weights, num_repetitions=50)
    print(iia_result, end="\n\n")

borda:


  0%|          | 0/50 [00:00<?, ?it/s]

(4.32, [2, 5, 4, 3, 1, 3, 6, 7, 5, 5, 6, 5, 7, 6, 6, 1, 6, 4, 4, 3, 3, 7, 6, 3, 1, 6, 3, 5, 2, 2, 3, 6, 3, 3, 5, 6, 4, 2, 6, 5, 6, 5, 2, 3, 4, 4, 5, 4, 7, 6])

copeland:


  0%|          | 0/50 [00:00<?, ?it/s]

(0.92, [1, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 3, 0, 0, 0, 2, 0, 2, 4, 0, 0, 0, 0, 2, 3, 1, 1, 2, 1, 0, 0, 1, 0, 0, 1, 3, 0, 2, 2, 1, 0, 0])

dowdall:


  0%|          | 0/50 [00:00<?, ?it/s]

(4.36, [6, 4, 1, 2, 4, 4, 2, 5, 4, 6, 5, 4, 6, 3, 5, 9, 4, 8, 1, 3, 7, 4, 1, 3, 6, 2, 3, 3, 13, 1, 2, 5, 3, 4, 8, 5, 4, 9, 6, 3, 10, 2, 4, 6, 1, 5, 3, 3, 0, 6])

mean:


  0%|          | 0/50 [00:00<?, ?it/s]

(0.0, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

minimax:


  0%|          | 0/50 [00:00<?, ?it/s]

(1.08, [1, 1, 1, 1, 2, 0, 2, 0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 3, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 2, 0, 2, 1, 3, 0, 0, 2, 1])

plurality:


  0%|          | 0/50 [00:00<?, ?it/s]

(1.76, [1, 2, 0, 1, 2, 3, 1, 3, 3, 1, 1, 1, 3, 4, 0, 2, 3, 2, 0, 2, 0, 2, 1, 4, 1, 3, 2, 3, 1, 2, 1, 2, 3, 3, 2, 1, 1, 1, 0, 2, 4, 2, 2, 0, 2, 1, 1, 2, 2, 2])

