<a target="_blank" href="https://colab.research.google.com/github/Techtonique/nnetsauce/blob/master/nnetsauce/demo/thierrymoudiki_20240519_deep_qrns.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# 1 - utils

In [1]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install -q nnetsauce numpy scipy scikit-learn pandas tqdm joblib category_encoders ucimlrepo



In [2]:
import pandas
# Render floats in DataFrame output with a thousands separator and 4 decimals.
pandas.options.display.float_format = '{:,.4f}'.format

## 1 - 1 Replace nan with median

In [3]:
import numpy as np

def replace_nan_with_median(arr):
    """Fill the NaN entries of an array with the median of their column.

    Column medians are computed while ignoring NaNs. The array is modified
    in place and the very same object is returned.
    """
    column_medians = np.nanmedian(arr, axis=0)

    for j, fill_value in zip(range(arr.shape[1]), column_medians):
        column = arr[:, j]  # view: writing to it updates `arr`
        missing = np.isnan(column)
        if missing.any():
            column[missing] = fill_value

    return arr

## 1 - 2 Ids for UCIML repo data sets

In [4]:
# UCI ML repository ids, selected with the filters
# Data Type: Tabular; Task: Classification;
# Features 10 to 100; #Instances 100 to 1000
DATASETS_IDS_UCIML = [109, 759, 936,
                      915, 942, 890, 848,
                      967]

# Data set names understood by `load_data_sklearn`
DATASETS_SKLEARN = ["iris", "wine", "breast_cancer",
                    "covertype", "kddcup99", "adult"]

# Row count above which a data set is subsampled
NROWS = 1000

## 1 - 3 Obtain data sets of size at most (1000, 10)

In [5]:
def select_NROWS_ten(X, y):
    """Encode (X, y), keep at most 10 features and subsample large data sets.

    Steps:
      1. Features go through ``ce.HashingEncoder`` and are cast to float32;
         labels go through ``LabelEncoder`` and are cast to uint8.
      2. If there are more than 10 columns, a random forest (50 trees,
         random_state=42) is fitted and the 10 columns with the largest
         feature importances are kept.
      3. If there are more than ``NROWS`` rows, row indices are drawn with
         ``ns.SubSampler`` (seed=123).
      4. Remaining NaNs are replaced by column medians.

    Parameters
    ----------
    X : array-like of features (must expose ``.shape``).
    y : array-like of labels (must expose ``.shape``).

    Returns
    -------
    (X, y) : float32 feature array and uint8 label array.

    NOTE(review): labels are cast to uint8, so a response with more than
    256 distinct classes would wrap around -- confirm this never happens.
    """
    print(f"X.shape (initial): {X.shape}")
    print(f"y.shape (initial): {y.shape}")
    print("Encoding features and response...")
    le = LabelEncoder()
    encoder = ce.HashingEncoder(return_df=False)
    X = np.asarray(encoder.fit_transform(X, y)).astype(np.float32)
    y = np.asarray(le.fit_transform(y)).astype(np.uint8)
    print("Done.")
    print("Finding top 10 features if necessary...")
    if X.shape[1] > 10:
        rf = RandomForestClassifier(n_estimators=50, random_state=42)
        rf.fit(X, y)
        # Column indices sorted by decreasing importance
        indices = np.argsort(rf.feature_importances_)[::-1]
        top_ten_indices = indices[:10]
        print(f"Top 10 indices: {top_ten_indices}")
        X = X[:, top_ten_indices]
        print(f"X reduced shape: {X.shape}")
    print("Done.")
    if X.shape[0] > NROWS:
        print(f"Subsampling to {NROWS} if necessary...")
        start = time()
        sub = ns.SubSampler(y=y.ravel().astype(np.uint8),
                            n_samples=NROWS, seed=123, n_jobs=-1)
        idx_rows = sub.subsample().ravel()
        print(f"... Elapsed time for subsampling: {time() - start}")
        print("Number of rows in the subsample: ", len(idx_rows))
        print("Rows in the subsample: ", idx_rows)
        return_X = replace_nan_with_median(X[idx_rows, :])
        return_y = y[idx_rows].ravel().astype(np.uint8)
        print("Done.")
        return return_X, return_y
    # Small data sets must be imputed too: previously only the subsampled
    # branch replaced NaNs, so this branch could return NaN entries.
    return replace_nan_with_median(X), y


# 2 - Download data sets from UCI ML repo and sklearn

## 2 - 1 UCI ML data sets

In [6]:
import joblib
import nnetsauce as ns
import numpy as np
import pandas as pd
from time import time
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

def load_uciml_data(dataset_id):
  """Fetch a UCI ML repository data set and reduce it with `select_NROWS_ten`.

  The targets are label-encoded and the features hash-encoded into a data
  frame. When there are more than ``NROWS`` observations, rows are drawn with
  ``ns.SubSampler`` (seed=123). Missing feature values are then replaced by
  column medians before the result is handed to ``select_NROWS_ten``.

  Parameters
  ----------
  dataset_id : value convertible to int; id passed to ``fetch_ucirepo``.

  Returns
  -------
  Whatever ``select_NROWS_ten`` returns for the prepared (X, y).
  """
  # fetch dataset
  dataset = fetch_ucirepo(id=int(dataset_id))
  le = LabelEncoder()
  encoder = ce.HashingEncoder(return_df=True)
  # data (as pandas dataframes)
  y = le.fit_transform(dataset.data.targets.values)
  X = encoder.fit_transform(dataset.data.features, y)
  if len(y) > NROWS:
    print("Subsampling...")
    start = time()
    sub = ns.SubSampler(y=y.ravel().astype(np.uint8),
                        n_samples=NROWS,
                        seed=123,
                        n_jobs=-1)
    # Flatten the indices, as select_NROWS_ten does, so that positional
    # indexing always receives a 1-D index array.
    idx_rows = np.asarray(sub.subsample()).ravel()
    # Indexing with an index array already yields new objects; no extra copy needed.
    X, y = X.iloc[idx_rows, :], y[idx_rows]  # X is a data frame
    print(f"Elapsed time for subsampling: {time() - start}")
  # Non-inplace fill: avoids mutating a frame obtained through `.iloc`
  X = X.fillna(X.median())
  print(f"dataset_id: {dataset_id} --------------------")
  print(f"# of classes: {len(np.unique(y))}")
  return select_NROWS_ten(X.values.astype(np.float32), y.ravel().astype(np.uint8))

## 2 - 2 `sklearn`'s/TabSurvey real-world data

In [7]:
import category_encoders as ce
import sklearn.datasets
from sklearn.ensemble import RandomForestClassifier
import nnetsauce as ns
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine, load_iris, load_breast_cancer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from time import time
from functools import lru_cache


def discretize_colum(data_clm, num_values=10):
    """Discretize a column into `num_values` rank-based (quantile) bins.

    Each element is replaced by ``rank // bin_size`` where ``rank`` is the
    number of elements strictly smaller than it, so larger values never land
    in a smaller bin and tied values always share a bin.

    Parameters
    ----------
    data_clm : 1-D array-like or pandas Series of sortable values.
    num_values : int, number of bins requested (bin ids lie in
        ``0 .. num_values - 1``).

    Returns
    -------
    Bin ids as floats: a Series carrying the input's index and name when a
    Series was given, a numpy array otherwise.
    """
    values = np.asarray(data_clm)
    # Rank of every element. The previous code used np.argsort directly,
    # which yields the sorting permutation (positions of the sorted elements),
    # not the rank of each element, so bins were attached to the wrong rows.
    ranks = np.searchsorted(np.sort(values), values, side="left")
    bin_sz = (len(values) / num_values) + 1  # make sure all quantiles are in range 0-(num_values-1)
    q = ranks // bin_sz
    if isinstance(data_clm, pd.Series):
        return pd.Series(q, index=data_clm.index, name=data_clm.name)
    return q


def load_data_sklearn(dataset="covertype"):
    """Load one of the supported data sets and return it as (X, y).

    Supported names: "breast_cancer", "iris", "wine", "covertype",
    "kddcup99", "adult" and "adultcat". "iris" and "wine" are returned as
    loaded; every other data set is passed through ``select_NROWS_ten``.
    With "adultcat" the numerical columns of the adult data are discretized
    and converted to strings first.

    Parameters
    ----------
    dataset : str, name of the data set.

    Returns
    -------
    (X, y) : features and labels.

    Raises
    ------
    AttributeError
        If ``dataset`` is not one of the supported names.
    """

    print("Loading dataset " + dataset + "...")

    if dataset == "breast_cancer":
        loaded_dataset = load_breast_cancer()
        X, y = select_NROWS_ten(loaded_dataset.data, loaded_dataset.target)

    elif dataset == "iris":
        loaded_dataset = load_iris()
        X, y = loaded_dataset.data, loaded_dataset.target

    elif dataset == "wine":
        loaded_dataset = load_wine()
        X, y = loaded_dataset.data, loaded_dataset.target

    elif dataset == "covertype":  # Multi-class classification dataset
        X_temp, y_temp = sklearn.datasets.fetch_covtype(return_X_y=True)
        print("Is data frame: ", isinstance(X_temp, pd.DataFrame))
        X, y = select_NROWS_ten(X_temp, y_temp)

    elif dataset == "kddcup99":  # Multi-class classification dataset with categorical data
        X_temp, y_temp = sklearn.datasets.fetch_kddcup99(return_X_y=True)
        print("Is data frame: ", isinstance(X_temp, pd.DataFrame))
        X, y = select_NROWS_ten(X_temp, y_temp)

    elif dataset == "adult" or dataset == "adultcat":  # Binary classification dataset with categorical data, if you pass AdultCat, the numerical columns will be discretized.
        url_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

        features = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
        label = "income"
        columns = features + [label]
        df = pd.read_csv(url_data, names=columns)

        # Fill NaN with something better?
        # NOTE(review): the raw file may mark missing values with a "?" token
        # rather than empty fields; if so this fill does not catch them -- confirm.
        df = df.fillna(0)
        if dataset == "adultcat":
            columns_to_discr = [('age', 10), ('fnlwgt', 25), ('capital-gain', 10), ('capital-loss', 10),
                                ('hours-per-week', 10)]
            for clm, nvals in columns_to_discr:
                df[clm] = discretize_colum(df[clm], num_values=nvals)
                df[clm] = df[clm].astype(int).astype(str)
            df['education_num'] = df['education_num'].astype(int).astype(str)
        X_temp = df[features].to_numpy()
        y_temp = df[label].to_numpy()
        # Fall through to the common epilogue instead of returning early, so
        # this branch logs "Dataset loaded!" and the shape like the others.
        X, y = select_NROWS_ten(X_temp, y_temp)
    else:
        raise AttributeError("Dataset \"" + dataset + "\" not available")

    print("Dataset loaded!")
    print(X.shape)

    return X, y

# 3 - Create data sets

In [8]:
## check #1
#idx_dataset = 7 # 0 to 7
#load_uciml_data(DATASETS_IDS_UCIML[idx_dataset])

In [9]:
## check #2
#idx_dataset2 = 5 # 0 to 5
#load_data_sklearn(DATASETS_SKLEARN[idx_dataset2])

In [10]:
#Xys = []

#for idx_dataset in range(len(DATASETS_IDS_UCIML)):
#  Xys.append(load_uciml_data(DATASETS_IDS_UCIML[idx_dataset]))

#for idx_dataset2 in range(len(DATASETS_SKLEARN)):
#  Xys.append(load_data_sklearn(DATASETS_SKLEARN[idx_dataset2]))

In [11]:
import warnings
import joblib
#from google.colab import files

#warnings.filterwarnings('ignore')

#joblib.dump(Xys, "Xys.pkl")
#files.download("Xys.pkl")

# 4 - Try import + fit baselines

In [1]:
# %pip installs into the environment of the running kernel.
# tabpfn is pinned below 2.0 because this notebook calls
# TabPFNClassifier(N_ensemble_configurations=...), a keyword of the earlier API.
%pip install -q joblib "tabpfn<2" nnetsauce xgboost scikit-optimize



In [2]:
import joblib
import nnetsauce as ns
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from tabpfn import TabPFNClassifier
from time import time

import numpy as np
import pandas as pd

import warnings
import xgboost as xgb
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer, balanced_accuracy_score

In [3]:
# NOTE: joblib.load unpickles the file; only load a Xys.pkl you produced yourself.
Xys = joblib.load("../datasets/Xys.pkl")

result_columns = ["lazy_deep", "rf", "et", "tabpfn", "xgboost"]

# One row per data set, one column per method, filled with NaN until measured.
scores_array = np.full((len(Xys), len(result_columns)), np.nan)

# Each frame gets its OWN buffer. Building both frames from the same ndarray
# may make them share memory (pandas can wrap a 2-D array without copying),
# in which case writing a score would silently overwrite the timing stored at
# the same position, and vice versa.
scores_df = pd.DataFrame(scores_array.copy(), columns=result_columns)
timings_df = pd.DataFrame(scores_array.copy(), columns=result_columns)

In [4]:
# Sanity check: shapes of the features and of the response of every data set
for features, target in Xys:
    print(features.shape)
    print(target.shape)

(178, 10)
(178,)
(839, 10)
(839,)
(714, 10)
(714,)
(383, 9)
(383,)
(995, 10)
(995,)
(999, 10)
(999,)
(999, 10)
(999,)
(999, 10)
(999,)
(150, 4)
(150,)
(178, 13)
(178,)
(569, 10)
(569,)
(996, 10)
(996,)
(994, 8)
(994,)
(999, 8)
(999,)


In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint

# Scoring helper built on sklearn's balanced_accuracy_score, wrapped as a scorer object
def balanced_accuracy(y_true, y_pred):
    """Return the balanced accuracy of the predictions `y_pred` against `y_true`."""
    score = balanced_accuracy_score(y_true, y_pred)
    return score


balanced_accuracy_scorer = make_scorer(balanced_accuracy)

# Warning categories explicitly silenced while the benchmark runs
sklearn_warnings = (ConvergenceWarning, UndefinedMetricWarning, FutureWarning, UserWarning)


def _fit_and_score(name, clf, dataset_idx, X_train, X_test, y_train, y_test):
    """Fit `clf`, predict on the test split, and record timing and score.

    The elapsed time of fit + predict is written to ``timings_df`` and the
    balanced accuracy to ``scores_df``, at row ``dataset_idx`` and column
    ``name``. Both values are printed. Returns the predictions.
    """
    start = time()
    preds = clf.fit(X_train, y_train).predict(X_test)
    timings_df.loc[dataset_idx, name] = time()-start
    score = balanced_accuracy_score(y_test, preds)  # computed once, reused below
    print(f"Elapsed: {timings_df.loc[dataset_idx, name]}")
    print(f"  balanced accuracy {name}: {score}")
    scores_df.loc[dataset_idx, name] = score
    return preds


with warnings.catch_warnings():

  # Filter out all warnings, then the specified categories explicitly
  warnings.simplefilter("ignore")
  for warning_type in sklearn_warnings:
    warnings.filterwarnings("ignore", category=warning_type)

  for dataset_idx, (X, y) in enumerate(Xys):

    print(f"data set #:{dataset_idx + 1} --------------------")
    clf_lazydeep = ns.LazyDeepClassifier(verbose=0, ignore_warnings=True,
                                #estimators="all",
                                estimators=["AdaBoostRegressor",
                                            "BaggingRegressor",
                                            "ExtraTreesRegressor",
                                            "AdaBoostClassifier",
                                            "BaggingClassifier",
                                            "ExtraTreesClassifier",
                                            "SVC",
                                            "SVR"],
                                sort_by="Balanced Accuracy")
    clf_rf = RandomForestClassifier()
    clf_et = ExtraTreesClassifier()
    clf_tabpfn = TabPFNClassifier(device='cpu',
                                  N_ensemble_configurations=32)

    # Fixed XGBoost configuration. Earlier variants of this cell wrapped
    # XGBClassifier in BayesSearchCV / GridSearchCV (searching over eta,
    # max_depth, subsample, colsample_bytree); they are no longer used.
    clf_xgboost = xgb.XGBClassifier(random_state=13,
                                    n_estimators=1000,
                                    eta=0.01,
                                    max_depth=5,
                                    subsample=0.8,
                                    colsample_bytree=0.8)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    start = time()
    models, _ = clf_lazydeep.fit(X_train, X_test, y_train, y_test)
    timings_df.loc[dataset_idx, "lazy_deep"] = time()-start
    print(f"Elapsed: {timings_df.loc[dataset_idx, 'lazy_deep']}")
    # First row = best model after sorting by "Balanced Accuracy".
    # NOTE(review): column position 1 is assumed to hold the balanced accuracy -- confirm.
    print(f"  balanced accuracy lazydeep ({models.index[0]}): {models.iloc[0, 1]}")
    scores_df.loc[dataset_idx, "lazy_deep"] = models.iloc[0, 1]

    preds_rf = _fit_and_score("rf", clf_rf, dataset_idx,
                              X_train, X_test, y_train, y_test)

    preds_et = _fit_and_score("et", clf_et, dataset_idx,
                              X_train, X_test, y_train, y_test)

    preds_tabpfn = _fit_and_score("tabpfn", clf_tabpfn, dataset_idx,
                                  X_train, X_test, y_train, y_test)

    try:
      preds_xgboost = _fit_and_score("xgboost", clf_xgboost, dataset_idx,
                                     X_train, X_test, y_train, y_test)
    except ValueError as err:
      # Still best-effort (the xgboost cells stay NaN), but the failure is
      # reported instead of being swallowed silently.
      print(f"  xgboost failed on data set #{dataset_idx + 1}: {err!r}")
      


data set #:1 --------------------


100%|██████████| 12/12 [00:15<00:00,  1.29s/it]


Elapsed: 15.918371200561523
  balanced accuracy lazydeep (DeepSimpleMultitaskClassifier(AdaBoostRegressor)): 1.0
Elapsed: 0.5418291091918945
  balanced accuracy rf: 1.0
Elapsed: 0.3734550476074219
  balanced accuracy et: 1.0




Elapsed: 10.8556649684906
  balanced accuracy tabpfn: 1.0
Elapsed: 33.509448289871216
  balanced accuracy xgboost: 1.0
data set #:2 --------------------


100%|██████████| 12/12 [00:24<00:00,  2.02s/it]


Elapsed: 24.821156978607178
  balanced accuracy lazydeep (DeepMultitaskClassifier(AdaBoostRegressor)): 0.9240598228546537
Elapsed: 0.6184830665588379
  balanced accuracy rf: 0.8658341803397706
Elapsed: 0.6047379970550537
  balanced accuracy et: 0.8376651662552634




Elapsed: 57.47028183937073
  balanced accuracy tabpfn: 0.917017569333527
Elapsed: 70.97452807426453
  balanced accuracy xgboost: 0.9010454479454044
data set #:3 --------------------


100%|██████████| 12/12 [00:49<00:00,  4.09s/it]


Elapsed: 50.47680902481079
  balanced accuracy lazydeep (DeepCustomClassifier(AdaBoostClassifier)): 0.40251989389920423
Elapsed: 0.7725811004638672
  balanced accuracy rf: 0.3476765890558993
Elapsed: 0.6546797752380371
  balanced accuracy et: 0.3950781019746537




Elapsed: 52.246299743652344
  balanced accuracy tabpfn: 0.3333333333333333
Elapsed: 102.50091290473938
  balanced accuracy xgboost: 0.34981334119265156
data set #:4 --------------------


100%|██████████| 12/12 [00:25<00:00,  2.12s/it]


Elapsed: 26.25139617919922
  balanced accuracy lazydeep (DeepCustomClassifier(ExtraTreesClassifier)): 0.9245098039215687
Elapsed: 0.6670870780944824
  balanced accuracy rf: 0.8573529411764705
Elapsed: 0.5556387901306152
  balanced accuracy et: 0.8867647058823529




Elapsed: 24.03065299987793
  balanced accuracy tabpfn: 0.9245098039215687
Elapsed: 73.7140793800354
  balanced accuracy xgboost: 0.8450980392156863
data set #:5 --------------------


100%|██████████| 12/12 [01:00<00:00,  5.00s/it]


Elapsed: 60.720869064331055
  balanced accuracy lazydeep (DeepSimpleMultitaskClassifier(SVR)): 0.7982993197278913
Elapsed: 1.0290651321411133
  balanced accuracy rf: 0.7440476190476192




Elapsed: 0.6665999889373779
  balanced accuracy et: 0.7535714285714287




Elapsed: 70.9992790222168
  balanced accuracy tabpfn: 0.85
Elapsed: 113.80038595199585
  balanced accuracy xgboost: 0.6862244897959184
data set #:6 --------------------


100%|██████████| 12/12 [00:22<00:00,  1.88s/it]


Elapsed: 23.202695846557617
  balanced accuracy lazydeep (DeepSimpleMultitaskClassifier(ExtraTreesRegressor)): 0.868382710053424
Elapsed: 0.8924689292907715
  balanced accuracy rf: 0.8374210781932977
Elapsed: 0.584679365158081
  balanced accuracy et: 0.8460417678484702




Elapsed: 107.22565197944641
  balanced accuracy tabpfn: 0.868382710053424
Elapsed: 76.66879796981812
  balanced accuracy xgboost: 0.8805245264691597
data set #:7 --------------------


100%|██████████| 12/12 [01:08<00:00,  5.68s/it]


Elapsed: 69.21928882598877
  balanced accuracy lazydeep (DeepSimpleMultitaskClassifier(ExtraTreesRegressor)): 0.9340155257586451
Elapsed: 1.8620860576629639
  balanced accuracy rf: 0.9349228752898477
Elapsed: 1.4548838138580322
  balanced accuracy et: 0.9322008266962396




Elapsed: 87.02385401725769
  balanced accuracy tabpfn: 0.9175320092751285
Elapsed: 16.957337856292725
  balanced accuracy xgboost: 0.901048492791612
data set #:8 --------------------


100%|██████████| 12/12 [00:24<00:00,  2.04s/it]


Elapsed: 25.20522427558899
  balanced accuracy lazydeep (DeepCustomClassifier(AdaBoostClassifier)): 1.0
Elapsed: 1.4238839149475098
  balanced accuracy rf: 1.0
Elapsed: 0.5890712738037109
  balanced accuracy et: 1.0




Elapsed: 67.35388588905334
  balanced accuracy tabpfn: 1.0
Elapsed: 12.342914819717407
  balanced accuracy xgboost: 1.0
data set #:9 --------------------


100%|██████████| 12/12 [00:15<00:00,  1.26s/it]


Elapsed: 15.84173583984375
  balanced accuracy lazydeep (DeepCustomClassifier(BaggingClassifier)): 1.0
Elapsed: 0.6872539520263672
  balanced accuracy rf: 0.9393939393939394
Elapsed: 0.40033984184265137
  balanced accuracy et: 0.9696969696969697




Elapsed: 6.466933965682983
  balanced accuracy tabpfn: 0.9090909090909092
Elapsed: 29.612331867218018
  balanced accuracy xgboost: 0.9393939393939394
data set #:10 --------------------


100%|██████████| 12/12 [00:13<00:00,  1.16s/it]


Elapsed: 14.316891193389893
  balanced accuracy lazydeep (DeepSimpleMultitaskClassifier(AdaBoostRegressor)): 1.0
Elapsed: 0.5630757808685303
  balanced accuracy rf: 1.0
Elapsed: 0.4758110046386719
  balanced accuracy et: 1.0




Elapsed: 6.820991039276123
  balanced accuracy tabpfn: 0.9803921568627452
Elapsed: 22.6512131690979
  balanced accuracy xgboost: 1.0
data set #:11 --------------------


100%|██████████| 12/12 [00:18<00:00,  1.53s/it]


Elapsed: 18.744131088256836
  balanced accuracy lazydeep (DeepMultitaskClassifier(ExtraTreesRegressor)): 0.975609756097561
Elapsed: 0.7693641185760498
  balanced accuracy rf: 0.9687604410290678
Elapsed: 0.4544851779937744
  balanced accuracy et: 0.9687604410290678




Elapsed: 28.19751501083374
  balanced accuracy tabpfn: 0.975609756097561
Elapsed: 14.778143882751465
  balanced accuracy xgboost: 0.9687604410290678
data set #:12 --------------------


100%|██████████| 12/12 [00:39<00:00,  3.30s/it]


Elapsed: 39.87502908706665
  balanced accuracy lazydeep (DeepMultitaskClassifier(AdaBoostRegressor)): 0.5517618040873855
Elapsed: 1.0847280025482178
  balanced accuracy rf: 0.35260117789187556
Elapsed: 1.1039371490478516
  balanced accuracy et: 0.33899124131682273




Elapsed: 61.29803681373596
  balanced accuracy tabpfn: 0.33724327997583814
Elapsed: 102.93468689918518
  balanced accuracy xgboost: 0.4312216852914527
data set #:13 --------------------


100%|██████████| 12/12 [00:05<00:00,  2.27it/s]


Elapsed: 5.642559051513672
  balanced accuracy lazydeep (DeepCustomClassifier(ExtraTreesClassifier)): 0.5877551020408164
Elapsed: 0.5576310157775879
  balanced accuracy rf: 0.5795918367346939




Elapsed: 0.4942011833190918
  balanced accuracy et: 0.5877551020408164




Elapsed: 73.90874004364014
  balanced accuracy tabpfn: 0.5836734693877551
data set #:14 --------------------


100%|██████████| 12/12 [00:26<00:00,  2.21s/it]


Elapsed: 27.08159112930298
  balanced accuracy lazydeep (DeepMultitaskClassifier(ExtraTreesRegressor)): 0.5620039682539683
Elapsed: 0.8359658718109131
  balanced accuracy rf: 0.5238095238095238
Elapsed: 0.6228969097137451
  balanced accuracy et: 0.5223214285714286




Elapsed: 56.68902921676636
  balanced accuracy tabpfn: 0.5054563492063492
Elapsed: 9.532980918884277
  balanced accuracy xgboost: 0.5545634920634921


In [17]:
# Balanced accuracy per data set (rows) and method (columns)
display(scores_df)

Unnamed: 0,lazy_deep,rf,et,tabpfn,xgboost
0,1.0,1.0,1.0,1.0,1.0
1,0.92,0.87,0.84,0.92,0.83
2,0.4,0.35,0.4,0.33,0.33
3,0.92,0.86,0.89,0.92,0.85
4,0.8,0.74,0.75,0.85,0.69
5,0.87,0.84,0.85,0.87,0.88
6,0.93,0.93,0.93,0.92,0.93
7,1.0,1.0,1.0,1.0,1.0
8,1.0,0.94,0.97,0.91,0.97
9,1.0,1.0,1.0,0.98,1.0


In [18]:
# Rank the methods within each data set (row-wise); rank 1 = highest score.
# With method='max', tied methods all receive the largest rank of their tie
# group (e.g. a five-way tie gives rank 5 to every method).
scores_ranks_df = scores_df.rank(axis=1, method='max', ascending=False)
display(scores_ranks_df)

Unnamed: 0,lazy_deep,rf,et,tabpfn,xgboost
0,5.0,5.0,5.0,5.0,5.0
1,1.0,3.0,4.0,2.0,5.0
2,1.0,3.0,2.0,4.0,5.0
3,2.0,4.0,3.0,2.0,5.0
4,2.0,4.0,3.0,1.0,5.0
5,3.0,5.0,4.0,3.0,1.0
6,2.0,1.0,3.0,5.0,4.0
7,5.0,5.0,5.0,5.0,5.0
8,1.0,4.0,3.0,5.0,3.0
9,4.0,4.0,4.0,5.0,4.0


In [19]:
# Summary statistics of the per-data-set ranks (a lower rank means a higher score)
scores_ranks_df.describe()

Unnamed: 0,lazy_deep,rf,et,tabpfn,xgboost
count,14.0,14.0,14.0,14.0,13.0
mean,2.29,3.79,3.64,3.71,3.92
std,1.44,1.12,1.01,1.49,1.44
min,1.0,1.0,2.0,1.0,1.0
25%,1.0,3.0,3.0,2.25,3.0
50%,2.0,4.0,4.0,4.5,5.0
75%,2.75,4.75,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0
