In [136]:
from pauls_functions_advanced_v3 import *
from experiment_functions import *
import pandas as pd
from pmlb import fetch_data, classification_dataset_names
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [137]:
classification_dataset_names = classification_dataset_names[20:24]

In [138]:
from tqdm.auto import tqdm
from joblib import Parallel

class ProgressParallel(Parallel):
    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar:
            return Parallel.__call__(self, *args, **kwargs)

    def print_progress(self):
        if self._total is None:
            self._pbar.total = self.n_dispatched_tasks
        self._pbar.n = self.n_completed_tasks
        self._pbar.refresh()

In [139]:
def get_feature_type(x, include_binary=False):
    x.dropna(inplace=True)
    if not check_if_all_integers(x):
        return 'continuous'
    else:
        if x.nunique() > 10:
            return 'continuous'
        if include_binary:
            if x.nunique() == 2:
                return 'binary'
        return 'categorical'

def get_target_type(x, include_binary=False):
    x.dropna(inplace=True)
    if x.dtype=='float64':
        return 'continuous'
    elif x.dtype=='int64':
        if include_binary:
            if x.nunique() == 2:
                return 'binary'
        return 'categorical'
    else:
        raise ValueError("Error getting type")

def check_if_all_integers(x):
    "check a pandas.Series is made of all integers."
    return all(float(i).is_integer() for i in x.unique())
def corr_data_for(df):
    TARGET_NAME = 'target'
    feat_names = [col for col in df.columns if col!=TARGET_NAME]
    types = [get_feature_type(df[col], include_binary=True) for col in feat_names]
    col = pd.DataFrame(feat_names,types)
    num_col = col[col.index == 'continuous']
    bin_col = col[col.index == 'binary']
    cat_col = col[col.index == 'categorical']
    cat_col = cat_col[0].tolist()
    dummy_col = pd.get_dummies(data=df, columns=cat_col)
    add_col = dummy_col.shape[1] - df.shape[1]
    if (add_col < df.shape[0] *0.3) & (dummy_col.shape[1] <  df.shape[0]):
        df = dummy_col
        df.columns = df.columns.str.replace('.','_',regex=True)
    else:
        del df
        df = pd.DataFrame()
    return df, num_col, bin_col, cat_col

In [140]:
for data in classification_dataset_names:
    data = fetch_data(data)
    print(data.shape)

(100, 7)
(97, 11)
(92, 11)
(797, 5)


In [141]:
classification_dataset_names

['analcatdata_creditscore',
 'analcatdata_cyyoung8092',
 'analcatdata_cyyoung9302',
 'analcatdata_dmft']

In [146]:
def experimentation(classification_dataset):
    iters=2
    res_rul = {}
    sc = StandardScaler()
    names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
    df = fetch_data(classification_dataset)
    df, num_col, bin_col, cat_col = corr_data_for(df)

    y = df['target']
    X = df.loc[:, df.columns != 'target']
    #performance_by_iter = pd.DataFrame(columns = ["Logistic Regression", "CART_rules", "OCT_rules", "OCTH_rules", "CART_rules_and_features", "OCT_rules_and_features", "OCTH_rules_and_features"], index = np.arange(0, iters))
    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
    rows_data, columns_data = X.shape
    print('Dataset Information')
    print('Rows:',rows_data,)
    print('Columns:',columns_data)
    print('Number of classes:',y.nunique())
    print('Continous columns:', len(num_col))
    print('Binary columns:', len(bin_col))
    print('Categorical columns:',len(cat_col))
    print('-------------------------------------------------')
    for it in range(iters):
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = it, stratify=y)
        X_col = X_train.columns
        col_len = len(X_col)
        X_test.name = "X_test"
        X_train.name = "X_train"
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        X_train = pd.DataFrame(X_train,columns=X_col)
        X_test = pd.DataFrame(X_test,columns=X_col)



        models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns),  max_iter_hy=2,depth_grid=range(1,4), depth_grid_hy=range(1,3), complexity_bi = 0.001, complexity_hy=0.001,  Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=True)
        for perf,name in zip(performance,names):
            if not not perf:
                res_rul[(classification_dataset,name,it,col_len)] = sum(perf) / len(perf)

        act_name = []
        act_rules = []
        for model,name in zip(models,names):
            if not not model:
                act_name += [name]
                act_rules += [model]

        datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)
        for model in datasets.keys():

            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]

            factors = [round(col_len*0.5),col_len,round(col_len*1.25),round(col_len*1.5),round(col_len*2)]
            factors_name = [0.5,1,1.25,1.5,2]

            for len_c,fac_name in zip(factors,factors_name):
                if len_c > len(X_train_only_rules.columns):
                    min_len = len(X_train_only_rules.columns)
                    min_name = 1
                else:
                    min_len = len_c
                    min_name = fac_name
                if len_c > X_train.shape[1]:
                    res_rul[(classification_dataset,model + "_LG_rules",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_LG_rules_and_features",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_SVM_rules",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_SVM_rules_and_features",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_NB_rules",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_NB_rules_and_features",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_KNN_rules",it,factors_name)] = np.nan
                    res_rul[(classification_dataset,model + "_KNN_rules_and_features",it,factors_name)] = np.nan
                else:
                    cols = SelectKBest(k=len_c).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                    X_train_rules_features = X_train_rules_and_features[cols]
                    X_test_rules_features = X_test_rules_and_features[cols]

                    cols_1 = SelectKBest(k=min_len).fit(X_train_only_rules,y_train).get_feature_names_out()
                    X_train_rules = X_train_only_rules[cols_1]
                    X_test_rules = X_test_only_rules[cols_1]

                    only_rules_acc = log_regression_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    rules_and_features_acc = log_regression_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                    res_rul[(classification_dataset,model + "_LG_rules",it,min_name)] = only_rules_acc
                    res_rul[(classification_dataset,model + "_LG_rules_and_features",it,fac_name)] = rules_and_features_acc

                    only_rules_acc_SVM = SVM_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    rules_and_features_acc_SVM = SVM_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                    res_rul[(classification_dataset,model + "_SVM_rules",it,min_name)] = only_rules_acc_SVM
                    res_rul[(classification_dataset,model + "_SVM_rules_and_features",it,fac_name)] = rules_and_features_acc_SVM

                    only_rules_acc_NB = NB_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    rules_and_features_acc_NB = NB_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                    res_rul[(classification_dataset,model + "_NB_rules",it,min_name)] = only_rules_acc_NB
                    res_rul[(classification_dataset,model + "_NB_rules_and_features",it,fac_name)] = rules_and_features_acc_NB

                    only_rules_acc_KNN = KNN_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    rules_and_features_acc_KNN = KNN_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                    res_rul[(classification_dataset,model + "_KNN_rules",it,min_name)] = only_rules_acc_KNN
                    res_rul[(classification_dataset,model + "_KNN_rules_and_features",it,fac_name)] = rules_and_features_acc_KNN

        res_rul[(classification_dataset,'Logistic_Regression',it,col_len)] = log_regression_pipeline(X_train, X_test, y_train, y_test)

        res_rul[(classification_dataset,"Support Vector Machine",it,col_len)] = SVM_pipeline(X_train, X_test, y_train, y_test)

        res_rul[(classification_dataset,"Naive Bayes",it,col_len)] = NB_pipeline(X_train, X_test, y_train, y_test)

        res_rul[(classification_dataset,"K-Nearest-Neighbor",it,col_len)] = KNN_pipeline(X_train, X_test, y_train, y_test)

    return res_rul


In [147]:
from joblib import delayed
from tqdm import tqdm
res_rul = ProgressParallel(n_jobs=-1)(delayed(experimentation)(data) for data in classification_dataset_names)

100%|██████████| 4/4 [00:29<00:00,  7.29s/it]


TypeError: unhashable type: 'list'

In [123]:
res_rul

[{('analcatdata_creditscore', 'Reg-CART', 0, 11): nan,
  ('analcatdata_creditscore', 'CART', 0, 11): 0.9833333333333334,
  ('analcatdata_creditscore', 'ORT', 0, 11): nan,
  ('analcatdata_creditscore', 'OCT', 0, 11): 1.0,
  ('analcatdata_creditscore', 'ORT-H', 0, 11): nan,
  ('analcatdata_creditscore', 'OCT-H', 0, 11): 1.0,
  ('analcatdata_creditscore', 'CART_LG_rules', 0, 6): nan,
  ('analcatdata_creditscore', 'CART_LG_rules_and_features', 0, 6): 1.0,
  ('analcatdata_creditscore', 'CART_SVM_rules', 0, 6): nan,
  ('analcatdata_creditscore', 'CART_SVM_rules_and_features', 0, 6): 0.95,
  ('analcatdata_creditscore', 'CART_NB_rules', 0, 6): nan,
  ('analcatdata_creditscore', 'CART_NB_rules_and_features', 0, 6): 0.95,
  ('analcatdata_creditscore', 'CART_KNN_rules', 0, 6): nan,
  ('analcatdata_creditscore', 'CART_KNN_rules_and_features', 0, 6): 1.0,
  ('analcatdata_creditscore', 'CART_LG_rules_and_features', 0, 11): 1.0,
  ('analcatdata_creditscore', 'CART_SVM_rules_and_features', 0, 11): 0.9

In [117]:
result = {}
for d in res_rul:
    result.update(d)

In [118]:
k = pd.DataFrame(result,index=[0])

In [121]:
k = k.stack(level=2).sort_index()
k = k.stack(level=2).sort_index()

In [122]:
k

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,analcatdata_creditscore,...,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft,analcatdata_dmft
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,CART,CART_KNN_rules,CART_KNN_rules_and_features,CART_LG_rules,CART_LG_rules_and_features,CART_NB_rules,CART_NB_rules_and_features,CART_SVM_rules,CART_SVM_rules_and_features,K-Nearest-Neighbor,...,OCT_LG_rules,OCT_LG_rules_and_features,OCT_NB_rules,OCT_NB_rules_and_features,OCT_SVM_rules,OCT_SVM_rules_and_features,ORT,ORT-H,Reg-CART,Support Vector Machine
0,0,5,,,,,,,,,,,...,,,,,,,,,,
0,0,6,,,1.0,,1.0,,0.95,,0.95,,...,,,,,,,,,,
0,0,9,,,,,,,,,,,...,,,,,,,,,,
0,0,10,,,,,,,,,,,...,0.16875,0.2125,0.15625,0.18125,0.18125,0.1875,,,,
0,0,11,0.983333,,1.0,,1.0,,0.95,,0.95,0.9,...,,,,,,,,,,
0,0,18,,,,,,,,,,,...,,,,,,,,,,
0,0,20,,,,,,,,,,,...,,0.25,,0.16875,,0.20625,,,,0.16875
0,1,5,,,,,,,,,,,...,,,,,,,,,,
0,1,6,,,0.95,,0.95,,0.95,,0.9,,...,,,,,,,,,,
0,1,9,,,,,,,,,,,...,,,,,,,,,,


In [114]:
k =
k = k.stack(level=2).sort_index()
k = k.swaplevel(axis=1)
k = k.droplevel(0)
t=k.mean(level=0,axis=1)
t = t.mean(axis=0)

  t=k.mean(level=0,axis=1)


In [115]:
t.sort_values(ascending  = False)

analcatdata_creditscore    0.936673
analcatdata_cyyoung9302    0.820632
analcatdata_cyyoung8092    0.820040
analcatdata_dmft           0.207782
dtype: float64

In [9]:
y = k.swaplevel(axis=1)
y = y.var(level=0,axis=1)
y = y.mean(axis=0)
good_tests = y[y < 0.01].index
good = list(good_tests)

NameError: name 'k' is not defined

In [10]:
vaild_results = k.iloc[:,k.columns.isin(good, level=1)]
vaild_results=vaild_results.mean(level=0,axis=1)
vaild_results.mean(axis=0)

NameError: name 'k' is not defined

In [124]:
classification_dataset = classification_dataset_names[1]

In [145]:

iters=2
res_rul = {}
sc = StandardScaler()
names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
df = fetch_data(classification_dataset)
df, num_col, bin_col, cat_col = corr_data_for(df)

y = df['target']
X = df.loc[:, df.columns != 'target']
#performance_by_iter = pd.DataFrame(columns = ["Logistic Regression", "CART_rules", "OCT_rules", "OCTH_rules", "CART_rules_and_features", "OCT_rules_and_features", "OCTH_rules_and_features"], index = np.arange(0, iters))
print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
rows_data, columns_data = X.shape
print('Dataset Information')
print('Rows:',rows_data,)
print('Columns:',columns_data)
print('Number of classes:',y.nunique())
print('Continous columns:', len(num_col))
print('Binary columns:', len(bin_col))
print('Categorical columns:',len(cat_col))
print('-------------------------------------------------')
for it in range(iters):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = it, stratify=y)
    X_col = X_train.columns
    col_len = len(X_col)
    X_test.name = "X_test"
    X_train.name = "X_train"
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    X_train = pd.DataFrame(X_train,columns=X_col)
    X_test = pd.DataFrame(X_test,columns=X_col)



    models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns),  max_iter_hy=2,depth_grid=range(1,4), depth_grid_hy=range(1,3), complexity_bi = 0.001, complexity_hy=0.001,  Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=True)
    for perf,name in zip(performance,names):
        if not not perf:
            res_rul[(classification_dataset,name,it,col_len)] = sum(perf) / len(perf)

    act_name = []
    act_rules = []
    for model,name in zip(models,names):
        if not not model:
            act_name += [name]
            act_rules += [model]

    datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)
    for model in datasets.keys():

        X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
        X_train_only_rules, X_test_only_rules = datasets[model][1]

        factors = [round(col_len*0.5),col_len,round(col_len*1.25),round(col_len*1.5),round(col_len*2)]
        factors_name = [0.5,1,1.25,1.5,2]

        for len_c,fac_name in zip(factors,factors_name):
            if len_c > len(X_train_only_rules.columns):
                min_len = len(X_train_only_rules.columns)
                min_name = 1
            else:
                min_len = len_c
                min_name = fac_name
            if len_c > X_train.shape[1]:
                res_rul[(classification_dataset,model + "_LG_rules",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_LG_rules_and_features",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_SVM_rules",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_SVM_rules_and_features",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_NB_rules",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_NB_rules_and_features",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_KNN_rules",it,factors_name)] = np.nan
                res_rul[(classification_dataset,model + "_KNN_rules_and_features",it,factors_name)] = np.nan
            else:
                cols = SelectKBest(k=len_c).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                X_train_rules_features = X_train_rules_and_features[cols]
                X_test_rules_features = X_test_rules_and_features[cols]

                cols_1 = SelectKBest(k=min_len).fit(X_train_only_rules,y_train).get_feature_names_out()
                X_train_rules = X_train_only_rules[cols_1]
                X_test_rules = X_test_only_rules[cols_1]

                only_rules_acc = log_regression_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                rules_and_features_acc = log_regression_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                res_rul[(classification_dataset,model + "_LG_rules",it,min_name)] = only_rules_acc
                res_rul[(classification_dataset,model + "_LG_rules_and_features",it,fac_name)] = rules_and_features_acc

                only_rules_acc_SVM = SVM_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                rules_and_features_acc_SVM = SVM_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                res_rul[(classification_dataset,model + "_SVM_rules",it,min_name)] = only_rules_acc_SVM
                res_rul[(classification_dataset,model + "_SVM_rules_and_features",it,fac_name)] = rules_and_features_acc_SVM

                only_rules_acc_NB = NB_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                rules_and_features_acc_NB = NB_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                res_rul[(classification_dataset,model + "_NB_rules",it,min_name)] = only_rules_acc_NB
                res_rul[(classification_dataset,model + "_NB_rules_and_features",it,fac_name)] = rules_and_features_acc_NB

                only_rules_acc_KNN = KNN_pipeline(X_train_rules, X_test_rules, y_train, y_test)
                rules_and_features_acc_KNN = KNN_pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                res_rul[(classification_dataset,model + "_KNN_rules",it,min_name)] = only_rules_acc_KNN
                res_rul[(classification_dataset,model + "_KNN_rules_and_features",it,fac_name)] = rules_and_features_acc_KNN

    res_rul[(classification_dataset,'Logistic_Regression',it,col_len)] = log_regression_pipeline(X_train, X_test, y_train, y_test)

    res_rul[(classification_dataset,"Support Vector Machine",it,col_len)] = SVM_pipeline(X_train, X_test, y_train, y_test)

    res_rul[(classification_dataset,"Naive Bayes",it,col_len)] = NB_pipeline(X_train, X_test, y_train, y_test)

    res_rul[(classification_dataset,"K-Nearest-Neighbor",it,col_len)] = KNN_pipeline(X_train, X_test, y_train, y_test)




[1m

    ----------------------------------------- analcatdata_cyyoung8092 -----------------------------------------[0m
Dataset Information
Rows: 97
Columns: 10
Number of classes: 2
Continous columns: 8
Binary columns: 2
Categorical columns: 0
-------------------------------------------------
Classification CART mean performance:  0.85


Classification OCT performance:  0.85


Classification OCT_H performance:  0.85




TypeError: unhashable type: 'list'

In [132]:
[round(col_len*0.5),col_len,round(col_len*1.25),round(col_len*1.5),round(col_len*2)]

[5, 10, 12, 15, 20]

In [135]:
datasets.keys()

dict_keys(['CART', 'OCT', 'OCT-H', 'OCT+OCT-H'])

In [None]:
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
import keras.utils
import keras_tuner
#from tensorflow import keras
from keras import utils as np_utils




def NN_creator(hp):
  model = Sequential()
  model.add(Dense(30, activation='relu', input_dim=30))

  # Tune the number of dense layers
  for i in range(hp.Int('num_layers', 1, 5)):

    # Tune the number of units in the each dense layer
    hp_units = hp.Int('units_'+str(i), min_value=3, max_value=18,step=1)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))

    # Tune the dropout rate in the each dense layer
    hp_dropout = hp.Float('rate', min_value=0.0, max_value=0.5, step=0.1)
    model.add(keras.layers.Dropout(hp_dropout))

  # Add dense output layer
    model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])

  return model

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows = 5, ncols = 4, gridspec_kw = {"hspace": 0.25})
import seaborn as sns
fig.set_size_inches(30, 25)
iteration = 0

for m in range(5):
    for j in range(4):

        dataset = classification_dataset_names[:20][iteration]

        columns = [i for i in k.columns if dataset in i]
        sns.boxplot(k[columns], ax = ax[m, j])

        ax[m, j].set_title(dataset)

        ax[m, j].set_xticklabels(['CART Rules', "OCT Rules", "Logistic Regression", "RuleFit", "ORRFA"])

        iteration += 1



In [None]:
CART_rules

In [None]:
import seaborn as sns

sns.violinplot(data=k)

In [None]:
del performance_by_iter['OCTH_rules']

In [None]:
del performance_by_iter['OCTH_rules_and_features']

In [None]:
df.loc[eval(rule)].index.values

In [None]:
df = X_train.copy()

In [None]:
rule = rule.replace("feature", "df")

In [None]:
rule

In [None]:
loc[eval(rule)].index.values

In [None]:
rule = rules[1]

In [None]:
for i, rules in enumerate(act_rules):
    print(i)
    print(rules)

In [None]:
act_rules

In [None]:
for i, rules in enumerate(act_rules):
    print(i)
    print(rules)

In [None]:
performance_by_iter.rename(columns = {column: column.replace("OCT_rules_and_features", "ORRFA")}, inplace = True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots()
sns.boxplot(data = performance_by_iter)
fig.set_size_inches(20, 10)
ax.set_xticklabels(performance_by_iter.columns.values)
# ax.set_ylim(0.93, 0.995)
ax.tick_params(rotation = 0, labelsize = 14)
ax.set_ylabel("Accuracy", fontsize = 14)
ax.set_title("Accuracy of Logistic Regression, RuleFit and ORRFA", fontsize = 15)
# ax.set_ylabel()
plt.savefig('Benchmark ORRFA.png')

In [None]:
performance_by_iter.mean()