In [1]:
from pauls_functions_advanced_v3 import *
from experiment_functions import *
import pandas as pd
from pmlb import fetch_data, classification_dataset_names
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import pickle



In [2]:
# Restrict the benchmark to the PMLB classification datasets at positions 1..9.
# The slice is taken from the package attribute instead of from the notebook-level
# name, so re-running this cell always yields the same nine names (the previous
# self-referential slice dropped one more dataset on every re-run).
import pmlb

classification_dataset_names = pmlb.classification_dataset_names[1:10]

In [3]:
from tqdm.auto import tqdm
from joblib import Parallel

class ProgressParallel(Parallel):
    """``joblib.Parallel`` variant that reports task progress on a tqdm bar.

    Args:
        use_tqdm: when False the bar is created disabled, so nothing is drawn.
        total: expected number of tasks; when None the bar total is taken
            from the number of tasks dispatched so far.
        *args, **kwargs: forwarded unchanged to ``Parallel.__init__``.
    """

    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Run the parallel call while a progress bar is open on ``self._pbar``."""
        progress_bar = tqdm(disable=not self._use_tqdm, total=self._total)
        # The bar is bound to the instance so print_progress can reach it.
        with progress_bar as self._pbar:
            return Parallel.__call__(self, *args, **kwargs)

    def print_progress(self):
        """Copy the dispatched / completed task counters onto the bar and redraw it.

        NOTE(review): presumably invoked by joblib as tasks finish - confirm
        against the installed joblib version.
        """
        bar = self._pbar
        if self._total is None:
            # No fixed total was given: track the number of dispatched tasks.
            bar.total = self.n_dispatched_tasks
        bar.n = self.n_completed_tasks
        bar.refresh()

In [4]:
def get_feature_type(x, include_binary=False):
    """Classify a feature column as 'continuous', 'binary' or 'categorical'.

    Missing values are ignored. A column is 'continuous' when it contains any
    non-integer value or more than 10 distinct values; otherwise it is
    'categorical', unless ``include_binary`` is set and it has exactly two
    distinct values, in which case it is 'binary'.

    Args:
        x: pandas Series holding the feature values. It is not modified.
        include_binary: report two-valued integer columns as 'binary'.

    Returns:
        One of the strings 'continuous', 'binary', 'categorical'.
    """
    # Work on a NaN-free copy: dropping rows in place mutated the caller's
    # column (a slice of the dataset frame) as a side effect of a type query.
    x = x.dropna()
    if not check_if_all_integers(x):
        return 'continuous'
    if x.nunique() > 10:
        return 'continuous'
    if include_binary and x.nunique() == 2:
        return 'binary'
    return 'categorical'

def get_target_type(x, include_binary=False):
    """Classify a target column as 'continuous', 'binary' or 'categorical'.

    Any float dtype is 'continuous'. Any integer dtype is 'categorical',
    or 'binary' when ``include_binary`` is set and the column has exactly two
    distinct non-missing values.

    Args:
        x: pandas Series holding the target values. It is not modified.
        include_binary: report two-valued integer targets as 'binary'.

    Returns:
        One of the strings 'continuous', 'binary', 'categorical'.

    Raises:
        ValueError: if the dtype is neither float nor integer.
    """
    # NaN-free copy instead of an in-place drop, so the caller's Series keeps its rows.
    x = x.dropna()
    # Dtype-family checks accept every float / integer width, not only the
    # 64-bit ones that the exact string comparison used to recognise.
    if pd.api.types.is_float_dtype(x.dtype):
        return 'continuous'
    if pd.api.types.is_integer_dtype(x.dtype):
        if include_binary and x.nunique() == 2:
            return 'binary'
        return 'categorical'
    raise ValueError("Error getting type")

def check_if_all_integers(x):
    """Return True when every distinct value of the pandas Series ``x`` is integer-valued.

    Each unique value is converted with ``float`` first, so NaN counts as
    non-integer and an empty Series yields True.
    """
    for value in x.unique():
        if not float(value).is_integer():
            return False
    return True
def corr_data_for(df):
    """One-hot encode the categorical features of ``df`` and report the feature types.

    Every column except 'target' is typed with ``get_feature_type``; the
    columns typed 'categorical' are expanded with ``pd.get_dummies``. The
    encoded frame is only kept when the encoding adds fewer columns than 30%
    of the row count and the encoded frame has fewer columns than rows;
    otherwise an empty DataFrame is returned in its place.

    Args:
        df: dataset frame containing the features and a 'target' column.

    Returns:
        Tuple ``(df, num_col, bin_col, cat_col)``:
        the encoded frame with '.' in column names replaced by '_' (or an
        empty DataFrame when the encoding was rejected); a one-column frame
        listing the continuous feature names; the same for binary features;
        and a plain list of the categorical feature names.
    """
    TARGET_NAME = 'target'
    feat_names = [col for col in df.columns if col != TARGET_NAME]
    types = [get_feature_type(df[col], include_binary=True) for col in feat_names]
    # One row per feature: index = detected type, column 0 = feature name.
    # Building it from a dict guarantees column 0 exists even with no features.
    col = pd.DataFrame({0: feat_names}, index=types)
    num_col = col[col.index == 'continuous']
    bin_col = col[col.index == 'binary']
    cat_col = col[col.index == 'categorical'][0].tolist()
    dummy_col = pd.get_dummies(data=df, columns=cat_col)
    add_col = dummy_col.shape[1] - df.shape[1]
    if add_col < df.shape[0] * 0.3 and dummy_col.shape[1] < df.shape[0]:
        df = dummy_col
        # Literal replacement: with regex=True the pattern '.' matches every
        # character on pandas >= 2.0, which would turn each column name
        # (including 'target') into a run of underscores.
        df.columns = df.columns.str.replace('.', '_', regex=False)
    else:
        df = pd.DataFrame()
    return df, num_col, bin_col, cat_col

In [5]:
# for data in classification_dataset_names:
#     data = fetch_data(data)
#     print(data.shape)

In [6]:
classification_dataset_names

['GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
 'GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1',
 'GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1',
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM_2_001',
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM_2_001',
 'Hill_Valley_with_noise',
 'Hill_Valley_without_noise',
 'adult',
 'agaricus_lepiota']

In [7]:
def _keep_k_best(train_frame, test_frame, y_train, k):
    """Fit SelectKBest(k) on the training frame and cut both frames down to the chosen columns."""
    cols = SelectKBest(k=k).fit(train_frame, y_train).get_feature_names_out()
    return train_frame[cols], test_frame[cols]


def experimentation(classification_dataset, iters=5):
    """Benchmark tree-rule features against plain features on one PMLB dataset.

    For each of ``iters`` stratified 80/20 splits (``random_state`` = split
    number) the features are standardised, trees are fitted with
    ``generate_tree``, rule features are derived with
    ``gen_train_and_test_features``, and four downstream classifiers
    (logistic regression, SVM, naive Bayes, KNN) are scored on
    SelectKBest-reduced "rules only" and "rules and features" frames at
    several widths, plus on the raw standardised features as baselines.

    Args:
        classification_dataset: dataset name passed to ``fetch_data``.
        iters: number of resplits; the default 5 is the previously
            hard-coded value.

    Returns:
        Dict mapping ``(dataset name, model label, iteration, factor label)``
        to the value returned by the corresponding pipeline helper (or to the
        mean of the ``generate_tree`` performance list). An empty dict is
        returned when the dataset is skipped: more than 50000 rows, more than
        100 columns, or a one-hot encoding rejected by ``corr_data_for``.
        Skipped datasets used to yield None, which made ``result.update(d)``
        in the aggregation cell fail.

    Side effects:
        Prints dataset information and, after every iteration, overwrites
        'filename.pickle' with the results gathered so far.
    """
    names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
    # Target widths, expressed as multiples of the original feature count.
    factors_name = [0.5,1,1.2,1.4,1.6,1.8,2,2.5,3]
    # (key tag, pipeline) pairs; the order fixes the insertion order of the result keys.
    downstream = [
        ("LG", log_regression_pipeline),
        ("SVM", SVM_pipeline),
        ("NB", NB_pipeline),
        ("KNN", KNN_pipeline),
    ]
    res_rul = {}
    sc = StandardScaler()

    df = fetch_data(classification_dataset)
    if df.shape[0] > 50000 or df.shape[1] > 100:
        return {}
    df, num_col, bin_col, cat_col = corr_data_for(df)
    if df.empty:
        return {}
    y = df['target']
    X = df.loc[:, df.columns != 'target']
    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
    rows_data, columns_data = X.shape
    print('Dataset Information')
    print('Rows:',rows_data,)
    print('Columns:',columns_data)
    print('Number of classes:',y.nunique())
    print('Continous columns:', len(num_col))
    print('Binary columns:', len(bin_col))
    print('Categorical columns:',len(cat_col))
    print('-------------------------------------------------')

    for it in range(iters):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=it, stratify=y)
        X_col = X_train.columns
        col_len = len(X_col)
        # NOTE(review): the rebuilt frames get a fresh 0..n-1 index while
        # y_train / y_test keep the original row labels - confirm the helpers
        # never align X and y by index.
        X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_col)
        X_test = pd.DataFrame(sc.transform(X_test), columns=X_col)
        factors = [round(col_len * factor) for factor in factors_name]

        models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns), max_iter_hy=2, depth_grid=range(1,7), depth_grid_hy=range(1,3), complexity_bi=0.001, complexity_hy=0.001, Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=True)
        for perf, name in zip(performance, names):
            if perf:
                res_rul[(classification_dataset, name, it, 1)] = sum(perf) / len(perf)

        # Keep only the tree variants that actually produced a model.
        act_name = []
        act_rules = []
        for model, name in zip(models, names):
            if model:
                act_name.append(name)
                act_rules.append(model)

        datasets = gen_train_and_test_features(act_rules, act_name, X_train, X_test)
        for model in datasets.keys():
            print(model)
            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]
            n_combined = len(X_train_rules_and_features.columns)
            n_rules = len(X_train_only_rules.columns)
            print(n_combined)
            for len_c, fac_name in zip(factors, factors_name):
                # NOTE(review): when fewer rule columns exist than requested,
                # the rules-only result is filed under factor label 1, so it
                # can overwrite the genuine factor-1 entry - confirm intended.
                if len_c > n_rules:
                    min_len, min_name = n_rules, 1
                else:
                    min_len, min_name = len_c, fac_name
                # Skip widths SelectKBest cannot deliver (k < 1, e.g. round(1 * 0.5) == 0)
                # and widths beyond the row count or the available columns.
                if not (1 <= len_c <= X_train.shape[0] and len_c <= n_combined):
                    continue
                X_train_rules_features, X_test_rules_features = _keep_k_best(
                    X_train_rules_and_features, X_test_rules_and_features, y_train, len_c)
                rules_split = None
                if min_len >= 1:
                    # Without any rule column there is nothing to select from.
                    rules_split = _keep_k_best(X_train_only_rules, X_test_only_rules, y_train, min_len)

                for tag, pipeline in downstream:
                    if rules_split is not None:
                        X_train_rules, X_test_rules = rules_split
                        res_rul[(classification_dataset, model + "_" + tag + "_rules", it, min_name)] = pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    res_rul[(classification_dataset, model + "_" + tag + "_rules_and_features", it, fac_name)] = pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)

        # Baselines on the standardised original features.
        res_rul[(classification_dataset,'Logistic_Regression',it,1)] = log_regression_pipeline(X_train, X_test, y_train, y_test)
        res_rul[(classification_dataset,"Support Vector Machine",it,1)] = SVM_pipeline(X_train, X_test, y_train, y_test)
        res_rul[(classification_dataset,"Naive Bayes",it,1)] = NB_pipeline(X_train, X_test, y_train, y_test)
        res_rul[(classification_dataset,"K-Nearest-Neighbor",it,1)] = KNN_pipeline(X_train, X_test, y_train, y_test)

        # Checkpoint after every iteration.
        # NOTE(review): every parallel worker writes this same path, so the
        # file only ever holds the dataset that wrote last.
        with open('filename.pickle', 'wb') as handle:
            pickle.dump(res_rul, handle)
    return res_rul

In [None]:
from joblib import delayed
from tqdm import tqdm
# One delayed experimentation call per dataset, spread over 10 joblib workers;
# res_rul receives one return value per dataset, in dataset order.
dataset_jobs = (delayed(experimentation)(data) for data in classification_dataset_names)
res_rul = ProgressParallel(n_jobs=10)(dataset_jobs)

 22%|██▏       | 2/9 [00:15<00:54,  7.74s/it]

In [None]:
b

In [None]:
res_rul = b

In [None]:
# Merge the per-dataset result dicts into one flat mapping.
# res_rul is either the list returned by the parallel run (one entry per
# dataset, None/empty for skipped datasets) or a single result dict reloaded
# from the pickle; both shapes are accepted and empty entries are ignored.
result = {}
per_dataset_results = [res_rul] if isinstance(res_rul, dict) else res_rul
for d in per_dataset_results:
    if d:
        result.update(d)

In [None]:
# Reshape the flat result dict: the 4-part keys become column levels, then the
# two last levels are moved into the row index (one stack per level).
k = pd.DataFrame(result, index=[0])
k = k.stack(level=2).sort_index()
k = k.stack(level=2).sort_index()
k = k.swaplevel(axis=1)
k = k.droplevel(0)
# Average over the columns that share the same outer column label.
# DataFrame.mean(level=..., axis=1) was removed in pandas 2.0; grouping the
# transposed frame is the equivalent that works on old and new versions.
t = k.T.groupby(level=0, sort=False).mean().T
t = t.mean(axis=0)
t.sort_values(ascending=False)

In [None]:
k

In [None]:
# Variance across the columns sharing an outer label (after swapping the two
# column levels), averaged over rows; labels whose mean variance is below
# 0.01 are kept in `good`.
# DataFrame.var(level=..., axis=1) was removed in pandas 2.0, hence the
# transpose / groupby / transpose form.
y = k.swaplevel(axis=1)
y = y.T.groupby(level=0, sort=False).var().T
y = y.mean(axis=0)
good_tests = y[y < 0.01].index
good = list(good_tests)

In [None]:
# Keep only the columns whose second-level label is in `good`, then average
# the columns sharing an outer label and finally average over rows.
# DataFrame.mean(level=..., axis=1) was removed in pandas 2.0, hence the
# transpose / groupby / transpose form.
vaild_results = k.iloc[:, k.columns.isin(good, level=1)]
vaild_results = vaild_results.T.groupby(level=0, sort=False).mean().T
vaild_results.mean(axis=0)

In [None]:
# Reload the checkpoint that the experiment code writes after every iteration.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only open 'filename.pickle' when it was produced by this notebook.
with open('filename.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [None]:
classification_dataset

In [None]:
b

In [None]:
# Use the first selected dataset name for the single-dataset cells that read
# `classification_dataset`.
classification_dataset = classification_dataset_names[0]

In [None]:

# Single-dataset debugging run of the experiment at module level, so that the
# intermediate objects (X_train, datasets, act_rules, col_len, res_rul, ...)
# stay available to later cells. Differences from the batch function visible
# here: one split, depth_grid up to 4, OCT_H disabled.
iters = 1
res_rul = {}
sc = StandardScaler()
names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
# Target widths, expressed as multiples of the original feature count.
factors_name = [0.5,1,1.2,1.4,1.6,1.8,2,2.5,3]
# (key tag, pipeline) pairs; the order fixes the insertion order of the result keys.
downstream_pipelines = [
    ("LG", log_regression_pipeline),
    ("SVM", SVM_pipeline),
    ("NB", NB_pipeline),
    ("KNN", KNN_pipeline),
]
df = fetch_data(classification_dataset)

df, num_col, bin_col, cat_col = corr_data_for(df)
if df.empty:
    # corr_data_for hands back an empty frame when it rejects the one-hot
    # encoding; fail with a clear message instead of a KeyError on 'target'.
    raise ValueError("corr_data_for returned an empty frame for {}".format(classification_dataset))

y = df['target']
X = df.loc[:, df.columns != 'target']
print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
rows_data, columns_data = X.shape
print('Dataset Information')
print('Rows:',rows_data,)
print('Columns:',columns_data)
print('Number of classes:',y.nunique())
print('Continous columns:', len(num_col))
print('Binary columns:', len(bin_col))
print('Categorical columns:',len(cat_col))
print('-------------------------------------------------')
for it in range(iters):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=it, stratify=y)
    X_col = X_train.columns
    col_len = len(X_col)
    # Standardise with statistics from the training split only.
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_col)
    X_test = pd.DataFrame(sc.transform(X_test), columns=X_col)
    factors = [round(col_len * factor) for factor in factors_name]

    models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns), max_iter_hy=2, depth_grid=range(1,5), depth_grid_hy=range(1,3), complexity_bi=0.001, complexity_hy=0.001, Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=False)
    for perf, name in zip(performance, names):
        if perf:
            res_rul[(classification_dataset, name, it, 1)] = sum(perf) / len(perf)

    # Keep only the tree variants that actually produced a model.
    act_name = []
    act_rules = []
    for model, name in zip(models, names):
        if model:
            act_name.append(name)
            act_rules.append(model)

    datasets = gen_train_and_test_features(act_rules, act_name, X_train, X_test)
    for model in datasets.keys():
        print(model)
        X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
        X_train_only_rules, X_test_only_rules = datasets[model][1]
        n_combined = len(X_train_rules_and_features.columns)
        n_rules = len(X_train_only_rules.columns)
        print(n_combined)
        for len_c, fac_name in zip(factors, factors_name):
            # NOTE(review): when fewer rule columns exist than requested, the
            # rules-only result is filed under factor label 1 and can
            # overwrite the genuine factor-1 entry - confirm intended.
            if len_c > n_rules:
                min_len, min_name = n_rules, 1
            else:
                min_len, min_name = len_c, fac_name
            # Skip widths SelectKBest cannot deliver (k < 1) and widths beyond
            # the row count or the available columns.
            if not (1 <= len_c <= X_train.shape[0] and len_c <= n_combined):
                continue
            cols = SelectKBest(k=len_c).fit(X_train_rules_and_features, y_train).get_feature_names_out()
            X_train_rules_features = X_train_rules_and_features[cols]
            X_test_rules_features = X_test_rules_and_features[cols]

            has_rules = min_len >= 1
            if has_rules:
                cols_1 = SelectKBest(k=min_len).fit(X_train_only_rules, y_train).get_feature_names_out()
                X_train_rules = X_train_only_rules[cols_1]
                X_test_rules = X_test_only_rules[cols_1]

            for tag, pipeline in downstream_pipelines:
                if has_rules:
                    res_rul[(classification_dataset, model + "_" + tag + "_rules", it, min_name)] = pipeline(X_train_rules, X_test_rules, y_train, y_test)
                res_rul[(classification_dataset, model + "_" + tag + "_rules_and_features", it, fac_name)] = pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)

    # Baselines on the standardised original features.
    res_rul[(classification_dataset,'Logistic_Regression',it,1)] = log_regression_pipeline(X_train, X_test, y_train, y_test)
    res_rul[(classification_dataset,"Support Vector Machine",it,1)] = SVM_pipeline(X_train, X_test, y_train, y_test)
    res_rul[(classification_dataset,"Naive Bayes",it,1)] = NB_pipeline(X_train, X_test, y_train, y_test)
    res_rul[(classification_dataset,"K-Nearest-Neighbor",it,1)] = KNN_pipeline(X_train, X_test, y_train, y_test)

    # Checkpoint the results gathered so far.
    with open('filename.pickle', 'wb') as handle:
        pickle.dump(res_rul, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Ad-hoc check: fit SelectKBest with k=13 on the rules+features training frame
# and restrict the train and test frames to the selected columns.
# Relies on X_train_rules_and_features, X_test_rules_and_features and y_train
# already existing in the kernel from an earlier cell.
cols = SelectKBest(k=13).fit(X_train_rules_and_features,y_train).get_feature_names_out()
X_train_rules_features = X_train_rules_and_features[cols]
X_test_rules_features = X_test_rules_and_features[cols]

In [None]:
# Turn the flat result dict into a table: keys become column levels, then the
# third column level is moved into the row index twice in a row.
k = (
    pd.DataFrame(res_rul, index=[0])
    .stack(level=2)
    .sort_index()
    .stack(level=2)
    .sort_index()
)
k

In [None]:
X_train_only_rules

In [None]:
# Merge results into one flat mapping. res_rul may be a list of per-dataset
# dicts (with None/empty entries for skipped datasets) or a single result
# dict; iterating a dict directly would feed its tuple keys to update().
result = {}
per_dataset_results = [res_rul] if isinstance(res_rul, dict) else res_rul
for d in per_dataset_results:
    if d:
        result.update(d)

In [None]:
res_rul

In [None]:
round(col_len*1.5)

In [None]:
col_len

In [None]:
# Merge results into one flat mapping. res_rul may be a list of per-dataset
# dicts (with None/empty entries for skipped datasets) or a single result
# dict; iterating a dict directly would feed its tuple keys to update().
result = {}
per_dataset_results = [res_rul] if isinstance(res_rul, dict) else res_rul
for d in per_dataset_results:
    if d:
        result.update(d)

In [None]:
# Turn the flat result dict into a table: keys become column levels, then the
# third column level is moved into the row index twice in a row.
k = (
    pd.DataFrame(res_rul, index=[0])
    .stack(level=2)
    .sort_index()
    .stack(level=2)
    .sort_index()
)
k

In [None]:
[round(col_len*0.5),col_len,round(col_len*1.25),round(col_len*1.5),round(col_len*2)]

In [None]:
datasets.keys()

In [None]:
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
import keras.utils
import keras_tuner
#from tensorflow import keras
from keras import utils as np_utils




def NN_creator(hp, input_dim=30):
  """Build and compile a binary-classification network for a keras_tuner search.

  The network is a 30-unit ReLU input layer, then 1 to 5 tuned hidden blocks
  (Dense with 3-18 units followed by Dropout), then one sigmoid output unit.
  It is compiled with Adam, binary cross-entropy and an accuracy metric.

  Args:
    hp: hyperparameter object providing ``Int``, ``Float`` and ``Choice``.
    input_dim: number of input features; defaults to the previously
      hard-coded 30.

  Returns:
    The compiled ``Sequential`` model.
  """
  model = Sequential()
  model.add(Dense(30, activation='relu', input_dim=input_dim))

  # Tune the number of dense layers
  for i in range(hp.Int('num_layers', 1, 5)):

    # Tune the number of units in each dense layer
    hp_units = hp.Int('units_'+str(i), min_value=3, max_value=18, step=1)
    model.add(Dense(units=hp_units, activation='relu'))

    # NOTE(review): the name 'rate' is identical for every layer, unlike
    # 'units_<i>', so one dropout rate is presumably shared by all layers -
    # confirm whether a per-layer rate was intended.
    hp_dropout = hp.Float('rate', min_value=0.0, max_value=0.5, step=0.1)
    model.add(Dropout(hp_dropout))

  # Add the dense output layer exactly once, after all hidden blocks. It used
  # to sit inside the loop, which inserted a 1-unit sigmoid layer after every
  # hidden block.
  model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])

  return model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 5 x 4 grid with one box plot per dataset (at most the first 20 names).
fig, ax = plt.subplots(nrows = 5, ncols = 4, gridspec_kw = {"hspace": 0.25})
fig.set_size_inches(30, 25)
tick_labels = ['CART Rules', "OCT Rules", "Logistic Regression", "RuleFit", "ORRFA"]
datasets_to_plot = classification_dataset_names[:20]

# ax.flat walks the grid row by row, the same order as the nested m / j loops.
for iteration, panel in enumerate(ax.flat):
    dataset = datasets_to_plot[iteration] if iteration < len(datasets_to_plot) else None
    columns = [i for i in k.columns if dataset in i] if dataset is not None else []
    if not columns:
        # Fewer datasets than panels, or no results for this dataset: blank
        # the panel instead of raising IndexError / plotting an empty frame.
        panel.set_visible(False)
        continue

    sns.boxplot(data=k[columns], ax=panel)
    panel.set_title(dataset)
    # The fixed labels only make sense when exactly that many boxes were drawn;
    # otherwise keep the labels seaborn derived from the columns.
    if len(columns) == len(tick_labels):
        panel.set_xticklabels(tick_labels)



In [None]:
CART_rules

In [None]:
import seaborn as sns

sns.violinplot(data=k)

In [None]:
del performance_by_iter['OCTH_rules']

In [None]:
del performance_by_iter['OCTH_rules_and_features']

In [None]:
df.loc[eval(rule)].index.values

In [None]:
df = X_train.copy()

In [None]:
rule = rule.replace("feature", "df")

In [None]:
rule

In [None]:
# Index labels of the rows of `df` that satisfy the rule expression.
# A bare `loc[...]` is a NameError; the accessor has to be taken from `df`.
# NOTE(review): eval executes the rule text as Python - only use it on rule
# strings generated by this notebook.
df.loc[eval(rule)].index.values

In [None]:
rule = rules[1]

In [None]:
for i, rules in enumerate(act_rules):
    print(i)
    print(rules)

In [None]:
act_rules

In [None]:
for i, rules in enumerate(act_rules):
    print(i)
    print(rules)

In [None]:
# Relabel the "OCT_rules_and_features" column(s) as "ORRFA". The mapping needs
# its `for` clause: without it `column` is undefined and the cell raises NameError.
performance_by_iter.rename(columns = {column: column.replace("OCT_rules_and_features", "ORRFA") for column in performance_by_iter.columns}, inplace = True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of every column of performance_by_iter on one 20 x 10 inch figure,
# saved next to the notebook.
fig, ax = plt.subplots()
sns.boxplot(data=performance_by_iter, ax=ax)
fig.set_size_inches(20, 10)

# Axis cosmetics: column names as tick labels, horizontal, 14 pt.
ax.set_xticklabels(performance_by_iter.columns.values)
ax.tick_params(rotation=0, labelsize=14)
ax.set_ylabel("Accuracy", fontsize=14)
ax.set_title("Accuracy of Logistic Regression, RuleFit and ORRFA", fontsize=15)

plt.savefig('Benchmark ORRFA.png')

In [None]:
performance_by_iter.mean()