In [1]:
# import linecache
# from collections import Counter
# import os
# import tracemalloc
#
# def display_top(snapshot, key_type='lineno', limit=3):
#     snapshot = snapshot.filter_traces((
#         tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
#         tracemalloc.Filter(False, "<unknown>"),
#     ))
#     top_stats = snapshot.statistics(key_type)
#
#     print("Top %s lines" % limit)
#     for index, stat in enumerate(top_stats[:limit], 1):
#         frame = stat.traceback[0]
#         # replace "/path/to/module/file.py" with "module/file.py"
#         filename = os.sep.join(frame.filename.split(os.sep)[-2:])
#         print("#%s: %s:%s: %.1f KiB"
#               % (index, filename, frame.lineno, stat.size / 1024))
#         line = linecache.getline(frame.filename, frame.lineno).strip()
#         if line:
#             print('    %s' % line)
#
#     other = top_stats[limit:]
#     if other:
#         size = sum(stat.size for stat in other)
#         print("%s other: %.1f KiB" % (len(other), size / 1024))
#     total = sum(stat.size for stat in top_stats)
#     print("Total allocated size: %.1f KiB" % (total / 1024))

# tracemalloc.start()
# counts = Counter()
# experimentation(classification_dataset_names[0])
# snapshot = tracemalloc.take_snapshot()
# display_top(snapshot)

In [2]:
from ex_func import *
from experiment_functions import *
import pandas as pd
from pmlb import fetch_data, classification_dataset_names
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import pickle



In [3]:
def get_feature_type(x, include_binary=False):
    """Classify a feature column as 'continuous', 'binary' or 'categorical'.

    Missing values are ignored for the decision. The column is 'continuous'
    when any remaining value is not a whole number, or when it has more than
    10 distinct values. Otherwise it is 'categorical', except that a column
    with exactly two distinct values is reported as 'binary' when
    ``include_binary`` is set.

    Parameters
    ----------
    x : pandas.Series
        Feature column to inspect. It is not modified.
    include_binary : bool, default False
        Whether two-valued integer columns get their own 'binary' label.

    Returns
    -------
    str
        One of 'continuous', 'binary', 'categorical'.
    """
    # Work on a NaN-free copy; dropna(inplace=True) would alter the Series
    # object the caller passed in.
    x = x.dropna()
    if not check_if_all_integers(x):
        return 'continuous'
    if x.nunique() > 10:
        return 'continuous'
    if include_binary and x.nunique() == 2:
        return 'binary'
    return 'categorical'

def get_target_type(x, include_binary=False):
    """Classify a target column by dtype as 'continuous', 'binary' or 'categorical'.

    Parameters
    ----------
    x : pandas.Series
        Target column to inspect. It is not modified.
    include_binary : bool, default False
        When set, an ``int64`` target with exactly two distinct non-missing
        values is reported as 'binary' instead of 'categorical'.

    Returns
    -------
    str
        'continuous' for ``float64`` data; 'binary' or 'categorical' for
        ``int64`` data.

    Raises
    ------
    ValueError
        If the dtype is neither ``float64`` nor ``int64``.
    """
    # Work on a NaN-free copy; dropna(inplace=True) would alter the Series
    # object the caller passed in.
    x = x.dropna()
    if x.dtype == 'float64':
        return 'continuous'
    if x.dtype == 'int64':
        if include_binary and x.nunique() == 2:
            return 'binary'
        return 'categorical'
    raise ValueError("Error getting type")

def check_if_all_integers(x):
    """Return True when every distinct value of the pandas.Series is a whole number.

    Each distinct value is converted with ``float`` and tested with
    ``float.is_integer``; an empty Series yields True.
    """
    for value in x.unique():
        if not float(value).is_integer():
            return False
    return True
def corr_data_for(df):
    """One-hot encode the categorical features of ``df`` and split off the target.

    Every column other than 'target' is typed with ``get_feature_type``;
    the columns typed 'categorical' are expanded with ``pd.get_dummies``.
    The dataset is accepted only when all of the following hold:

    * the encoding adds fewer than 30% as many columns as there are rows,
    * the encoded frame has fewer columns than rows,
    * the input has fewer than 10000 rows and fewer than 100 columns.

    For an accepted dataset a short summary is printed and ``(y, X)`` is
    returned, with literal dots in the column names replaced by underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing a 'target' column.

    Returns
    -------
    tuple
        ``(y, X)`` for an accepted dataset, otherwise two empty DataFrames
        so that callers can test ``X.empty``.
    """
    TARGET_NAME = 'target'
    feat_names = [col for col in df.columns if col != TARGET_NAME]
    types = [get_feature_type(df[col], include_binary=True) for col in feat_names]

    typed_features = list(zip(feat_names, types))
    num_col = [name for name, kind in typed_features if kind == 'continuous']
    bin_col = [name for name, kind in typed_features if kind == 'binary']
    cat_col = [name for name, kind in typed_features if kind == 'categorical']

    dummy_col = pd.get_dummies(data=df, columns=cat_col)
    add_col = dummy_col.shape[1] - df.shape[1]

    n_rows, n_cols = df.shape
    accepted = (
        add_col < n_rows * 0.3
        and dummy_col.shape[1] < n_rows
        and n_rows < 10000
        and n_cols < 100
    )
    if not accepted:
        # Return empty instances, not the DataFrame class itself.
        return pd.DataFrame(), pd.DataFrame()

    # regex=False: with regex=True a lone '.' is a wildcard on pandas >= 2.0
    # and would turn every character of every name (incl. 'target') into '_'.
    dummy_col.columns = dummy_col.columns.str.replace('.', '_', regex=False)
    y = dummy_col[TARGET_NAME]
    X = dummy_col.loc[:, dummy_col.columns != TARGET_NAME]

    rows_data, columns_data = X.shape
    print('Dataset Information')
    print('Rows:',rows_data,)
    print('Columns:',columns_data)
    print('Number of classes:',y.nunique())
    print('Continous columns:', len(num_col))
    print('Binary columns:', len(bin_col))
    print('Categorical columns:',len(cat_col))
    print('-------------------------------------------------')
    return y, X

In [4]:
def split_function(y,X,it):
    """Make a stratified 80/20 train/test split and standardise the features.

    Parameters
    ----------
    y : target values, also used for stratification.
    X : pandas.DataFrame of features.
    it : value passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The scaler is fitted on the
        training part only and applied to both parts; the scaled features
        are wrapped in new DataFrames that reuse the original column names.
        NOTE(review): those new frames are built without an explicit index,
        so (assuming the scaler returns plain arrays) their row labels no
        longer match ``y_train`` / ``y_test`` - confirm nothing downstream
        aligns on the index.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=it, stratify=y
    )
    feature_names = X_train.columns

    # These attributes are set on the unscaled frames, which are not returned.
    X_test.name = "X_test"
    X_train.name = "X_train"

    scaler = StandardScaler()
    scaled_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
    scaled_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
    return scaled_train, scaled_test, y_train, y_test

In [5]:
 #classification_dataset_names = classification_dataset_names[30:33]

In [6]:
def experimentation(classification_dataset,iters):
    """Run the rule-feature benchmark on one PMLB dataset.

    The dataset is fetched with ``fetch_data`` and prepared by
    ``corr_data_for``; when the returned ``X`` reports ``empty`` an empty
    dict is returned. Otherwise, for each ``it`` in ``range(iters)``:

    1. split and scale the data with ``split_function(y, X, it)``;
    2. call ``generate_tree`` and store the mean of each non-empty
       performance list under ``(dataset, tree_name, it, 1)``;
    3. build rule-based feature sets with ``gen_train_and_test_features``
       from the models that pass the filter below;
    4. score every pipeline (LN, SVM, NB, KNN) on the full rule sets (key
       suffix ``'all'``) and on ``SelectKBest`` subsets sized by ``factors``;
    5. score every pipeline on the plain scaled features as a baseline.

    Parameters
    ----------
    classification_dataset : dataset name passed to ``fetch_data``.
    iters : number of train/test splits; the split index is also the seed.

    Returns
    -------
    dict
        Maps ``(dataset, label, it, size)`` tuples to the value returned by
        the pipeline (or the mean tree performance). ``size`` is ``1``,
        ``'all'`` or one of the factors.
    """
    res_rul = {}
    # Labels paired positionally (via zip) with the outputs of generate_tree.
    names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
    # Short labels and the pipeline callables they refer to, in matching order.
    algorithms = ['LN','SVM','NB','KNN']
    pipelines = [LN_pipeline,SVM_pipeline,NB_pipeline,KNN_pipeline]

    df = fetch_data(classification_dataset)
    print('Numer of NANs: ',df.isna().sum().sum())
    y, X = corr_data_for(df)

    del df
    # corr_data_for signals a rejected dataset through X.empty.
    if X.empty:
        return {}

    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
    for it in range(iters):
        X_train, X_test, y_train, y_test = split_function(y,X,it)
        col_len = len(X_train.columns)
        # Multipliers of the original feature count used as SelectKBest sizes.
        factors = [0.5,1,1.2,1.4,1.6,1.8,2,2.5,3]

        # generate_tree is defined outside this file; it is unpacked here into
        # a sequence of models and a sequence of performance lists.
        models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns),  max_iter_hy=2,sub_paths=True,depth_grid=range(3,4), depth_grid_hy=range(1,3), complexity_bi = 0.001, complexity_hy=0.001,  Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=False)

        # Mean performance per tree type; empty performance entries are skipped.
        for perf,name in zip(performance,names):
            if not not perf:
                res_rul[(classification_dataset,name,it,1)] = sum(perf) / len(perf)

        # Keep only models that are non-empty, all truthy and contain no None.
        act_name = []
        act_rules = []
        for model,name in zip(models,names):
            if (all(model)) & (not not model) & (None not in model):
                act_name.append(name)
                act_rules.append(model)

        # Per model name: [0] = (train, test) with rules and original features,
        # [1] = (train, test) with rules only (as unpacked below).
        datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)

        for model in datasets.keys():
            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]

            # Scores on the complete, unselected feature sets.
            for algorithm,pipeline in zip(algorithms,pipelines):
                res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,'all')] = pipeline(X_train_only_rules, X_test_only_rules, y_train, y_test)
                res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,'all')] = pipeline(X_train_rules_and_features, X_test_rules_and_features, y_train, y_test)

            for fact in factors:
                # Skip factors that would need more columns than there are
                # training rows or more columns than are available.
                if (round(len(X_train_rules_and_features.columns)*fact) <= X_train.shape[0]) & (round(col_len*fact) <= len(X_train_rules_and_features.columns)):
                    min_feat_rule = round(col_len*fact)

                    # Too few rule columns for this factor: use all of them and
                    # file the rules-only result under size 1.
                    # NOTE(review): every such factor writes the same
                    # (..., '_rules', it, 1) key, so later factors overwrite
                    # earlier ones, including a genuine fact == 1 result.
                    # Confirm this is intended.
                    if (round(col_len*fact) > len(X_train_only_rules.columns)) & (fact != 0.5):
                        len_rule = 1
                        min_rule = len(X_train_only_rules.columns)
                    else:
                        len_rule = fact
                        min_rule = min(round(col_len*fact),len(X_train_only_rules.columns))

                    # Column names kept by SelectKBest (default score function).
                    cols = SelectKBest(k=min_feat_rule).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                    cols_rule = SelectKBest(k=min_rule).fit(X_train_only_rules,y_train).get_feature_names_out()

                else:
                     continue

                for algorithm,pipeline in zip(algorithms,pipelines):
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,len_rule)] = pipeline(X_train_only_rules[cols_rule], X_test_only_rules[cols_rule], y_train, y_test)
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,fact)] = pipeline(X_train_rules_and_features[cols], X_test_rules_and_features[cols], y_train, y_test)

        # Baseline: each pipeline on the scaled original features only.
        for algorithm,pipeline in zip(algorithms,pipelines):
            res_rul[(classification_dataset,algorithm,it,1)] = pipeline(X_train, X_test, y_train, y_test)
        del X_train, X_test
    return res_rul

In [7]:
# for data in classification_dataset_names:
#     data = fetch_data(data)
#     print(data.shape)

In [7]:
# Run the benchmark once per PMLB classification dataset (a single split
# each) and merge the per-dataset results into one dictionary, `orig`.
orig = dict()
for classification_dataset in classification_dataset_names:
    res_rul = experimentation(classification_dataset, iters=1)
    for result_key, score in res_rul.items():
        orig[result_key] = score

Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 1600
Columns: 60
Number of classes: 2
Continous columns: 0
Binary columns: 0
Categorical columns: 20
-------------------------------------------------
[1m

    ----------------------------------------- GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1 -----------------------------------------[0m




Classification CART mean performance:  0.50625


Classification OCT performance:  0.65


Numer of NANs:  0
Dataset Information
Rows: 1600
Columns: 56
Number of classes: 2
Continous columns: 0
Binary columns: 2
Categorical columns: 18
-------------------------------------------------
[1m

    ----------------------------------------- GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1 -----------------------------------------[0m
Classification CART mean performance:  0.50625


Classification OCT performance:  0.8




KeyboardInterrupt: 

In [8]:
# Show the raw results: keys are (dataset, label, iteration, size) tuples.
orig

{('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'Reg-CART', 0, 1): nan,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'CART', 0, 1): 0.50625,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'ORT', 0, 1): nan,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'OCT', 0, 1): 0.65,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'ORT-H', 0, 1): nan,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'OCT-H', 0, 1): nan,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_LN_rules',
  0,
  'all'): 0.50625,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_LN_rules_and_features',
  0,
  'all'): 0.471875,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_SVM_rules',
  0,
  'all'): 0.50625,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_SVM_rules_and_features',
  0,
  'all'): 0.55,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_NB_rules',
  0,
  'all'): 0.4875,
 ('GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1',
  'CART_NB_rules_and_features',
  0,
  'all

In [None]:
# from joblib import delayed
# from tqdm import tqdm
# res_rul = ProgressParallel(n_jobs=4)(delayed(experimentation)(data) for data in classification_dataset_names)

# result = {}
# for d in res_rul:
#     result.update(d)

In [None]:
# import os
# files = os.listdir('C:/Users/paulr/PycharmProjects/pythonProject/ORRFA-2/')

In [None]:
# list_of_dfs = []
# for file in files:
#     if file.endswith('pickle'):
#         with open(file, 'rb') as handle:
#             b = pickle.load(handle)
#             # df = pd.concat({k:json_normalize(v, 'scores', ['best']) for k,v in d.items()})
#             # df = df.reset_index(level=1, drop=True).rename_axis('names').reset_index()
# #             list_of_dfs.append(b)
# big_df = pd.concat(list_of_dfs, ignore_index=True)#ignore_index to reset index of big_df
# big_df.head()

In [None]:
# Reshape the flat results dictionary into a table.
# The 4-tuple keys of `orig` become four column levels of a one-row frame.
# Level 2 is stacked into the row index twice (the iteration first, then the
# size that has moved into position 2), the two remaining column levels are
# swapped, and the dummy outer row level (the single 0) is dropped.
k = (
    pd.DataFrame(orig, index=[0])
    .stack(level=2)
    .sort_index()
    .stack(level=2)
    .sort_index()
    .swaplevel(axis=1)
    .droplevel(0)
)

In [None]:
# Show the reshaped results table.
k

In [None]:
# k.to_csv('result_girdsearch.csv')

In [None]:
# Average over the inner row level, then over the inner column level, and
# rank the columns of the second remaining row from best to worst.
# `mean(level=...)` was removed in pandas 2.0; groupby(level=..., sort=False)
# is the equivalent and keeps the groups in order of first appearance.
(
    k.swaplevel(axis=0)
    .groupby(level=0, sort=False).mean()
    .T.groupby(level=0, sort=False).mean().T
    .iloc[1]
    .sort_values(ascending=False)
)

In [None]:
# Mean score per outer column level of `k`, averaged over all rows and
# ranked from best to worst.
# `mean(level=..., axis=1)` was removed in pandas 2.0; grouping the transposed
# frame by the same level is the equivalent.
t = k.T.groupby(level=0, sort=False).mean().T
t = t.mean(axis=0)
t.sort_values(ascending = False)

In [None]:
# Per-row variance of the scores within each group of the swapped outer
# column level, averaged over rows; groups whose mean variance is below 0.01
# are kept in `good`.
# `var(level=..., axis=1)` was removed in pandas 2.0; grouping the transposed
# frame by the same level is the equivalent.
y = k.swaplevel(axis=1)
y = y.T.groupby(level=0, sort=False).var().T
y = y.mean(axis=0)
good_tests = y[y < 0.01].index
good = list(good_tests)

In [None]:
# Keep only the columns whose inner level is listed in `good`, then average
# per outer column level and over all rows.
# `mean(level=..., axis=1)` was removed in pandas 2.0; grouping the transposed
# frame by the same level is the equivalent.
vaild_results = k.iloc[:,k.columns.isin(good, level=1)]
vaild_results = vaild_results.T.groupby(level=0, sort=False).mean().T
vaild_results.mean(axis=0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 5 x 4 grid of box plots, one panel for each of the first 20 dataset names.
fig, ax = plt.subplots(nrows = 5, ncols = 4, gridspec_kw = {"hspace": 0.25})
fig.set_size_inches(30, 25)

# zip stops at the shorter sequence, so fewer than 20 datasets is not an error.
for panel, dataset in zip(ax.flat, classification_dataset_names[:20]):
    panel.set_title(dataset)

    # Columns of `k` are tuples; keep those that contain this dataset name.
    columns = [i for i in k.columns if dataset in i]
    if not columns:
        # No results for this dataset (rejected, or the run was interrupted).
        continue

    # Label each box with the rest of its column key instead of a fixed list
    # of five names, so the labels always match the boxes that are drawn.
    panel_data = k[columns].copy()
    panel_data.columns = [
        " ".join(str(part) for part in column if part != dataset)
        for column in columns
    ]
    sns.boxplot(data=panel_data, ax=panel)



In [None]:
# Single summary box plot, saved to 'Benchmark ORRFA.png' in the working directory.
# NOTE(review): `performance_by_iter` is not defined in any visible cell of this
# notebook, so this cell presumably relies on leftover kernel state - confirm
# where it comes from or rebuild it from `k`.
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots()
sns.boxplot(data = performance_by_iter)
fig.set_size_inches(20, 10)
# Relabel the x ticks with the column names of the plotted frame.
ax.set_xticklabels(performance_by_iter.columns.values)
# ax.set_ylim(0.93, 0.995)
ax.tick_params(rotation = 0, labelsize = 14)
ax.set_ylabel("Accuracy", fontsize = 14)
ax.set_title("Accuracy of Logistic Regression, RuleFit and ORRFA", fontsize = 15)
# ax.set_ylabel()
plt.savefig('Benchmark ORRFA.png')

In [None]:
# Script version of the benchmark: 5 splits per dataset, tree depths 1-6, and
# 'all' handled as one more entry of `factors`. Results are collected in
# `res_rul` under (dataset, label, iteration, size) keys.
iters=5
res_rul = {}
names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']

# (key infix, baseline label, pipeline) for every downstream classifier.
classifier_pipelines = [
    ("LG", 'Logistic_Regression', log_regression_pipeline),
    ("SVM", "Support Vector Machine", SVM_pipeline),
    ("NB", "Naive Bayes", NB_pipeline),
    ("KNN", "K-Nearest-Neighbor", KNN_pipeline),
]
# 'all' keeps every column; numbers are multiples of the original feature count.
factors = ['all',0.5,1,1.2,1.4,1.6,1.8,2,2.5,3]

for classification_dataset in classification_dataset_names:
    df = fetch_data(classification_dataset)
    # corr_data_for returns (y, X); it applies the size limits itself, prints
    # the dataset summary, and signals a rejected dataset through X.empty.
    y, X = corr_data_for(df)
    if X.empty:
        continue

    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)

    for it in range(iters):
        # Stratified 80/20 split plus scaling, shared with experimentation().
        X_train, X_test, y_train, y_test = split_function(y, X, it)
        col_len = len(X_train.columns)

        models, performance = generate_tree(X_train, y_train, X_test, y_test, n_num=1, feat_size=len(X.columns),  max_iter_hy=2,sub_paths=True,depth_grid=range(1,7), depth_grid_hy=range(1,3), complexity_bi = 0.001, complexity_hy=0.001,  Reg_CART=False, ORT=False, ORT_H=False, Clas_CART=True, OCT=True, OCT_H=False)

        # Mean performance per tree type; empty performance entries are skipped.
        for perf,name in zip(performance,names):
            if perf:
                res_rul[(classification_dataset,name,it,1)] = sum(perf) / len(perf)

        # Keep only models that are non-empty, all truthy and contain no None.
        act_name = []
        act_rules = []
        for model,name in zip(models,names):
            if (all(model)) & (not not model) & (None not in model):
                act_name.append(name)
                act_rules.append(model)

        datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)
        for model in datasets.keys():
            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]
            n_rule_cols = len(X_train_only_rules.columns)

            for len_c in factors:
                if len_c == 'all':
                    len_rule = 'all'
                    min_feat_rule = 'all'
                    min_rule = 'all'
                elif (round(col_len*len_c) <= X_train.shape[0]) and (round(col_len*len_c) <= len(X_train_rules_and_features.columns)):
                    min_feat_rule = round(col_len*len_c)
                    # The factor, not the column count, is compared with 0.5
                    # (same rule as in experimentation()).
                    if (min_feat_rule > n_rule_cols) and (len_c != 0.5):
                        len_rule = 1
                        min_rule = n_rule_cols
                    else:
                        len_rule = len_c
                        min_rule = min(min_feat_rule, n_rule_cols)
                else:
                    # Not enough rows or columns for this factor.
                    continue

                cols = SelectKBest(k=min_feat_rule).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                X_train_rules_features = X_train_rules_and_features[cols]
                X_test_rules_features = X_test_rules_and_features[cols]

                cols_1 = SelectKBest(k=min_rule).fit(X_train_only_rules,y_train).get_feature_names_out()
                X_train_rules = X_train_only_rules[cols_1]
                X_test_rules = X_test_only_rules[cols_1]

                # Rules-only first, then rules plus features, per classifier.
                for infix, _, pipeline in classifier_pipelines:
                    only_rules_acc = pipeline(X_train_rules, X_test_rules, y_train, y_test)
                    rules_and_features_acc = pipeline(X_train_rules_features, X_test_rules_features, y_train, y_test)
                    res_rul[(classification_dataset,model + "_" + infix + "_rules",it,len_rule)] = only_rules_acc
                    res_rul[(classification_dataset,model + "_" + infix + "_rules_and_features",it,len_c)] = rules_and_features_acc

        # Baseline: each classifier on the scaled original features only.
        for _, baseline_label, pipeline in classifier_pipelines:
            res_rul[(classification_dataset,baseline_label,it,1)] = pipeline(X_train, X_test, y_train, y_test)

In [None]:
# Show the results dictionary currently bound to `res_rul`.
res_rul