# New scikit-FIBERS Run

## Installation and Imports 
How to setup and use scikit-FIBERS

In [1]:
import os
import copy
import dask
import pickle
import pandas as pd
from lifelines import CoxPHFitter
from src.skfibers import FIBERS
from src.skfibers.methods.data_handling import prepare_data
from sklearn.metrics import accuracy_score
from dask.distributed import Client
from dask_jobqueue import SLURMCluster, LSFCluster, SGECluster

## Loading dataset

### Setting Variables for General Experiment

In [2]:
# dataset_name_list = os.listdir('PPSNDatasets')
dataset_name_list = [   'standard_with_noise.csv',
                        'standard_no_noise.csv',
                        'threshold_0_no_noise.csv',
                        'threshold_0_with_noise.csv',
                        'threshold_1_no_noise.csv',
                        'threshold_1_with_noise.csv',
                        'threshold_2_no_noise.csv',
                        'threshold_2_with_noise.csv',
                        'threshold_4_no_noise.csv',
                        'threshold_4_with_noise.csv',]
experiment_list = ['Goal1', 'Goal2', 'Goal3', 'Goal4', 'Goal5', 'Testing']

In [3]:
root_folder = 'PPSNResults/FIBERS2/'
dataset_name = dataset_name_list[0]
experiment_name = experiment_list[4]

In [4]:
# for experiment_name in experiment_list:
#     for dataset_name in dataset_name_list:
#         try:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             os.makedirs(folder)
#         except FileExistsError:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             print("Folder Already Exists:" + folder)

In [5]:
def read_dataset(dataset_name):
    data = pd.read_csv('PPSNDatasets/'+ dataset_name)
    true_risk_group = data[['TrueRiskGroup']]
    data = data.drop('TrueRiskGroup', axis=1)
    return data, true_risk_group

In [6]:
# data, true_risk_group = read_dataset(dataset_name)

## Code For Single FIBERS Run

In [7]:
# Pop Size = 50, Iterations = 100, crossover_prob = 0.5, min mutation = 0.1, 
# elitism = 0.1, min_bin_size = 1, max initial bin size = 10, group_strata_min = 0.2.

In [8]:
# min mutation = 0.1 and max mutation prob =0.1 (effectively turning off oscillating mutation rate),
#  and merge prob = 0, group_thresh = 0, fitness metric = log_rank, and diversity_pressure = 0

In [9]:
# fibers = FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = ‘group_strata’
#                 crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.1, merge_prob=0.0, new_gen=1.0, elitism=0.1,
#                 diversity_pressure=0, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
#                 log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=0, min_thresh=0, max_thresh=3, 
#                 int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=42, verbose=False)
# fibers = fibers.fit(data) 

In [10]:
from src.skfibers.methods.data_handling import prepare_data
def get_experiment_output(fibers, X, y=None, dataset=None, filename=None):
        columns = ["Features in Bin", "Number of P", "Number of R", 
                   "Bin Size", "Pred Ratio", "Birth Iteration",
                   "Iterations to Ideal Solution", 
                   "Log-Rank Score",
                   "Unadjusted HR", "HR CI", "HR P-value", "Runtime",
                   "Count At/Below Threshold",
                   "Count Above Threshold", "Group Ratio", 
                   "Log-Rank p-value", "Threshold", 
                   "Accuracy",
                   "Residual",
                   "Residual p-value", 
                   "Dataset Filename"]
        X = fibers.check_x_y(X, None)
        X, feature_names = prepare_data(X, fibers.outcome_label, fibers.censor_label, fibers.covariates)
        assert (feature_names == fibers.feature_names)

        Bin = fibers.get_top_bins()[0]

        # Sum instance values across features specified in the bin
        feature_sums = X.loc[:, fibers.feature_names][Bin.feature_list].sum(axis=1)
        bin_df = pd.DataFrame({'Bin':feature_sums})

        bin_df['Bin'] = bin_df['Bin'].apply(lambda x: 0 if x <= Bin.group_threshold else 1)

        # Create evaluation dataframe including bin sum feature, outcome, and censoring alone
        bin_df = pd.concat([bin_df, X.loc[:, fibers.outcome_label], X.loc[:, fibers.censor_label]],axis=1)
        try:
            cph = CoxPHFitter()
            cph.fit(bin_df, fibers.outcome_label,event_col=fibers.censor_label, show_progress=False)
            summary = cph.summary
            Bin.HR = summary['exp(coef)'].iloc[0]
            Bin.HR_CI = str(summary['exp(coef) lower 95%'].iloc[0])+'-'+str(summary['exp(coef) upper 95%'].iloc[0])
            Bin.HR_p_value = summary['p'].iloc[0]
        except:
            Bin.HR = 0
            Bin.HR_CI = None
            Bin.HR_p_value = None

        # summary = fibers.get_cox_prop_hazard(X, 0)
        # bin_hr = summary['exp(coef)'].iloc[0]
        # bin_low_CI = summary['exp(coef) lower 95%'].iloc[0]
        # bin_upper_CI = summary['exp(coef) upper 95%'].iloc[0]
        # bin_p_val = summary['p'].iloc[0]
        # print("Bin HR: "+str(bin_hr)+" ("+str(bin_low_CI)+"-"+str(bin_upper_CI)+")")
        # print("Bin HR p-value: "+str(bin_p_val))

        pdf = pd.DataFrame([[Bin.feature_list,
                             str(Bin.feature_list).count('P'), str(Bin.feature_list).count('R'), 
                             Bin.bin_size, str(Bin.feature_list).count('P')/Bin.bin_size, 
                             Bin.birth_iteration,
                             None if str(Bin.feature_list).count('P') != 10 else Bin.birth_iteration,
                             Bin.log_rank_score, 
                             Bin.HR, Bin.HR_CI, Bin.HR_p_value, fibers.elapsed_time,
                             Bin.count_at,
                             Bin.count_bt, Bin.count_at/(Bin.count_at+Bin.count_bt), 
                             Bin.log_rank_p_value, Bin.group_threshold, 
                             accuracy_score(fibers.predict(X, 0), y) if y is not None else None,
                             Bin.residuals_score, Bin.residuals_p_value, dataset]],
                           columns=columns).T  # SPHIA
        
        if filename:
            pdf.to_csv(filename)
        return pdf

In [11]:
# get_experiment_output(fibers, data, true_risk_group, dataset_name, root_folder + '/' + experiment_name
#                                                + '/' + dataset_name.split('.')[0] + '/experiment_table.csv')

### Accessing results and internal functions

In [12]:
# experiment_results = get_experiment_output(fibers, data, true_risk_group)
# experiment_results

## Code for Experiment Run

In [13]:
def get_cluster(cluster_type='SLURM', output_path=".", queue='defq', memory=4):
    client = None
    try:
        if cluster_type == 'SLURM':
            cluster = SLURMCluster(queue=queue,
                                   cores=1,
                                   memory=str(memory) + "G",
                                   walltime="24:00:00",
                                   log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == "LSF":
            cluster = LSFCluster(queue=queue,
                                 cores=1,
                                 mem=memory * 1000000000,
                                 memory=str(memory) + "G",
                                 walltime="24:00",
                                 log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == 'UGE':
            cluster = SGECluster(queue=queue,
                                 cores=1,
                                 memory=str(memory) + "G",
                                 resource_spec="mem_free=" + str(memory) + "G",
                                 walltime="24:00:00",
                                 log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == 'Local':
            c = Client()
            cluster = c.cluster
        else:
            raise Exception("Unknown or Unsupported Cluster Type")
        client = Client(cluster)
    except Exception as e:
        print(e)
        raise Exception("Exception: Unknown Exception")
    print("Running dask-cluster")
    print(client.scheduler_info())
    return client

In [14]:
def runner_fn(fibers, params):
    data, true_risk_group = read_dataset(params['dataset_name'])
    params['data'] = data
    params['label'] = true_risk_group
    fibers = fibers.fit(params['data']) 
    experiment_results = get_experiment_output(fibers, params['data'], params['label'], params['dataset_name'],
                                               params['root_folder'] + '/' + params['experiment_name'] 
                                               + '/' + params['dataset_name'].split('.')[0] 
                                               + '/models/model_table_' + str(fibers.random_seed) + '.csv')
    with open(params['root_folder'] + '/' + params['experiment_name'] 
              + '/' + params['dataset_name'].split('.')[0] + '/models/' + str(fibers.random_seed), 'wb') as file:
        pickle.dump(fibers, file)
    print(params)
    return experiment_results

In [15]:
fibers_list = [
    
    FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = 'group_strata',
                    crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.1, merge_prob=0.0, new_gen=1.0, elitism=0.1,
                    diversity_pressure=0, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
                    log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=0, min_thresh=0, max_thresh=5, 
                    int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=None, verbose=False),
    FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = 'group_strata',
                    crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.1, merge_prob=0.1, new_gen=1.0, elitism=0.1,
                    diversity_pressure=0, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
                    log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=0, min_thresh=0, max_thresh=5, 
                    int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=None, verbose=False),
    FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = 'group_strata',
                    crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.5, merge_prob=0.0, new_gen=1.0, elitism=0.1,
                    diversity_pressure=0, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
                    log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=0, min_thresh=0, max_thresh=5, 
                    int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=None, verbose=False),
    FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = 'group_strata',
                    crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.5, merge_prob=0.0, new_gen=1.0, elitism=0.1,
                    diversity_pressure=0, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
                    log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=None, min_thresh=0, max_thresh=5, 
                    int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=None, verbose=False),
    FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2, pop_clean = 'group_strata',
                    crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.5, merge_prob=0.0, new_gen=1.0, elitism=0.1,
                    diversity_pressure=3, min_bin_size=1, max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank", 
                    log_rank_weighting=None, censor_label="Censoring", group_strata_min=0.2, penalty=0.5, group_thresh=None, min_thresh=0, max_thresh=5, 
                    int_thresh=True, thresh_evolve_prob=0.5, manual_bin_init=None, covariates=None, report=None, random_seed=None, verbose=False),                       
                ]

In [16]:
dataset_name_list

['standard_with_noise.csv',
 'standard_no_noise.csv',
 'threshold_0_no_noise.csv',
 'threshold_0_with_noise.csv',
 'threshold_1_no_noise.csv',
 'threshold_1_with_noise.csv',
 'threshold_2_no_noise.csv',
 'threshold_2_with_noise.csv',
 'threshold_4_no_noise.csv',
 'threshold_4_with_noise.csv']

In [17]:
param_grid = [
        ("Goal1", dataset_name_list[0], fibers_list[0]),
        ("Goal1", dataset_name_list[1], fibers_list[0]), 
        ("Goal2", dataset_name_list[0], fibers_list[1]), 
        ("Goal2", dataset_name_list[1], fibers_list[1]), 
        ("Goal3", dataset_name_list[0], fibers_list[2]), 
        ("Goal3", dataset_name_list[1], fibers_list[2]), 
        ("Goal4", dataset_name_list[2], fibers_list[3]), 
        ("Goal4", dataset_name_list[3], fibers_list[3]), 
        ("Goal4", dataset_name_list[4], fibers_list[3]), 
        ("Goal4", dataset_name_list[5], fibers_list[3]), 
        ("Goal4", dataset_name_list[6], fibers_list[3]), 
        ("Goal4", dataset_name_list[7], fibers_list[3]), 
        ("Goal4", dataset_name_list[8], fibers_list[3]), 
        ("Goal4", dataset_name_list[9], fibers_list[3]), 
        ("Goal5", dataset_name_list[4], fibers_list[4]), 
        ("Goal5", dataset_name_list[5], fibers_list[4]), 
    ]

In [18]:
# for experiment_name in experiment_list:
#     for dataset_name in dataset_name_list:
DEBUG = False
if DEBUG:
    import shutil
    try:
        shutil.rmtree(root_folder)
    except:
        pass
for experiment_name, dataset_name, _ in param_grid:
    try:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        os.makedirs(folder + '/models/')
    except FileExistsError:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        print("Folder Already Exists:" + folder)

        continue

Folder Already Exists:PPSNResults/FIBERS2//Goal1/standard_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal1/standard_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal2/standard_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal2/standard_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal3/standard_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal3/standard_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_0_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_0_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_1_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_1_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_2_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_2_with_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_4_no_noise/
Folder Already Exists:PPSNResults/FIBERS2//Goal4/threshold_4_with_noise/
Folder Alr

In [19]:
def make_obj_list(fibers, param):  
    obj_list = list()
    permutations = 20
    if DEBUG:
        permutations = 2
    for i in range(permutations):
        fibers.random_seed = i+1
        obj_list.append((copy.deepcopy(fibers), param))
    return obj_list

In [20]:
def param_maker(dataset_name, experiment_name):
    param_dict = {
        'root_folder': root_folder,
        'dataset_name': dataset_name,
        'experiment_name': experiment_name,
    }
    return param_dict

In [21]:
job_list = list()
for experiment_name, dataset_name, fibers in param_grid: 
    job_list.extend(make_obj_list(fibers, param_maker(dataset_name, experiment_name)))

In [22]:
client = get_cluster('SLURM')

Running dask-cluster
{'type': 'Scheduler', 'id': 'Scheduler-b5e593a9-fb65-446d-8d16-4ecc80412974', 'address': 'tcp://10.17.134.112:36363', 'services': {'dashboard': 45625}, 'started': 1713417188.9243107, 'workers': {}}


Perhaps you already have a cluster running?
Hosting the HTTP server on port 45625 instead


In [23]:
results = dask.compute([dask.delayed(runner_fn)(fibers_obj, params
                                            ) for fibers_obj, params in job_list])

In [24]:
result_df = pd.concat(results[0], axis=1, ignore_index=False).T

In [25]:
result_df

Unnamed: 0,Features in Bin,Number of P,Number of R,Bin Size,Pred Ratio,Birth Iteration,Iterations to Ideal Solution,Log-Rank Score,Unadjusted HR,HR CI,...,Runtime,Count At/Below Threshold,Count Above Threshold,Group Ratio,Log-Rank p-value,Threshold,Accuracy,Residual,Residual p-value,Dataset Filename
0,"[P_1, P_10, P_3, P_4, P_5, P_7, P_8]",7,0,7,1.0,29,,1555.118393,2.413567,2.3073349398995586-2.5246904024552883,...,172.681008,4928,5072,0.4928,0.0,0,0.9928,,,standard_with_noise.csv
0,"[P_1, P_10, P_3, P_4, P_5, P_7, P_8]",7,0,7,1.0,13,,1555.118393,2.413567,2.3073349398995586-2.5246904024552883,...,184.160742,4928,5072,0.4928,0.0,0,0.9928,,,standard_with_noise.csv
0,"[P_1, P_10, P_3, P_4, P_5, P_7, P_8]",7,0,7,1.0,30,,1555.118393,2.413567,2.3073349398995586-2.5246904024552883,...,186.027752,4928,5072,0.4928,0.0,0,0.9928,,,standard_with_noise.csv
0,"[P_1, P_10, P_3, P_4, P_5, P_8, P_7]",7,0,7,1.0,19,,1555.118393,2.413567,2.3073349398995586-2.5246904024552883,...,162.984207,4928,5072,0.4928,0.0,0,0.9928,,,standard_with_noise.csv
0,"[P_1, P_10, P_3, P_4, P_5, P_7, P_8]",7,0,7,1.0,21,,1555.118393,2.413567,2.3073349398995586-2.5246904024552883,...,167.338677,4928,5072,0.4928,0.0,0,0.9928,,,standard_with_noise.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,"[P_1, P_2, P_3, P_4, P_5, P_6, P_7, P_8, P_9]",9,0,9,1.0,80,,1676.757094,2.505191,2.3943569808923995-2.62115471804492,...,193.227539,4927,5073,0.4927,0.0,1,0.9927,,,threshold_1_with_noise.csv
0,"[P_1, P_10, P_2, P_3, P_4, P_5, P_6, P_7, P_9,...",10,0,10,1.0,41,41,1702.099677,2.522881,2.4112109166731734-2.6397221807476394,...,186.783333,5000,5000,0.5,0.0,1,1.0,,,threshold_1_with_noise.csv
0,"[P_1, P_2, P_3, P_4, P_5, P_6, P_7, P_8, P_9]",9,0,9,1.0,75,,1676.757094,2.505191,2.3943569808923995-2.62115471804492,...,199.781475,4927,5073,0.4927,0.0,1,0.9927,,,threshold_1_with_noise.csv
0,"[P_1, P_2, P_3, P_4, P_5, P_6, P_7, P_9]",8,0,8,1.0,35,,1650.787898,2.487886,2.377830522149448-2.603035047344659,...,199.346936,4821,5179,0.4821,0.0,1,0.9821,,,threshold_1_with_noise.csv
