# New scikit-FIBERS Run

## Installation and Imports 
How to set up and use scikit-FIBERS.

In [1]:
import os
import copy
import dask
import pickle
import pandas as pd
from lifelines import CoxPHFitter
from src.skfibers import FIBERS
from src.skfibers.methods.data_handling import prepare_data
from sklearn.metrics import accuracy_score
from dask.distributed import Client
from dask_jobqueue import SLURMCluster, LSFCluster, SGECluster

## Loading dataset

### Setting Variables for General Experiment

In [2]:
# Dataset CSV file names; read_dataset() resolves them relative to 'PPSNDatasets/'.
dataset_name_list = [f'covariates_dataset_{idx}.csv' for idx in (1, 2, 3)]
# Experiment identifiers; each one becomes a sub-folder name under the results root.
experiment_list = ['Goal6']

In [3]:
# Root of the results tree; experiment/dataset sub-folders are appended to it later on.
root_folder = 'PPSNResults/FIBERS2/'
# Defaults for the single-run cells: the first dataset and the first experiment.
dataset_name, experiment_name = dataset_name_list[0], experiment_list[0]
# Column names passed to FIBERS through its `covariates` argument.
covariates = ['C_1', 'C_2']

In [4]:
# for experiment_name in experiment_list:
#     for dataset_name in dataset_name_list:
#         try:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             os.makedirs(folder)
#         except FileExistsError:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             print("Folder Already Exists:" + folder)

In [5]:
def read_dataset(dataset_name):
    """Load one dataset CSV and split off its three auxiliary time columns.

    Parameters
    ----------
    dataset_name : str
        File name (with extension); it is appended to ``'PPSNDatasets/'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First the dataset without the auxiliary columns, then a frame holding
        only ``patient_censoring_time``, ``administrative_censoring_time`` and
        ``graft_failure_time``.
    """
    auxiliary_columns = [
        'patient_censoring_time',
        'administrative_censoring_time',
        'graft_failure_time',
    ]
    full_table = pd.read_csv('PPSNDatasets/' + dataset_name)
    side_table = full_table[auxiliary_columns]
    main_table = full_table.drop(columns=auxiliary_columns)
    return main_table, side_table

In [6]:
# Load the default dataset; `labels` receives the three auxiliary time columns that read_dataset splits off.
data, labels = read_dataset(dataset_name)

## Code For Single FIBERS Run

In [7]:
# Pop Size = 50, Iterations = 100, crossover_prob = 0.5, min mutation = 0.1, 
# elitism = 0.1, min_bin_size = 1, max initial bin size = 10, group_strata_min = 0.2.

In [8]:
# min mutation = 0.1 and max mutation prob =0.1 (effectively turning off oscillating mutation rate),
#  and merge prob = 0, group_thresh = 0, fitness metric = log_rank, and diversity_pressure = 0

In [9]:
# fibers = FIBERS(outcome_label="Duration", outcome_type="survival", iterations=100, pop_size=50, tournament_prop=0.2,
#                 crossover_prob=0.5, min_mutation_prob=0.1, max_mutation_prob=0.5, merge_prob=0.1, 
#                 new_gen=1.0, elitism=0.1, diversity_pressure=0, min_bin_size=1,
#                   max_bin_size=None, max_bin_init_size=10, fitness_metric="log_rank_residuals", censor_label="Censoring", 
#                 group_strata_min=0.2, group_thresh=None, min_thresh=0, max_thresh=5, 
#                 int_thresh=True, covariates=covariates, pop_clean = 'group_strata',  
#                 report=None, random_seed=42, verbose=False)
# fibers = fibers.fit(data)

In [10]:
from src.skfibers.methods.data_handling import prepare_data
def get_experiment_output(fibers, X, y=None, dataset=None, filename=None):
    """Summarise the top bin of a FIBERS model as a one-column metrics table.

    Parameters
    ----------
    fibers : FIBERS
        Model to report on (expected to be fitted, since ``get_top_bins`` and
        ``elapsed_time`` are read). Its ``outcome_label``, ``censor_label``,
        ``covariates`` and ``feature_names`` attributes are used as well.
    X : pandas.DataFrame
        Dataset passed through ``fibers.check_x_y`` and ``prepare_data``
        before the bin is evaluated.
    y : array-like, optional
        Reference labels. When given, the "Accuracy" row holds
        ``accuracy_score(fibers.predict(X, 0), y)``; otherwise it is ``None``.
    dataset : object, optional
        Stored verbatim in the "Dataset Filename" row.
    filename : str, optional
        When truthy, the table is also written to this path with ``to_csv``.

    Returns
    -------
    pandas.DataFrame
        Transposed single-row frame: one row per metric, one column.

    Notes
    -----
    * When ``fibers.covariates`` is set, a Cox model is fitted and the results
      are written onto the top bin object (``adj_HR``, ``adj_HR_CI``,
      ``adj_HR_p_value``), i.e. the bin is mutated.
    * The rows labelled "Unadjusted HR", "HR CI" and "HR P-value" are filled
      from those ``adj_*`` attributes. NOTE(review): labels kept as-is so
      existing CSV consumers keep working; confirm whether they should be
      renamed.
    """
    columns = ["Features in Bin", "Number of P", "Number of R",
               "Bin Size", "Pred Ratio", "Birth Iteration",
               "Iterations to Ideal Solution",
               "Log-Rank Score",
               "Unadjusted HR", "HR CI", "HR P-value", "Runtime",
               "Count At/Below Threshold",
               "Count Above Threshold", "Group Ratio",
               "Log-Rank p-value", "Threshold",
               "Accuracy",
               "Residual",
               "Residual p-value",
               "Dataset Filename"]
    X = fibers.check_x_y(X, None)
    X, feature_names = prepare_data(X, fibers.outcome_label, fibers.censor_label, fibers.covariates)
    assert (feature_names == fibers.feature_names)

    top_bin = fibers.get_top_bins()[0]

    # Sum instance values across the features specified in the bin.
    feature_sums = X.loc[:, fibers.feature_names][top_bin.feature_list].sum(axis=1)
    bin_df = pd.DataFrame({'Bin': feature_sums})

    # Binarise: 0 at or below the bin's group threshold, 1 above it.
    bin_df['Bin'] = bin_df['Bin'].apply(lambda x: 0 if x <= top_bin.group_threshold else 1)

    # Evaluation frame: bin indicator plus the outcome and censoring columns.
    bin_df = pd.concat([bin_df, X.loc[:, fibers.outcome_label], X.loc[:, fibers.censor_label]], axis=1)

    if fibers.covariates is not None:
        try:
            bin_df = pd.concat([bin_df, X.loc[:, fibers.covariates]], axis=1)
            cph = CoxPHFitter()
            cph.fit(bin_df, fibers.outcome_label, event_col=fibers.censor_label, show_progress=False)
            summary = cph.summary
            # Row 0 of the summary is the 'Bin' column, the first covariate in bin_df.
            top_bin.adj_HR = summary['exp(coef)'].iloc[0]
            top_bin.adj_HR_CI = (str(summary['exp(coef) lower 95%'].iloc[0]) + '-'
                                 + str(summary['exp(coef) upper 95%'].iloc[0]))
            top_bin.adj_HR_p_value = summary['p'].iloc[0]
        except Exception:
            # Best-effort fallback when the Cox fit fails. `Exception` (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            top_bin.adj_HR = 0
            top_bin.adj_HR_CI = None
            top_bin.adj_HR_p_value = None

    # Counts of the characters 'P' and 'R' in the printed feature list.
    feature_text = str(top_bin.feature_list)
    p_count = feature_text.count('P')
    r_count = feature_text.count('R')

    row = [top_bin.feature_list,
           p_count, r_count,
           top_bin.bin_size, p_count / top_bin.bin_size,
           top_bin.birth_iteration,
           # Only reported when the bin contains exactly five 'P' characters.
           None if p_count != 5 else top_bin.birth_iteration,
           top_bin.log_rank_score,
           top_bin.adj_HR, top_bin.adj_HR_CI, top_bin.adj_HR_p_value, fibers.elapsed_time,
           top_bin.count_at,
           top_bin.count_bt, top_bin.count_at / (top_bin.count_at + top_bin.count_bt),
           top_bin.log_rank_p_value, top_bin.group_threshold,
           accuracy_score(fibers.predict(X, 0), y) if y is not None else None,
           top_bin.residuals_score, top_bin.residuals_p_value, dataset]

    # Transpose so that each metric becomes a row of a single-column table.
    pdf = pd.DataFrame([row], columns=columns).T

    if filename:
        pdf.to_csv(filename)
    return pdf

In [11]:
# get_experiment_output(fibers, data, None, dataset_name, root_folder + '/' + experiment_name
#                                                + '/' + dataset_name.split('.')[0] + '/experiment_table.csv')

### Accessing results and internal functions

In [12]:
# experiment_results = get_experiment_output(fibers, data)
# experiment_results

## Code for Experiment Run

In [13]:
def get_cluster(cluster_type='SLURM', output_path=".", queue='defq', memory=4):
    """Start a dask cluster of the requested kind and return a connected client.

    Parameters
    ----------
    cluster_type : str
        One of ``'SLURM'``, ``'LSF'``, ``'UGE'`` or ``'Local'``.
    output_path : str
        Base directory; scheduler job logs go to ``output_path + "/dask_logs/"``
        (not used for ``'Local'``).
    queue : str
        Queue name handed to the job-queue cluster (not used for ``'Local'``).
    memory : int
        Memory per job in gigabytes; passed as ``"<memory>G"`` (and, for LSF,
        additionally as ``mem=memory * 1000000000``).

    Returns
    -------
    dask.distributed.Client
        Client connected to the newly created cluster.

    Raises
    ------
    ValueError
        If ``cluster_type`` is not one of the supported values.
    Exception
        If creating the cluster or the client fails; the original error is
        printed and attached as ``__cause__``.
    """
    # Validate before the try-block so the caller sees this message directly
    # instead of the generic re-raise below.
    if cluster_type not in ('SLURM', 'LSF', 'UGE', 'Local'):
        raise ValueError("Unknown or Unsupported Cluster Type")

    client = None
    try:
        log_directory = output_path + "/dask_logs/"
        memory_spec = str(memory) + "G"
        if cluster_type == 'SLURM':
            cluster = SLURMCluster(queue=queue,
                                   cores=1,
                                   memory=memory_spec,
                                   walltime="24:00:00",
                                   log_directory=log_directory)
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == "LSF":
            cluster = LSFCluster(queue=queue,
                                 cores=1,
                                 mem=memory * 1000000000,
                                 memory=memory_spec,
                                 walltime="24:00",
                                 log_directory=log_directory)
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == 'UGE':
            cluster = SGECluster(queue=queue,
                                 cores=1,
                                 memory=memory_spec,
                                 resource_spec="mem_free=" + memory_spec,
                                 walltime="24:00:00",
                                 log_directory=log_directory)
            cluster.adapt(maximum_jobs=400)
        else:  # 'Local'
            # Build the cluster directly so only one Client is ever created for it
            # (previously a throwaway Client() was made just to obtain its cluster).
            from dask.distributed import LocalCluster
            cluster = LocalCluster()
        client = Client(cluster)
    except Exception as e:
        print(e)
        # Chain the cause so the real failure remains visible in the traceback.
        raise Exception("Exception: Unknown Exception") from e
    print("Running dask-cluster")
    print(client.scheduler_info())
    return client

In [14]:
def runner_fn(fibers, params):
    """Fit one FIBERS model on one dataset and persist its outputs.

    Parameters
    ----------
    fibers : FIBERS
        Estimator to fit; its ``random_seed`` is used in the output file names.
    params : dict
        Must contain ``'root_folder'``, ``'experiment_name'`` and
        ``'dataset_name'``. The dict is only read: the same dict object is
        shared by every seed of a dataset, so it is not modified here.

    Returns
    -------
    pandas.DataFrame
        The summary table produced by ``get_experiment_output``.

    Side effects
    ------------
    Writes ``model_table_<seed>.csv`` and a pickle named ``<seed>`` into
    ``<root_folder>/<experiment_name>/<dataset stem>/models/`` (the folder must
    already exist) and prints ``params``.
    """
    # The auxiliary time columns returned second are not needed for fitting.
    data, _ = read_dataset(params['dataset_name'])
    fibers = fibers.fit(data)

    # Build the output location once; both files go into the same folder.
    model_dir = (params['root_folder'] + '/' + params['experiment_name']
                 + '/' + params['dataset_name'].split('.')[0] + '/models/')
    seed_tag = str(fibers.random_seed)

    experiment_results = get_experiment_output(fibers, data, None, params['dataset_name'],
                                               model_dir + 'model_table_' + seed_tag + '.csv')
    with open(model_dir + seed_tag, 'wb') as file:
        pickle.dump(fibers, file)
    # Prints only the lightweight job description (no DataFrames are stored in it).
    print(params)
    return experiment_results

In [15]:
# One estimator per fitness metric, in this order: log_rank, residuals,
# log_rank_residuals. Every other hyperparameter is identical across the three.
fibers_list = [
    FIBERS(
        outcome_label="Duration",
        outcome_type="survival",
        iterations=100,
        pop_size=50,
        tournament_prop=0.2,
        crossover_prob=0.5,
        min_mutation_prob=0.1,
        max_mutation_prob=0.5,
        merge_prob=0.1,
        new_gen=1.0,
        elitism=0.1,
        diversity_pressure=0,
        min_bin_size=1,
        max_bin_size=None,
        max_bin_init_size=10,
        fitness_metric=metric,
        censor_label="Censoring",
        group_strata_min=0.2,
        group_thresh=None,
        min_thresh=0,
        max_thresh=5,
        int_thresh=True,
        covariates=covariates,
        pop_clean='group_strata',
        report=None,
        random_seed=42,
        verbose=False,
    )
    for metric in ("log_rank", "residuals", "log_rank_residuals")
]

In [16]:
# Display the dataset file names that the parameter grid below indexes into.
dataset_name_list

['covariates_dataset_1.csv',
 'covariates_dataset_2.csv',
 'covariates_dataset_3.csv']

In [17]:
# (experiment, dataset file, estimator) triples: dataset i is paired with
# estimator i, all under the "Goal6" experiment.
param_grid = [
    ("Goal6", dataset_name_list[position], fibers_list[position])
    for position in range(3)
]

In [18]:
# Set DEBUG to True to wipe the whole results tree before recreating the folders.
DEBUG = False
if DEBUG:
    import shutil
    # ignore_errors: a results tree that does not exist yet is not a problem.
    shutil.rmtree(root_folder, ignore_errors=True)
for experiment_name, dataset_name, _ in param_grid:
    folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
    if os.path.isdir(folder):
        print("Folder Already Exists:" + folder)
    # makedirs creates the dataset folder as well when needed; exist_ok makes sure
    # models/ is present even if the dataset folder was created earlier without it.
    os.makedirs(folder + '/models/', exist_ok=True)

In [19]:
def make_obj_list(fibers, param, n_runs=20):
    """Create independently seeded copies of an estimator, each paired with ``param``.

    Parameters
    ----------
    fibers : object
        Template estimator. It is deep-copied for every run and is itself left
        unmodified (its own ``random_seed`` is not touched).
    param : dict
        Job description attached to every copy; the same object is reused for
        all entries.
    n_runs : int, default 20
        Number of copies to create; seeds run from 1 to ``n_runs`` inclusive.

    Returns
    -------
    list of tuple
        ``(estimator_copy, param)`` pairs, where copy ``k`` (1-based) has
        ``random_seed == k``.
    """
    obj_list = []
    for seed in range(1, n_runs + 1):
        # Copy first, then set the seed on the copy, so the caller's estimator
        # keeps its original seed.
        seeded = copy.deepcopy(fibers)
        seeded.random_seed = seed
        obj_list.append((seeded, param))
    return obj_list

In [20]:
def param_maker(dataset_name, experiment_name):
    """Bundle the job description for one dataset/experiment pair.

    Returns a dict with the keys ``'root_folder'`` (taken from the notebook-level
    ``root_folder`` variable), ``'dataset_name'`` and ``'experiment_name'``.
    """
    return dict(
        root_folder=root_folder,
        dataset_name=dataset_name,
        experiment_name=experiment_name,
    )

In [21]:
# Expand every (experiment, dataset, estimator) row into its seeded jobs and
# collect them in one flat list of (estimator, params) pairs.
job_list = []
for experiment_name, dataset_name, fibers in param_grid:
    job_list += make_obj_list(
        fibers,
        param_maker(dataset_name=dataset_name, experiment_name=experiment_name),
    )

In [22]:
# Start a SLURM-backed dask cluster using get_cluster's default queue, memory and log location.
client = get_cluster('SLURM')

Running dask-cluster
{'type': 'Scheduler', 'id': 'Scheduler-b6203665-5d51-4dff-befc-41b274e99c87', 'address': 'tcp://172.21.0.79:39415', 'services': {'dashboard': 35725}, 'started': 1713377837.0599556, 'workers': {}}


Perhaps you already have a cluster running?
Hosting the HTTP server on port 35725 instead


In [23]:
# Wrap runner_fn once as a lazy task factory, create one task per job, and run
# them all in a single dask.compute call. The list is passed as one argument,
# so the per-job tables end up in results[0].
delayed_runner = dask.delayed(runner_fn)
results = dask.compute(
    [delayed_runner(fibers_obj, params) for fibers_obj, params in job_list]
)

In [24]:
# Each per-run table has one metric per row, so place the runs side by side
# and transpose to get one row per run.
result_df = pd.concat(results[0], axis=1, ignore_index=False).transpose()

In [25]:
# Display the combined results table (one row per run).
result_df

Unnamed: 0,Features in Bin,Number of P,Number of R,Bin Size,Pred Ratio,Birth Iteration,Iterations to Ideal Solution,Log-Rank Score,Unadjusted HR,HR CI,...,Runtime,Count At/Below Threshold,Count Above Threshold,Group Ratio,Log-Rank p-value,Threshold,Accuracy,Residual,Residual p-value,Dataset Filename
0,[PC_1],1,0,1,1.0,8,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,183.084627,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,23,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,191.352746,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,1,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,175.422055,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,3,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,207.972032,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,5,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,189.046364,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,4,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,184.446407,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,4,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,187.039498,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,"[P_1, P_2, P_3, P_4, P_5]",5,0,5,1.0,13,13.0,1591.575241,4.335615,4.134548991752542-4.546459063505293,...,201.806143,4678,5322,0.4678,0.0,1,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,3,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,197.3651,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
0,[PC_1],1,0,1,1.0,2,,2826.466752,0.987031,0.9255488741490403-1.0525974568923784,...,202.322672,7407,2593,0.7407,0.0,0,,,,covariates_dataset_1.csv
