# Old scikit-FIBERS Runs

## Installation and Imports 
How to setup and use scikit-FIBERS

In [1]:
import os
import copy
import dask
import pickle
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test
from oldsrc.skfibers.fibers import FIBERS
from dask.distributed import Client
from dask_jobqueue import SLURMCluster, LSFCluster, SGECluster

## Loading dataset

### Setting Variables for General Experiment

In [2]:
# dataset_name_list = os.listdir('PPSNDatasets')
dataset_name_list = [   'standard_with_noise.csv',
                        'standard_no_noise.csv',
                        'threshold_0_no_noise.csv',
                        'threshold_0_with_noise.csv',
                        'threshold_1_no_noise.csv',
                        'threshold_1_with_noise.csv',
                        'threshold_2_no_noise.csv',
                        'threshold_2_with_noise.csv',
                        'threshold_4_no_noise.csv',
                        'threshold_4_with_noise.csv',]
experiment_list = ['Goal1', 'Goal2', 'Goal3', 'Goal4', 'Testing']

In [3]:
root_folder = 'PPSNResults/FIBERS1/'
dataset_name = dataset_name_list[0]
experiment_name = experiment_list[4]

In [4]:
# for experiment_name in experiment_list:
#     for dataset_name in dataset_name_list:
#         try:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             os.makedirs(folder)
#         except FileExistsError:
#             folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
#             print("Folder Already Exists:" + folder)

In [5]:
def read_dataset(dataset_name):
    data = pd.read_csv('PPSNDatasets/'+ dataset_name)
    true_risk_group = data[['TrueRiskGroup']]
    data = data.drop('TrueRiskGroup', axis=1)
    return data, true_risk_group

In [6]:
data, true_risk_group = read_dataset(dataset_name)

## Code For Single FIBERS Run

In [7]:
# Pop Size = 50, Iterations = 100, crossover_prob = 0.5, min mutation = 0.1, 
# elitism = 0.1, min_bin_size = 1, max initial bin size = 10, group_strata_min = 0.2.

In [8]:
# min mutation = 0.1 and max mutation prob =0.1 (effectively turning off oscillating mutation rate),
#  and merge prob = 0, group_thresh = 0, fitness metric = log_rank, and diversity_pressure = 0

In [9]:
# fibers = FIBERS(label_name="Censoring", duration_name="Duration",
#                 given_starting_point=False, start_point_feature_list=None, 
#                 feature_bins_start_point=None, iterations=100,
#                 set_number_of_bins=50, min_features_per_group=1,
#                 max_number_of_groups_with_feature=10,
#                 informative_cutoff=0.2, crossover_probability=0.5,
#                 mutation_probability=0.1, elitism_parameter=0.1,
#                 random_seed=42, threshold = 0, evolving_probability = 0.0,
#                 min_threshold = 0, max_threshold = 4,
#                 merge_probability = 0.0, adaptable_threshold=False, covariates=None,
#                 scoring_method="log_rank")
# fibers = fibers.fit(data) 

In [10]:
# try:
#     os.makedirs(root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/models/')
# except Exception:
#     pass

In [11]:
# with open(root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/models/test.pkl', 'wb') as file:
#     pickle.dump(fibers, file)

In [12]:
# with open(root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/models/test.pkl', 'rb+') as file:
#     model = pickle.load(file)
# model

In [13]:
def get_experiment_output(fibers, X, y=None, dataset=None, filename=None):
        columns = ["Features in Bin", "Number of P", "Number of R", 
                   "Bin Size", "Pred Ratio",
                   "Iterations to Ideal Solution", 
                   "Log-Rank Score",
                   "Unadjusted HR", "HR CI", "HR P-value", "Runtime",
                   "Count At/Below Threshold",
                   "Count Above Threshold", "Group Ratio", 
                   "Log-Rank p-value", "Threshold", "Accuracy",
                   "Residual",
                   "Residual p-value", 
                   "Dataset Filename"]
  
        if not fibers.hasTrained:
            raise Exception("Model must be trained first")
        
        durations_no, durations_mm, event_observed_no, event_observed_mm, top_bin = fibers.get_duration_event(0)
        results = logrank_test(durations_no, durations_mm, event_observed_A=event_observed_no,
                               event_observed_B=event_observed_mm)
        
        sorted_bin_scores = dict(sorted(fibers.bin_scores.items(), key=lambda item: item[1], reverse=True))
        sorted_bin_list = list(sorted_bin_scores.keys())
        Bin = fibers.bins[sorted_bin_list[0]]
        d_data = X.copy()
        d_data['Bin'] = d_data[Bin].sum(axis=1)
        column_values = d_data['Bin'].to_list()
        for r in range(0, len(column_values)):
            if column_values[r] > 0:
                column_values[r] = 1
        d_data['Bin'] = column_values
        coxmodeldata =  d_data[["Bin", fibers.duration_name, fibers.label_name]]
        cph = CoxPHFitter()
        cph.fit(coxmodeldata, fibers.duration_name, event_col=fibers.label_name, show_progress=True)
        summary = cph.summary
        HR = summary['exp(coef)'].iloc[0]
        HR_CI = str(summary['exp(coef) lower 95%'].iloc[0])+'-'+str(summary['exp(coef) upper 95%'].iloc[0])
        HR_p_value = summary['p'].iloc[0]
        pdf = pd.DataFrame([[fibers.bins[top_bin], str(fibers.bins[top_bin]).count('P'), str(fibers.bins[top_bin]).count('R'), 
                             len(fibers.bins[top_bin]), str(fibers.bins[top_bin]).count('P')/len(fibers.bins[top_bin]), 
                             None if str(fibers.bins[top_bin]).count('P') != 10 else 101,
                             fibers.bin_scores[top_bin], 
                             HR, HR_CI, HR_p_value, fibers.elapsed_time,
                             len(durations_no),
                             len(durations_mm), len(durations_no)/len(durations_mm), 
                             results.p_value, fibers.bins[top_bin].get_threshold(), 
                             fibers.score(X, y) if y is not None else None, 
                             None, None, dataset,]],
                           columns=columns).T  # SPHIA
        
        if filename:
            pdf.to_csv(filename)
            
        return pdf

In [14]:
# get_experiment_output(fibers, data, true_risk_group, dataset_name, root_folder + '/' + experiment_name
#                                                + '/' + dataset_name.split('.')[0] + '/experiment_table.csv')

### Accessing results and internal functions

In [15]:
# experiment_results = get_experiment_output(fibers, data, true_risk_group)
# experiment_results

## Code for Experiment Run

In [16]:
def get_cluster(cluster_type='SLURM', output_path=".", queue='defq', memory=4):
    client = None
    try:
        if cluster_type == 'SLURM':
            cluster = SLURMCluster(queue=queue,
                                   cores=1,
                                   memory=str(memory) + "G",
                                   walltime="24:00:00",
                                   log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == "LSF":
            cluster = LSFCluster(queue=queue,
                                 cores=1,
                                 mem=memory * 1000000000,
                                 memory=str(memory) + "G",
                                 walltime="24:00",
                                 log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == 'UGE':
            cluster = SGECluster(queue=queue,
                                 cores=1,
                                 memory=str(memory) + "G",
                                 resource_spec="mem_free=" + str(memory) + "G",
                                 walltime="24:00:00",
                                 log_directory=output_path + "/dask_logs/")
            cluster.adapt(maximum_jobs=400)
        elif cluster_type == 'Local':
            c = Client()
            cluster = c.cluster
        else:
            raise Exception("Unknown or Unsupported Cluster Type")
        client = Client(cluster)
    except Exception as e:
        print(e)
        raise Exception("Exception: Unknown Exception")
    print("Running dask-cluster")
    print(client.scheduler_info())
    return client

In [17]:
def runner_fn(fibers, params):
    data, true_risk_group = read_dataset(params['dataset_name'])
    params['data'] = data
    params['label'] = true_risk_group
    fibers = fibers.fit(params['data']) 
    experiment_results = get_experiment_output(fibers, params['data'], params['label'], params['dataset_name'],
                                               params['root_folder'] + '/' + params['experiment_name'] 
                                               + '/' + params['dataset_name'].split('.')[0] 
                                               + '/models/model_table_' + str(fibers.random_seed) + '.csv')
    with open(params['root_folder'] + '/' + params['experiment_name'] 
              + '/' + params['dataset_name'].split('.')[0] + '/models/' + str(fibers.random_seed), 'wb') as file:
        pickle.dump(fibers, file)
    print(params)
    return experiment_results

In [18]:
def make_obj_list(fibers, param):  
    obj_list = list()
    for i in range(20):
        fibers.random_seed = i+1
        obj_list.append((copy.deepcopy(fibers), param))
    return obj_list

In [19]:
def param_maker(dataset_name, experiment_name):
    param_dict = {
        'root_folder': root_folder,
        'dataset_name': dataset_name,
        'experiment_name': experiment_name,
    }
    return param_dict

In [20]:
fibers_list = [
            FIBERS(label_name="Censoring", duration_name="Duration",
                given_starting_point=False, start_point_feature_list=None, 
                feature_bins_start_point=None, iterations=100,
                set_number_of_bins=50, min_features_per_group=1,
                max_number_of_groups_with_feature=10,
                informative_cutoff=0.2, crossover_probability=0.5,
                mutation_probability=0.1, elitism_parameter=0.1,
                random_seed=42, threshold = 0, evolving_probability = 0.0,
                min_threshold = 0, max_threshold = 4,
                merge_probability = 0.0, adaptable_threshold=False, covariates=None,
                scoring_method="log_rank"),
            FIBERS(label_name="Censoring", duration_name="Duration",
                given_starting_point=False, start_point_feature_list=None, 
                feature_bins_start_point=None, iterations=100,
                set_number_of_bins=50, min_features_per_group=1,
                max_number_of_groups_with_feature=10,
                informative_cutoff=0.2, crossover_probability=0.5,
                mutation_probability=0.1, elitism_parameter=0.1,
                random_seed=42, threshold = 0, evolving_probability = 0.0,
                min_threshold = 0, max_threshold = 5,
                merge_probability = 0.1, adaptable_threshold=True, covariates=None,
                scoring_method="log_rank")               
                ]

In [21]:
param_grid = [
        ("Goal1", dataset_name_list[0], fibers_list[0]),
        ("Goal1", dataset_name_list[1], fibers_list[0]), 
        # ("Goal4", dataset_name_list[2], fibers_list[1]), 
        # ("Goal4", dataset_name_list[3], fibers_list[1]), 
        # ("Goal4", dataset_name_list[4], fibers_list[1]), 
        # ("Goal4", dataset_name_list[5], fibers_list[1]), 
        # ("Goal4", dataset_name_list[6], fibers_list[1]), 
        # ("Goal4", dataset_name_list[7], fibers_list[1]), 
        # ("Goal4", dataset_name_list[8], fibers_list[1]), 
        # ("Goal4", dataset_name_list[9], fibers_list[1]), 
    ]

In [22]:
DEBUG = True
if DEBUG:
    import shutil
    try:
        shutil.rmtree(root_folder)
    except:
        pass

In [23]:
for experiment_name, dataset_name, _ in param_grid:
    try:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        os.makedirs(folder)
        os.makedirs(folder + '/models/')
    except FileExistsError:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        print("Folder Already Exists:" + folder)
        continue

In [24]:
job_list = list()
for experiment_name, dataset_name, fibers in param_grid: 
    job_list.extend(make_obj_list(fibers, param_maker(dataset_name, experiment_name)))

In [25]:
client = get_cluster('SLURM')

Running dask-cluster
{'type': 'Scheduler', 'id': 'Scheduler-17cae008-4fb5-44ed-a7f0-ac10562a6855', 'address': 'tcp://10.17.134.112:41157', 'services': {'dashboard': 8787}, 'started': 1711376975.1509361, 'workers': {}}


In [26]:
results = dask.compute([dask.delayed(runner_fn)(fibers_obj, params
                                            ) for fibers_obj, params in job_list])

In [27]:
result_df = pd.concat(results[0], axis=1, ignore_index=False).T
result_df


Unnamed: 0,Features in Bin,Number of P,Number of R,Bin Size,Pred Ratio,Iterations to Ideal Solution,Log-Rank Score,Unadjusted HR,HR CI,HR P-value,Runtime,Count At/Below Threshold,Count Above Threshold,Group Ratio,Log-Rank p-value,Threshold,Accuracy,Residual,Residual p-value,Dataset Filename
0,"['P_2', 'P_7', 'P_8', 'P_1', 'P_5']",5,0,5,1.0,,1497.743171,2.373491,2.2691460722418944-2.482634934538887,0.0,134.899833,5181,4819,1.075119,0.0,0,0.9819,,,standard_with_noise.csv
0,"['P_5', 'P_7', 'P_3', 'P_8']",4,0,4,1.0,,1454.363087,2.344467,2.2414227846454917-2.4522483902962815,0.0,142.460138,5286,4714,1.121341,0.0,0,0.9714,,,standard_with_noise.csv
0,"['P_4', 'P_1', 'P_3', 'P_5', 'P_8']",5,0,5,1.0,,1503.691979,2.380254,2.2754686636827497-2.4898644374964083,0.0,118.596755,5251,4749,1.105706,0.0,0,0.9749,,,standard_with_noise.csv
0,"['P_5', 'P_10', 'P_9', 'P_6']",4,0,4,1.0,,1449.285537,2.344349,2.2411356391750252-2.452315923891702,0.0,135.051229,5432,4568,1.189142,0.0,0,0.9568,,,standard_with_noise.csv
0,"['P_8', 'P_5', 'P_1', 'P_4']",4,0,4,1.0,,1476.114272,2.363178,2.2590853159248008-2.472066356025928,0.0,133.966506,5375,4625,1.162162,0.0,0,0.9625,,,standard_with_noise.csv
0,"['P_1', 'P_9', 'P_5', 'P_10']",4,0,4,1.0,,1458.436722,2.350732,2.2472208855849276-2.459011334645617,0.0,136.180808,5419,4581,1.182929,0.0,0,0.9581,,,standard_with_noise.csv
0,"['P_6', 'P_2', 'P_3', 'P_7', 'P_5']",5,0,5,1.0,,1466.64619,2.351365,2.2480786618686497-2.4593961435936924,0.0,135.907568,5184,4816,1.076412,0.0,0,0.9816,,,standard_with_noise.csv
0,"['P_1', 'P_5', 'P_3', 'P_10', 'P_7']",5,0,5,1.0,,1496.817216,2.372872,2.268553472261355-2.4819873552761123,0.0,120.684126,5185,4815,1.076843,0.0,0,0.9815,,,standard_with_noise.csv
0,"['P_7', 'P_9', 'P_2', 'P_5', 'P_10']",5,0,5,1.0,,1470.25869,2.354798,2.251311914084308-2.4630420108338913,0.0,142.171253,5172,4828,1.071251,0.0,0,0.9828,,,standard_with_noise.csv
0,"['P_10', 'P_1', 'P_7', 'P_5', 'P_3']",5,0,5,1.0,,1496.817216,2.372872,2.268553472261355-2.4819873552761123,0.0,127.755837,5185,4815,1.076843,0.0,0,0.9815,,,standard_with_noise.csv


In [28]:
outname = 'experiment_table.csv'

outdir = root_folder + 'Goal1'
if not os.path.exists(outdir):
    os.makedirs(outdir)

fullname = os.path.join(outdir, outname)    

result_df.to_csv(fullname)

In [29]:
param_grid = [
        # ("Goal1", dataset_name_list[0], fibers_list[0]),
        # ("Goal1", dataset_name_list[1], fibers_list[0]), 
        ("Goal4", dataset_name_list[2], fibers_list[1]), 
        ("Goal4", dataset_name_list[3], fibers_list[1]), 
        ("Goal4", dataset_name_list[4], fibers_list[1]), 
        ("Goal4", dataset_name_list[5], fibers_list[1]), 
        ("Goal4", dataset_name_list[6], fibers_list[1]), 
        ("Goal4", dataset_name_list[7], fibers_list[1]), 
        ("Goal4", dataset_name_list[8], fibers_list[1]), 
        ("Goal4", dataset_name_list[9], fibers_list[1]), 
    ]

In [30]:
for experiment_name, dataset_name, _ in param_grid:
    try:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        os.makedirs(folder)
        os.makedirs(folder + '/models/')
    except FileExistsError:
        folder = root_folder + '/' + experiment_name + '/' + dataset_name.split('.')[0] + '/'
        print("Folder Already Exists:" + folder)
        continue

In [31]:
job_list = list()
for experiment_name, dataset_name, fibers in param_grid: 
    job_list.extend(make_obj_list(fibers, param_maker(dataset_name, experiment_name)))

In [32]:
results = dask.compute([dask.delayed(runner_fn)(fibers_obj, params
                                            ) for fibers_obj, params in job_list])

In [33]:
result_df = pd.concat(results[0], axis=1, ignore_index=False).T
result_df

Unnamed: 0,Features in Bin,Number of P,Number of R,Bin Size,Pred Ratio,Iterations to Ideal Solution,Log-Rank Score,Unadjusted HR,HR CI,HR P-value,Runtime,Count At/Below Threshold,Count Above Threshold,Group Ratio,Log-Rank p-value,Threshold,Accuracy,Residual,Residual p-value,Dataset Filename
0,"['P_6', 'P_7', 'P_2', 'P_8', 'P_9']",5,0,5,1.0,,7218.927444,10.187378,9.600043117993977-10.810646474800656,0.0,328.050019,5176,4824,1.072968,0.0,0,0.9824,,,threshold_0_no_noise.csv
0,"['P_7', 'P_9', 'P_10', 'P_6', 'P_2']",5,0,5,1.0,,7217.455302,10.150686,9.566276828636989-10.77079791576569,0.0,337.08594,5173,4827,1.07168,0.0,0,0.9827,,,threshold_0_no_noise.csv
0,"['P_7', 'P_8', 'P_9', 'P_5', 'P_6', 'P_2']",6,0,6,1.0,,7337.156576,10.48324,9.876177139066503-11.127618137736878,0.0,397.843624,5112,4888,1.045827,0.0,0,0.9888,,,threshold_0_no_noise.csv
0,"['P_7', 'P_1', 'P_6', 'P_3']",4,0,4,1.0,,7041.855619,9.754927,9.196326315278446-10.347458029867786,0.0,301.728852,5279,4721,1.118195,0.0,0,0.9721,,,threshold_0_no_noise.csv
0,"['P_10', 'P_7', 'P_6', 'P_8', 'P_9']",5,0,5,1.0,,7245.206064,10.183895,9.597827859915734-10.805748590429634,0.0,324.208014,5183,4817,1.075981,0.0,0,0.9817,,,threshold_0_no_noise.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,"['P_4', 'P_6', 'P_3', 'R_20', 'P_1', 'P_5']",5,1,6,0.833333,,926.947398,1.552061,1.4664090189677328-1.6427165050512005,0.0,304.426712,5318,4682,1.135839,0.0,2,0.889,,,threshold_4_with_noise.csv
0,"['P_6', 'P_9', 'P_3', 'P_1', 'P_8', 'R_57']",5,1,6,0.833333,,849.61428,1.575104,1.4859323440025505-1.6696261497996745,0.0,370.038867,5437,4563,1.191541,0.0,2,0.8703,,,threshold_4_with_noise.csv
0,"['P_6', 'P_3', 'P_4', 'P_9', 'P_7', 'P_8']",6,0,6,1.0,,968.258642,1.569288,1.4816067533723463-1.6621585523591043,0.0,341.892459,4763,5237,0.90949,0.0,2,0.9121,,,threshold_4_with_noise.csv
0,"['P_3', 'P_9', 'P_6', 'P_4', 'P_7']",5,0,5,1.0,,932.332431,1.597379,1.5119963925491908-1.687583786597171,0.0,306.582033,5519,4481,1.231645,0.0,2,0.8895,,,threshold_4_with_noise.csv


In [34]:
outname = 'experiment_table.csv'

outdir = root_folder + 'Goal4'
if not os.path.exists(outdir):
    os.makedirs(outdir)

fullname = os.path.join(outdir, outname)    

result_df.to_csv(fullname)