# scikit-FIBERS: Demonstration Notebook
This notebook is set up as a demonstration for running scikit-FIBERS.

## Installation:

In [3]:
#Add Installation code

## Imports:

In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from paretoset import paretoset
from sklearn.metrics import classification_report
from src.skfibers.fibers import FIBERS
from src.skfibers.experiments.datagen import create_data_simulation_bin
from src.skfibers.experiments.datagen_evolvable_threshold import create_data_simulation_bin_evolve

# Print the directory the kernel is running in; relative paths used later
# (e.g. 'sampledata.csv') are resolved against it.
current_working_directory = os.getcwd()
print(current_working_directory)

C:\Users\ryanu\Desktop\scikit-FIBERS


## Survival Data Simulation

In [None]:
# Simulate a survival dataset; every generator setting is collected in one
# place so it is easy to review and tweak.
simulation_settings = dict(
    number_of_instances=10000,
    number_of_features=100,
    number_of_features_in_bin=10,
    no_fail_proportion=0.5,
    mm_frequency_range=(0.4, 0.5),
    noise_frequency=0.0,
    class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2),
    censoring_frequency=0.5,
    random_seed=42,
    negative=False,
    threshold=0,
)
data = create_data_simulation_bin_evolve(**simulation_settings)

# Round-trip through CSV so the rest of the notebook works from the file on disk.
data.to_csv('sampledata.csv', index=False)
data = pd.read_csv('sampledata.csv')

# Set the ground-truth labels aside (as a one-column DataFrame) and remove
# them from the frame that is handed to FIBERS.
true_risk_group = data[['TrueRiskGroup']]
data = data.drop(columns='TrueRiskGroup')

## Running FIBERS (Training)

In [None]:
# Configure FIBERS. Keyword arguments are listed one per line and grouped by
# what their names suggest they control; the values are unchanged.
fibers = FIBERS(
    # Outcome / data columns
    outcome_label="Duration",
    outcome_type="survival",
    censor_label="Censoring",
    covariates=None,
    # Evolutionary search
    iterations=50,
    pop_size=50,
    crossover_prob=0.5,
    mutation_prob=0.1,
    new_gen=1.0,
    elitism=0.1,
    min_bin_size=1,
    manual_bin_init=None,
    # Fitness evaluation
    fitness_metric="log_rank",
    log_rank_weighting=None,
    group_strata_min=0.2,
    # Threshold handling
    group_thresh=None,
    min_thresh=0,
    max_thresh=3,
    int_thresh=True,
    thresh_evolve_prob=0.5,
    # Run control
    # NOTE(review): random_seed=None presumably makes each training run differ - confirm
    report=None,
    random_seed=None,
    verbose=False,
)

# Train on the simulated dataset; fit() returns the object that the rest of
# the notebook keeps using as `fibers`.
fibers = fibers.fit(data)

## Top-Bin Examination

In [None]:
bin_index = 0 # lowest index is the bin with the highest fitness (only the top-ranked bin is reported, even if other bins tie for the top fitness)
# Get the bin reporting variables (used below as the durations/censoring inputs of the two Kaplan-Meier cohorts) plus a summary dataframe
low_outcome, high_outcome, low_censor, high_censor, bin_report_df = fibers.get_bin_groups(data, bin_index)
bin_report_df

### Kaplan Meier Survival Plot (For Top Bin)

In [None]:
# A single fitter object is created and re-fit for each cohort in turn
kmf1 = KaplanMeierFitter()

# fit the model for the 1st cohort and draw its curve; the returned axes are kept for reuse
kmf1.fit(low_outcome, low_censor, label='Instance Count At/Below Threshold')
a1 = kmf1.plot_survival_function()
a1.set_ylabel('Survival Probability')

# fit the model for the 2nd cohort and draw it on the same axes so both curves can be compared
kmf1.fit(high_outcome, high_censor, label='Instance Count Above Threshold')
kmf1.plot_survival_function(ax=a1)
# the x label is set after both curves have been drawn
a1.set_xlabel('Time After Event')

plt.show()

### Check and View Top Bin Ties

In [None]:
# Check whether several bins share the best fitness and, if so, print a short
# report for each of them.
top_bin_list = fibers.get_top_bins()
count = len(top_bin_list)
if count > 1:
    print(f"{count} bins were tied for best fitness")
    # The loop variable is `top_bin`, not `bin`: at notebook top level a loop
    # variable is a kernel global, and `bin` would overwrite Python's built-in
    # bin() for every later cell.
    for top_bin in top_bin_list:
        report = top_bin.bin_short_report()
        print(report)
else:
    print("Only one top performing bin found")


## History of Bin Evolution
### Plot: Fitness of top bin each training iteration

In [None]:
# x-axis values (iteration number) and the single series that is drawn
time = fibers.top_perform_df['Iteration']
df = fibers.top_perform_df[['Fitness']]

# One curve per selected column, each paired with a manually chosen colour
plt.figure(figsize=(5, 3))
colors = ['blue']
for column, color in zip(df.columns, colors):
    plt.plot(time, df[column], label=column, color=color)

# Axis labels
plt.xlabel('Iteration')
plt.ylabel('Fitness (Top Bin)')

# Render with a background grid
plt.grid(True)
plt.show()

### Plot: Normalized Top-Bin Stats Across Training Iterations

In [None]:
# Extract columns for plotting
time = fibers.top_perform_df['Iteration']
df = fibers.top_perform_df[['Birth Iteration','Bin Size','Group Ratio','Threshold']]

# Min-max normalize every column to 0-1 so the series can share one y-axis.
# A column that never changes has max == min; dividing by that zero range
# would turn the whole column into NaN and its line would silently disappear.
# Such columns get a range of 1 instead, so they plot as a flat line at 0.
col_min = df.min()
col_range = (df.max() - col_min).replace(0, 1)
df = (df - col_min) / col_range

# Plot the data
plt.figure(figsize=(5, 3))
colors = ['red', 'blue', 'green', 'orange']   # Manually set colors
for i, column in enumerate(df.columns):
    plt.plot(time, df[column], label=column, color=colors[i])

# Add labels and legend
plt.xlabel('Iteration')
plt.ylabel('Normalized Values (0-1) ')
plt.legend()  # Show legend

# Show the plot
plt.grid(True)
plt.show()

### View: The dataframe containing all top-bin statistics/characteristics across training iterations

In [None]:
# Display the table of top-bin statistics recorded during training
fibers.top_perform_df

## Transforming Bins Into New Features (Feature Learning)

In [None]:
# Run the trained model's transform on the dataset and display the result
tdf = fibers.transform(data)
tdf

## Prediction (of Strata)
### Predict Strata (Low vs. High) Using Top Bin

In [None]:
# Predict the stratum of every instance using only the top-ranked bin (bin_number=0)
predictions = fibers.predict(data,bin_number=0)
# classification_report expects (y_true, y_pred): ground truth first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print(classification_report(true_risk_group, predictions))



### Predict Strata (Low vs. High) Using Whole Bin Population (Weighted Voting Scheme)

In [None]:
# Predict the stratum of every instance without selecting a single bin
predictions = fibers.predict(data)
# classification_report expects (y_true, y_pred): ground truth first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print(classification_report(true_risk_group, predictions))

## Pareto Front Visualization

In [None]:
def generate_pareto_plot(objective_values_array, titleNum):
    """Plot a population of two-objective points and highlight its Pareto front.

    The front is computed with ``paretoset``, treating the first objective as
    maximized and the second as minimized. Every point is drawn as a green
    marker; the Pareto-optimal points are drawn again, connected by a line in
    order of increasing first objective.

    Parameters
    ----------
    objective_values_array : indexable, iterable collection of pairs
        Each item exposes the first objective (x axis, labelled 'LogRank') at
        index 0 and the second (y axis, labelled 'Bin Simplicity') at index 1.
    titleNum : int
        Generation index; the title shows ``titleNum + 1``.
    """
    is_efficient = paretoset(objective_values_array, sense=["max", "min"])
    efficient_solutions = [objective_values_array[idx]
                           for idx, keep in enumerate(is_efficient) if keep]
    efficient_solutions.sort(key=lambda point: point[0])

    # Coordinates of the whole population and of the front
    all_x = [point[0] for point in objective_values_array]
    all_y = [point[1] for point in objective_values_array]
    front_x = [point[0] for point in efficient_solutions]
    front_y = [point[1] for point in efficient_solutions]

    plt.plot(all_x, all_y, "go", markersize=6, label='Non Pareto-optimal')
    plt.plot(front_x, front_y, "-o", markersize=6, label='Pareto-optimal')

    plt.xlabel('LogRank', fontsize=16)
    plt.ylabel('Bin Simplicity', fontsize=16)

    # Fixed x range; the y axis is inverted (larger values at the bottom) and
    # its far limit is twice the largest y value, but never less than 10.
    y_far_limit = max(max(all_y) * 2, 10)
    plt.xlim(0, 3000)
    plt.ylim(y_far_limit, 0)

    plt.title("Population at Generation " + str(titleNum + 1))

    plt.legend(loc=3, numpoints=1)
    plt.show()

In [None]:
# Fetch the bin population as a table and save it to 'FIBERS_pop.csv'
pop_df = fibers.set.get_pop()
print(isinstance(pop_df, pd.DataFrame))
pop_df.to_csv('FIBERS_pop.csv', index=False)
# Jupyter only renders a bare expression when it is the last statement of the
# cell, so the table is shown after the export rather than before it.
pop_df

In [None]:

bin_pop = fibers.set.bin_pop

# Two objectives per bin: fitness (higher is better) and bin size (smaller is
# better). Points are split into Pareto-optimal and dominated sets.
pareto_fitness = []
pareto_binsize = []
dominated_fitness = []
dominated_binsize = []

# The loop variable is `candidate`, not `bin`, so the built-in bin() is not
# overwritten in the kernel namespace.
for candidate in bin_pop:
    fitness = candidate.fitness
    binsize = candidate.bin_size

    # A bin is dominated when some other bin is at least as good on both
    # objectives and strictly better on at least one of them.
    dominated = any(
        other_bin.fitness >= fitness and other_bin.bin_size <= binsize
        and (other_bin.fitness > fitness or other_bin.bin_size < binsize)
        for other_bin in bin_pop
    )

    if dominated:
        dominated_fitness.append(fitness)
        dominated_binsize.append(binsize)
    else:
        pareto_fitness.append(fitness)
        pareto_binsize.append(binsize)

# Plot the data: one scatter call per group so each legend entry appears once
plt.figure(figsize=(8, 6))
plt.scatter(dominated_fitness, dominated_binsize, color='blue', label='Dominated')
plt.scatter(pareto_fitness, pareto_binsize, color='red', label='Pareto-optimal')

# Add labels and title
plt.xlabel('Fitness')
plt.ylabel('Binsize')
plt.title('Pareto Front')
plt.legend()

# Show the plot
plt.grid(True)
plt.show()

In [None]:
pop_df = fibers.set.report_pop()
print(pop_df)

# NOTE(review): the objective values are read from the bin objects (their
# `fitness` and `bin_size` attributes) rather than from `pop_df`. The earlier
# version looked up both 'Bin Size:' and 'Bin Size' in the table and then
# iterated the DataFrame directly, which yields column labels instead of rows.
bin_pop = fibers.set.bin_pop
object_list = [{'fitness': member.fitness, 'binsize': member.bin_size} for member in bin_pop]

max_bin_size = max((obj['binsize'] for obj in object_list), default=None)
print(max_bin_size)

# Order by fitness (to be maximized) from best to worst; among equal fitness,
# put the smallest bin size (to be minimized) first. With this ordering every
# point that could dominate a given point comes before it in the list.
sorted_objects = sorted(object_list, key=lambda x: (-x['fitness'], x['binsize']))

# Single sweep: a point is on the front only if its bin size is strictly
# smaller than every bin size seen so far (exact duplicates are kept once).
pareto_fitness = []
pareto_binsize = []
smallest_binsize_seen = None
for obj in sorted_objects:
    if smallest_binsize_seen is None or obj['binsize'] < smallest_binsize_seen:
        pareto_fitness.append(obj['fitness'])
        pareto_binsize.append(obj['binsize'])
        smallest_binsize_seen = obj['binsize']

# Plot the data: a single scatter call so the legend entry appears once
plt.figure(figsize=(8, 6))
plt.scatter(pareto_fitness, pareto_binsize, color='red', label='Pareto-optimal')

# Draw a line representing the non-dominated front
plt.plot(pareto_fitness, pareto_binsize, color='green', linestyle='-', linewidth=2, label='Non-dominated front')

# Add labels and title
plt.xlabel('Fitness (Maximize)')
plt.ylabel('Binsize (Minimize)')
plt.title('Pareto Front')
plt.legend()

# Show the plot
plt.grid(True)
plt.show()