In [1]:
import random
import numpy as np
import pandas as pd
from skfibers.fibers import FIBERS

In [2]:
import warnings
# Silence FutureWarning messages for the rest of the notebook session.
warnings.simplefilter('ignore', FutureWarning)

In [3]:
def generate_features(row, number_of_features, number_of_features_in_bin, mm_frequency):
    """
    Randomly switch on features of a single instance according to its class.

    Class 1 rows get a random subset of the predictive features (``P_*``) and a
    random subset of the random features (``R_*``), each drawn at half of
    ``mm_frequency``. Every other row only gets random features, drawn at the
    full ``mm_frequency``.

    :param row: mapping-like record (e.g. a dataframe row) holding a 'Class' entry;
                it is modified in place
    :param number_of_features: total number of features (predictive + random)
    :param number_of_features_in_bin: number of predictive features (``P_*``)
    :param mm_frequency: fraction of features to switch on (truncated to a whole count)

    :return: the same ``row`` object with the selected features set to 1
    """
    random_feature_count = number_of_features - number_of_features_in_bin

    # Each entry is (column prefix, number of candidate columns, fraction to switch on).
    # The order matters: predictive features are sampled before random ones.
    if row['Class'] == 1:
        sampling_plan = [('P_', number_of_features_in_bin, mm_frequency/2),
                         ('R_', random_feature_count, mm_frequency/2)]
    else:
        sampling_plan = [('R_', random_feature_count, mm_frequency)]

    for prefix, candidate_count, fraction in sampling_plan:
        candidates = list(range(1, candidate_count + 1))
        chosen = random.sample(candidates, int(fraction * candidate_count))
        for feature_number in chosen:
            row[prefix + str(feature_number)] = 1

    return row

In [4]:
def create_data_simulation_bin(number_of_instances, number_of_features, number_of_features_in_bin,
                               no_fail_proportion, mm_frequency_range, noise_frequency,
                               class0_time_to_event_range, class1_time_to_event_range):
    """
    Create an artificial survival dataset that contains one ideal/strong bin of predictive features.

    The returned frame has the columns ``P_1..P_k`` (predictive features of the bin),
    ``R_1..R_m`` (random features), ``Class`` (1 = fail, 0 = no fail) and ``Duration``
    (time to event). A single MM frequency is drawn uniformly from ``mm_frequency_range``
    and used for every instance of the dataset.

    :param number_of_instances: dataset size
    :param number_of_features: total number of features in dataset
    :param number_of_features_in_bin: total number of predictive features in the ideal bin
    :param no_fail_proportion: the proportion of instances to be labeled as class 0 (no fail)
    :param mm_frequency_range: (min, max) range the MM frequency is drawn from (e.g. 0.1 to 0.5)
    :param noise_frequency: value from 0 to 0.5 giving the proportion of class 0/class 1 instance
                            pairs whose outcomes (Class and Duration) are switched with each other;
                            the proportion is relative to the size of the smaller class
    :param class0_time_to_event_range: (min, max) time to event as a tuple (should be larger, e.g. 100 to 200)
    :param class1_time_to_event_range: (min, max) time to event as a tuple (should be smaller but a bit
                                        overlapping with the above range, e.g. 20 to 150)

    :return: pandas dataframe of generated data, with the rows in random order
    """

    # Creating an all-zero dataframe to use as a starting point for the eventual feature matrix.
    # Two extra columns give space for the Class and Duration columns.
    df = pd.DataFrame(np.zeros((number_of_instances, number_of_features + 2)))

    # Creating a list of predictive features in the strong bin
    predictive_features = ["P_" + str(i + 1) for i in range(number_of_features_in_bin)]

    # Creating a list of randomly created features
    random_features = ["R_" + str(i + 1) for i in range(number_of_features - number_of_features_in_bin)]

    # Adding the features and the class/endpoint
    df.columns = predictive_features + random_features + ['Class', 'Duration']

    # Assigning class according to no_fail_proportion parameter
    fail_count = int(number_of_instances * (1 - no_fail_proportion))
    no_fail_count = number_of_instances - fail_count
    class_list = [1] * fail_count + [0] * no_fail_count
    df['Class'] = class_list

    # Generating predictive and random features columns (row by row, see generate_features)
    mm_frequency = np.random.uniform(mm_frequency_range[0], mm_frequency_range[1])
    df = df.apply(generate_features,
                  args=(number_of_features, number_of_features_in_bin, mm_frequency), axis=1).astype(int)

    # Drawing uniformly distributed durations according to class.
    # Each class is shuffled and re-indexed from 0 so that position i of one class
    # can be paired with position i of the other class when noise is added below.
    df_0 = df[df['Class'] == 0].sample(frac=1).reset_index(drop=True)
    df_1 = df[df['Class'] == 1].sample(frac=1).reset_index(drop=True)
    df_0['Duration'] = np.random.uniform(class0_time_to_event_range[0],
                                         class0_time_to_event_range[1], size=len(df_0))
    df_1['Duration'] = np.random.uniform(class1_time_to_event_range[0],
                                         class1_time_to_event_range[1], size=len(df_1))

    # Adding noise: random.sample needs an integer sample size, so the (possibly
    # fractional) number of pairs is truncated to a whole count.
    smaller_class_count = min(no_fail_count, fail_count)
    swap_count = int(smaller_class_count * noise_frequency)
    idxs = random.sample(list(range(smaller_class_count)), swap_count)

    # Switching the outcome (Class and Duration) between the selected class 0/class 1
    # pairs; the feature values of each instance stay where they are.
    if idxs:
        for column in ('Class', 'Duration'):
            # Read both sides before writing so the exchange is a true swap
            from_class0 = df_0.loc[idxs, column].to_numpy()
            from_class1 = df_1.loc[idxs, column].to_numpy()
            df_0.loc[idxs, column] = from_class1
            df_1.loc[idxs, column] = from_class0

    # Combining both classes again and shuffling the instances
    df = pd.concat([df_0, df_1]).sample(frac=1).reset_index(drop=True)

    return df


In [5]:
# Simulated dataset: 10000 instances, 50 features, 10 of them in the predictive bin, no noise
data = create_data_simulation_bin(
    number_of_instances=10000,
    number_of_features=50,
    number_of_features_in_bin=10,
    no_fail_proportion=0.5,
    mm_frequency_range=(0.4, 0.5),
    noise_frequency=0,
    class0_time_to_event_range=(1.00, 2.00),
    class1_time_to_event_range=(0.2, 1.50),
)

In [6]:
# Preview the first five simulated instances
data.head(n=5)

Unnamed: 0,P_1,P_2,P_3,P_4,P_5,P_6,P_7,P_8,P_9,P_10,...,R_33,R_34,R_35,R_36,R_37,R_38,R_39,R_40,Class,Duration
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,1,0,0,1.431392
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,1,0.660579
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,1,0,1.956405
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1.941802
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.361986


In [7]:
# Persist the simulated dataset (the row index is written as the first, unnamed column)
data.to_csv(path_or_buf='data.csv')

In [8]:
def experiment_1():
    """
    Run FIBERS once on a freshly simulated dataset and return the binning results.

    The dataset has 1000 instances and 50 features, 10 of which form the
    predictive bin; no noise is added.

    :return: tuple of (bin feature matrix, bins, bin scores, MAF 0 features)
             as returned by ``FIBERS.transform`` after the fitted estimator
    """
    # The return statement below leaves the function at the end of the first
    # replicate, so this loop body is executed exactly once.
    for _ in range(1):
        print('Experiment 1')

        # Creating the simulated dataset
        data = create_data_simulation_bin(
            number_of_instances=1000,
            number_of_features=50,
            number_of_features_in_bin=10,
            no_fail_proportion=0.5,
            mm_frequency_range=(0.1, 0.5),
            noise_frequency=0,
            class0_time_to_event_range=(100, 200),
            class1_time_to_event_range=(20, 150),
        )

        model = FIBERS(
            given_starting_point=False,
            amino_acid_start_point=None,
            algorithm="FIBERS",
            amino_acid_bins_start_point=None,
            iterations=500,
            label_name="Class",
            duration_name="Duration",
            rare_variant_maf_cutoff=0.05,
            set_number_of_bins=50,
            min_features_per_group=5,
            max_number_of_groups_with_feature=25,
            scoring_method='Relief',
            score_based_on_sample=True,
            score_with_common_variables=False,
            instance_sample_size=50,
            crossover_probability=0.8,
            mutation_probability=0.1,
            elitism_parameter=0.4,
            random_seed=None,
            bin_size_variability_constraint=None,
        )

        model.fit(data)

        # transform hands back the estimator first, followed by the four results
        _fitted, bin_matrix, bins, bin_scores, maf_0 = model.transform(data)
        return bin_matrix, bins, bin_scores, maf_0

In [None]:
# Run the experiment and keep its four results for the summary below
(bin_feature_matrix_internal,
 amino_acid_bins_internal,
 amino_acid_bin_scores_internal,
 maf_0_features) = experiment_1()

Experiment 1
FIBERS


 33%|█████████████▋                           | 167/500 [08:17<17:43,  3.19s/it]

In [None]:
from skfibers.methods.fibers_methods import top_bin_summary_fibers

In [None]:
# NOTE(review): an empty DataFrame is passed as the first argument — confirm that
# top_bin_summary_fibers does not need the original feature matrix here.
top_bin_summary_fibers(
    pd.DataFrame(),
    "Class",
    "Duration",
    bin_feature_matrix_internal,
    amino_acid_bins_internal,
    amino_acid_bin_scores_internal,
)