In [3]:
import pandas as pd
import numpy as np
from random import shuffle
from copy import deepcopy
from pathlib import Path

In [4]:
# Parameters
# Input CSVs are read from data_dir; fold-annotated CSVs are written to save_dir.
data_dir = '../Data-Populated-Unfiltered/Data/'
save_dir = '../Data-50/Folds/'
clss = ['MAPK1', 'MTOR']   # class names; each has a label column named 'y.<class>'
num_folds = 50             # how many fold-assignment columns (F1..F<num_folds>) to create
k_folds = 5                # number of partitions the sites are split into per column

# Derived column names: one label column per class, one column per fold assignment.
class_columns = [f'y.{name}' for name in clss]
fold_columns = [f'F{number}' for number in range(1, num_folds + 1)]

# Make sure the output directory exists before any cell tries to write into it.
Path(save_dir).mkdir(parents=True, exist_ok=True)

In [5]:
# Names of the datasets to process; each is loaded from '<data_dir><name>.csv'.
ds = [
    'C2C12', 'ESC', 'MLC', 'NBC',
    'L1-I', 'L1-F', 'L1-R', 'L6',
]


In [6]:
def partition(lst, n):
    """Split a sequence into ``n`` interleaved partitions.

    Element ``k`` of ``lst`` ends up in partition ``k % n``, so partition
    sizes differ by at most one and the earlier partitions receive the extras.

    lst - sequence of elements (anything that supports extended slicing)
    n   - number of partitions
    returns a list containing n slices of ``lst`` (the partitions)
    """
    parts = []
    for offset in range(n):
        # Every n-th element starting at `offset`.
        parts.append(lst[offset::n])
    return parts

In [7]:
# Build stratified fold assignments for every dataset / class pair and save them.
#
# For each dataset and each class, every column in `fold_columns` receives an
# independent random assignment of sites to folds such that each fold contains
# at least one positive and one negative site of every dataset that has
# positives for the class. The annotated frame is written to
# '<save_dir><dataset>_<class>_FLD.csv'.
#
# NOTE: neither `random` nor `numpy.random` is seeded in the visible cells, so
# the generated folds differ from run to run.
for d in ds:
    # One dataset per iteration. `datasets` stays a list because the logic
    # below is written for several datasets that may share sites.
    datasets = [d]
    data_fp = ['{}{}.csv'.format(data_dir, dat) for dat in datasets]
    data = [pd.read_csv(fp) for fp in data_fp]

    for i in range(len(clss)):
        # Private copy per class so fold columns never leak between classes.
        tmp_data = deepcopy(data)
        for df in tmp_data:
            for col in fold_columns:
                df[col] = 0

        # Retrieve positive and negative sites for the current class
        label_col = class_columns[i]
        tmp_dataset = []   # names of the datasets that have >= 1 positive
        tmp_d = []         # the matching frames (these are the ones saved)

        # Get all sites, and positive sites (overall and per dataset)
        positives, sites = set(), set()
        pos_sites = []
        for dataset, df in zip(datasets, tmp_data):
            df_positives = set(df.loc[df[label_col] == 1, 'site'].to_list())
            positives.update(df_positives)
            if len(df_positives) > 0:
                tmp_dataset.append(dataset)
                tmp_d.append(df)
                pos_sites.append(df_positives)
            sites.update(df['site'].to_list())

        others = list(sites.difference(positives))
        sites = list(sites)
        positives = list(positives)

        print('Number Unique Positives for Class {}: {}'.format(clss[i],len(positives)))
        print([len(pos_site) for pos_site in pos_sites])

        if not tmp_d:
            # Nothing to stratify: without this guard np.argmin([]) below
            # raises ValueError and aborts the whole cell.
            print('No positives for class {} in {}; skipping'.format(clss[i], datasets))
            continue

        # Dataset with the fewest positives; its size caps how many folds can
        # each hold a positive from every dataset.
        min_dataset = np.argmin([len(x) for x in pos_sites])
        max_len = min([len(x) for x in pos_sites])

        for col in fold_columns:
            # Generate folds where each fold has a positive for every dataset
            make_folds = True

            folds = []       # list of folds, each a list of positive sites
            fold_freq = []   # per fold: positives per dataset (diagnostics)
            remaining_pos = deepcopy(positives)
            shuffle(remaining_pos)
            while make_folds:
                if len(fold_freq) == k_folds:
                    break
                current_fold = []
                pos_counter = [0 for _ in tmp_d]
                restart = False
                for j, df in enumerate(tmp_d):
                    if pos_counter[j] != 0:
                        # Dataset j is already covered by a shared site.
                        continue
                    relevant_pos = list(pos_sites[j].intersection(remaining_pos))
                    if len(relevant_pos) == 0:
                        if len(folds) < max_len:
                            # Dataset j ran dry too early: start over from scratch.
                            folds = []
                            fold_freq = []
                            remaining_pos = deepcopy(positives)
                            restart = True
                        else:
                            make_folds = False
                            # Hand back sites already drawn for this unfinished
                            # fold so they are still distributed below instead
                            # of being lost.
                            remaining_pos.extend(current_fold)
                        break
                    if j != min_dataset:
                        # Prefer sites the scarcest dataset does not need.
                        tmp = list(set(relevant_pos).difference(pos_sites[min_dataset]))
                        if len(tmp) != 0:
                            relevant_pos = tmp
                    pos_choice = np.random.choice(relevant_pos)
                    remaining_pos.remove(pos_choice)
                    current_fold.append(pos_choice)
                    pos_counter[j] += 1
                    # A shared site also covers later datasets that contain it.
                    for q in range(j+1, len(tmp_d)):
                        if pos_choice in pos_sites[q]:
                            pos_counter[q] += 1
                if restart or not make_folds:
                    # Never record the partial fold: after a restart its sites
                    # are back in remaining_pos and would otherwise end up in
                    # two folds.
                    continue
                pos_counter = [len(pos_site.intersection(current_fold)) for pos_site in pos_sites]
                fold_freq.append(pos_counter)
                folds.append(current_fold)
            print(fold_freq)

            ## Distribute remaining
            # Partition into len(folds) parts (not k_folds): when fewer than
            # k_folds folds were built, zip() would silently drop the extras.
            distribute_pos = partition(remaining_pos, len(folds))
            folds = [ x+y for x,y in zip(distribute_pos, folds) ]

            # Construct sites
            remaing_sites = list(set(others))
            num_pos, num_neg = [], []
            # Partition negatives. Reshuffle on every attempt: partition() is
            # deterministic, so retrying with the same order would repeat the
            # same failing split forever.
            max_partition_retries = 100
            for _attempt in range(max_partition_retries):
                shuffle(remaing_sites)
                rerun_negative = False
                num_pos, num_neg = [], []
                negatives_partitions = partition(remaing_sites, len(folds))
                folds_sites = [folds[j] + negatives_partitions[j] for j in range(len(folds))]
                for df in tmp_d:
                    print('Dataset')
                    if rerun_negative:
                        break
                    for k in range(len(folds_sites)): # Set fold number
                        in_fold = df['site'].isin(folds_sites[k])
                        df.loc[in_fold, col] = k
                        fold_labels = df.loc[in_fold, label_col]
                        n_p = int((fold_labels == 1).sum())
                        n_n = int((fold_labels == 0).sum())
                        num_pos.append(n_p)
                        num_neg.append(n_n)
                        if n_p == 0 or n_n == 0:
                            # A fold without both label values is unusable.
                            rerun_negative = True
                            break
                    print(np.array((num_pos, num_neg)))
                if not rerun_negative:
                    break
            else:
                raise RuntimeError(
                    'Could not give every fold a positive and a negative for class {} '
                    'in {} after {} attempts'.format(clss[i], datasets, max_partition_retries))

        print('Savining for {}'.format(clss[i]))
        # Save dataset
        for m in range(len(tmp_d)):
            f_name = '{}{}_{}_FLD.csv'.format(save_dir,tmp_dataset[m],clss[i])
            tmp_d[m].to_csv(f_name, index=False)
        print('Number of Folds for Class {}: {}'.format(clss[i], len(folds)))

Number Unique Positives for Class MAPK1: 36
[36]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2069 2065 2070 2068 2063]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2069 2068 2065 2067 2066]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2067 2066 2070 2065 2067]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2069 2066 2068 2065 2067]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2065 2068 2067 2066 2069]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2066 2069 2068 2068 2064]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2069 2068 2065 2068 2065]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2066 2066 2071 2064 2068]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2064 2069 2067 2070 2065]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8    7    7    7    7]
 [2063 2068 2067 2068 2069]]
[[1], [1], [1], [1], [1]]
Dataset
[[   8 

In [None]:
## Individual Testing
# Simple (non-stratified-per-dataset) fold assignment: positives and negatives
# are pooled over all frames, shuffled, and dealt round-robin into k_folds folds.
#
# NOTE(review): `data` and `datasets` are not assigned in this cell. In the
# visible notebook they are only set inside the per-dataset loop of an earlier
# cell, so here they would hold whatever that loop processed last - confirm
# this is intended.
# NOTE(review): the output path uses the same '<save_dir><dataset>_<class>_FLD.csv'
# pattern as that earlier cell, so running this cell appears to overwrite those
# files - confirm.
for i, class_name in enumerate(clss):
    # Fresh copy of every frame, with each fold column reset to the -1 sentinel
    tmp_data = deepcopy(data)
    for frame in tmp_data:
        for col in fold_columns:
            frame[col] = -1

    # Label column of the current class
    label_col = class_columns[i]

    # Pool positive and negative sites across all frames
    positives, negatives = set(), set()
    for frame in tmp_data:
        site_values = frame['site']
        label_values = frame[label_col]
        positives.update(site_values[label_values == 1].to_list())
        negatives.update(site_values[label_values == 0].to_list())
    if positives & negatives:
        # A site must not carry conflicting labels in different frames.
        raise Exception("There are sites which are positive and negative depending on the dataset")

    negatives = list(negatives)
    positives = list(positives)

    # One independent random assignment per fold column
    for col in fold_columns:
        shuffle(positives)
        shuffle(negatives)

        pos_partitions = partition(positives, k_folds)
        neg_partitions = partition(negatives, k_folds)
        folds_sites = [pos_part + neg_part
                       for pos_part, neg_part in zip(pos_partitions, neg_partitions)]
        print([len(x) for x in folds_sites])

        # Write the fold index into every frame for the sites of that fold
        for k, fold_members in enumerate(folds_sites):
            for frame in tmp_data:
                frame.loc[frame['site'].isin(fold_members), col] = k

    # Save one CSV per frame
    for m, frame in enumerate(tmp_data):
        f_name = '{}{}_{}_FLD.csv'.format(save_dir,datasets[m],class_name)
        frame.to_csv(f_name, index=False)