In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

In [2]:
# Directory the notebook reads its CSV files from (relative to the notebook's working directory).
path = '../input/glioblastoma-radiosensitivity-dataset/glioblastoma_data/extracted_data/cell_behaviors/videos_transcriptions_csv'
print(path)

../input/glioblastoma-radiosensitivity-dataset/glioblastoma_data/extracted_data/cell_behaviors/videos_transcriptions_csv


In [3]:
def import_files(path):
    """
    Index every ".csv" file found under `path` (searched recursively with os.walk).

    File names are expected to look like "<dose>gy_<group>_<rest>.csv",
    for example "0gy_n1_201104.nd2u2510Gy1A.csv".

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file, with the columns
        "radiation_level" (int, the number in front of "gy"),
        "path" (location of the file, file name included),
        "code_file" (file name without its ".csv" suffix) and
        "group" (the text between the first and the second "_").
        The frame is empty, with the same columns, when no CSV file is found.

    Raises
    ------
    ValueError
        If a CSV file name does not consist of exactly three "_"-separated
        parts, or if the text in front of "gy" is not an integer.
    """
    columns = ["radiation_level", "path", "code_file", "group"]

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            if not name.endswith(".csv"):
                continue
            radiation = name.split("gy")
            _, group, _ = name.split("_")
            records.append({
                "radiation_level": int(radiation[0]),
                # os.walk also descends into sub-directories, so the file lives
                # in `root`, which is not necessarily `path` itself.
                "path": os.path.join(root, name),
                "code_file": name[:-4],  # removing suffix
                "group": group,
            })

    return pd.DataFrame(records, columns=columns)

In [4]:
def str_to_numpy(input_str):
    """
    Parse one entry of the "input" / "output" columns of the dataset csv files.

    The entry is the textual form of a list of cell names; square brackets,
    single quotes and spaces are dropped and the remainder is split on commas.
    Returns a numpy array of strings holding the cell names (an empty string
    array when nothing is left after the clean-up).
    """
    cleaned = input_str
    for unwanted in ("[", "'", " ", "]"):
        cleaned = cleaned.replace(unwanted, "")

    cell_names = cleaned.split(",") if cleaned != "" else []
    return np.asarray(cell_names, dtype=str)

In [5]:
# Load one CSV file as a worked example. The converters run str_to_numpy on the
# "input" and "output" columns, turning their string form into numpy arrays of cell names.
file = path + "/0gy_n1_201104.nd2u2510Gy1A.csv"
data = pd.read_csv(file, converters = {"input" : str_to_numpy, "output" : str_to_numpy})
data.head()


Unnamed: 0,time,event,input,output
0,0,begin,[],[C1]
1,66,div,[C1],"[C1.1, C1.2]"
2,238,div,[C1.1],"[C1.1.1, C1.1.2]"
3,386,div,[C1.1.1],"[C1.1.1.1, C1.1.1.2]"
4,432,end,[C1.1.1.1],[]


## Algorithm :

We suppose that trees do not merge.

1. List begin events -> give us the root cells for each tree
2. Group together the cells whose names contain the name of a previous cell (e.g. "C1" and "C1.1")
   1. Create a DataFrame with [root_cell, cell_name, time, event, input, output]

In [6]:
# "output" entries of the rows whose event is 'begin'; per step 1 of the algorithm
# above, these give the root cell of each tree.
roots = data.loc[data.event == 'begin'].output
roots.head()

0     [C1]
16    [C2]
23    [C3]
39    [C4]
49    [C5]
Name: output, dtype: object

In [7]:
def root_cell_test(row, root): #Tests if the row corresponds to the root cell
    """
    Tell whether one event row belongs to the tree grown from `root`.

    Parameters
    ----------
    row : object with an `input` attribute
        Event row; `row.input` is an iterable of cell names.
    root : str
        Name of the root cell, e.g. "C1".

    Returns
    -------
    bool
        True when at least one input cell is the root itself or one of its
        descendants ("C1.2", "C1.2.1", ...), False otherwise. Rows with an
        empty input never match.
    """
    root_name = str(root)
    # Descendants append ".<n>" to the name of their parent, so the dot has to
    # be part of the prefix: a bare startswith("C1") would also accept the
    # unrelated cells "C10" or "C12.1".
    descendant_prefix = root_name + "."
    return any(
        str(c) == root_name or str(c).startswith(descendant_prefix)
        for c in row.input
    )

def obtain_roots(experience):
    """
    List the root cells of an experiment.

    For every row of `experience` whose event is 'begin', the first element of
    its "output" entry is converted to str; the names are returned as a list,
    in row order.
    """
    begin_rows = experience.loc[experience.event == 'begin']
    return [str(cells[0]) for cells in begin_rows.output]

def cells_to_trees(experience): #experience is a DataFrame
    """
    Group the event rows of one experiment by the root cell they descend from.

    Parameters
    ----------
    experience : pandas.DataFrame
        Event table with at least the columns "time", "event", "input" and
        "output".

    Returns
    -------
    pandas.DataFrame
        The rows selected by root_cell_test for each root returned by
        obtain_roots, one root after the other and sorted by "time" within a
        root. A "root_cell" column naming the root comes first, followed by
        "time", "event", "input", "output" and then any other column of
        `experience`. The index is renumbered from 0. Rows with an empty
        input are not selected by root_cell_test and therefore do not appear.
        When there is no root, an empty frame with the five leading columns
        is returned.
    """
    leading_columns = ["root_cell", "time", "event", "input", "output"]

    #Select root cells in experience
    root_cells = obtain_roots(experience)

    # One frame per root, concatenated once after the loop: calling pd.concat
    # inside the loop copies all previous rows on every iteration, and
    # concatenating onto an empty template frame is deprecated in pandas >= 2.1.
    frames = []
    for root in root_cells:
        in_tree = experience.apply(root_cell_test, axis=1, args=(root,))
        subtree = experience[in_tree].copy()
        subtree.insert(loc=0, column='root_cell', value=root, allow_duplicates=True)
        frames.append(subtree.sort_values(by='time'))

    if not frames:
        return pd.DataFrame(columns=leading_columns)

    trees = pd.concat(frames, ignore_index=True, sort=False)
    # Keep the five documented columns in front, whatever their position was
    # in `experience`.
    other_columns = [c for c in trees.columns if c not in leading_columns]
    return trees[leading_columns + other_columns]

In [8]:
#TEST
# Manual check of root_cell_test: keep only the rows that belong to the first root cell.

experience = pd.read_csv(file, converters = {"input" : str_to_numpy, "output" : str_to_numpy})
root_cells = experience.loc[experience.event == 'begin'].output
# Positional access: root_cells keeps the row labels of `experience`, so
# root_cells[0] would only work when the row labelled 0 is a 'begin' event.
root = root_cells.iloc[0]
print(root[0])
f = lambda row:root_cell_test(row, str(root[0]))

root_cell = experience[experience.apply(f, axis=1)]
root_cell.head()


C1


Unnamed: 0,time,event,input,output
1,66,div,[C1],"[C1.1, C1.2]"
2,238,div,[C1.1],"[C1.1.1, C1.1.2]"
3,386,div,[C1.1.1],"[C1.1.1.1, C1.1.1.2]"
4,432,end,[C1.1.1.1],[]
5,432,end,[C1.1.1.2],[]


In [9]:
#TEST 2
# Run cells_to_trees on the whole example experiment and display the result.
trees = cells_to_trees(experience)
trees

Unnamed: 0,root_cell,time,event,input,output
0,C1,66,div,[C1],"[C1.1, C1.2]"
1,C1,238,div,[C1.1],"[C1.1.1, C1.1.2]"
2,C1,259,div,[C1.2],"[C1.2.1, C1.2.2]"
3,C1,364,div,[C1.2.1],"[C1.2.1.1, C1.2.1.2]"
4,C1,386,div,[C1.1.1],"[C1.1.1.1, C1.1.1.2]"
...,...,...,...,...,...
111,C9,350,div,[C9.2.2],"[C9.2.2.1, C9.2.2.2]"
112,C9,432,end,[C9.2.1.1],[]
113,C9,432,end,[C9.2.1.2],[]
114,C9,432,end,[C9.2.2.1],[]


In [10]:
def list_of_trees(data_frame):
    trees_list = pd.DataFrame(columns =["tree_id", "tree"])
    for idx in data_frame.index:

        #Select experience correspondingto index idx
        path = data_frame.loc[idx,'path']
        file_name = data_frame.loc[idx,'code_file']
        file_path = path + "/"+ file_name + ".csv"
        experience = pd.read_csv(file_path, converters = {"input" : str_to_numpy, "output" : str_to_numpy})

        trees = cells_to_trees(experience)
        
        #Add experience trees to the list of all trees
        trees_list.append({"tree_id" : "pouet"})