# Initial data processing

## Cleaning the data

We are using human-generated neural reconstructions as our ground truth. The first step is to strip the comment lines from the header of each file.

In [1]:
from swc_io import get_fnames_and_abspath_from_dir, remove_comments_from_swc

# Directory holding the raw human-annotated SWC reconstructions.
RAW_HUMAN_TRAJECTORY_DIR = "../data/01_raw_human_annotations/"

# List the raw files; the helper returns two values, unpacked here as the
# file names and (per the helper's name) their absolute paths.
fnames, abs_paths = get_fnames_and_abspath_from_dir(RAW_HUMAN_TRAJECTORY_DIR)
# Strip the header comments from every raw file.
# NOTE(review): the output location is decided inside swc_io and is not
# visible here - presumably a separate "clean" directory; confirm.
remove_comments_from_swc(abs_paths, fnames)

Next, we need to split each file into its component branches. The SWC files have the following columns:

`node_id type x_coordinate y_coordinate z_coordinate radius parent_node`

First, I convert the SWC file into a linked list.

In [2]:
# moved code to its own script
from swc_io import swc_to_linked_list

## Chopping  reconstructions into individual branches

I trace the neuron starting from each root node, which by convention has a `parent_node` of -1. From each root node, I grow one branch per child of that root.

In [3]:
def chop_graph(fname: str, linked_list: dict):
    """Split one reconstruction into branches, one per child of each root.

    Args:
        fname: name of the reconstruction; used as the prefix of every
            branch name generated here.
        linked_list: mapping from a node id to the list of that node's
            children. Root nodes are looked up under the key "-1". Every
            node is a 4-tuple whose first element is the node id.

    Each (root, child) pair is handed to ``grow_branch`` together with a
    branch name of the form ``<fname>_root<root id>_child<child id>``.
    """
    assert isinstance(linked_list, dict)

    # Roots are the nodes whose parent id is "-1".
    for root in linked_list["-1"]:
        root_id, _, _, _ = root
        assert len(root) == 4 and isinstance(root, tuple)

        # Every direct child of a root starts its own branch.
        for child in linked_list[root_id]:
            child_id, _, _, _ = child
            assert len(child) == 4 and isinstance(child, tuple)

            name_parts = (
                fname,
                "root{}".format(root_id),
                "child{}".format(child_id),
            )
            grow_branch(linked_list, root, child, "_".join(name_parts))
            
            

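I grow each branch until I encounter a fork. Forks are easy to find because the parent node will have more than one child. When a fork is found, the current branch is saved and a new branch is started from the fork node for each of its children.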

In [4]:
from swc_io import save_branch_as_swc

def grow_branch(linked_list: dict, root_node: tuple, first_child_node: tuple, branch_name: str):
    """Grow a branch node by node until it forks or ends, then save it.

    The branch starts as ``[root_node, first_child_node]`` and is extended
    while the current node has exactly one child. When the current node has
    no children the branch is saved. When it has more than one child (a
    fork) the branch is saved and a new branch is grown recursively for each
    child, rooted at the fork node.

    Args:
        linked_list: mapping from a node id to the list of that node's
            children.
        root_node: first node of the branch; a tuple whose first element is
            the node id.
        first_child_node: second node of the branch, a child of
            ``root_node``.
        branch_name: name passed to ``save_branch_as_swc`` for this branch.
            Branches started at a fork are named
            ``<branch_name>_grandchild<child id>``.

    Raises:
        AssertionError: if an argument has the wrong type, if a children
            entry is not a list, or if a node is listed as its own child.
        Exception: if the branch grows longer than ``len(linked_list)``.
    """
    assert isinstance(linked_list, dict)
    assert isinstance(root_node, tuple)
    assert isinstance(first_child_node, tuple)

    # initialize the branch
    branch = [root_node, first_child_node]

    parent_node = first_child_node
    while True:
        if len(branch) % 1500 == 0:
            print("{} nodes processed".format(len(branch)))

        # num nodes in a branch should never be > #nodes in whole tree
        if len(branch) > len(linked_list):
            print(branch)
            raise Exception("this is growing out of control")

        parent_node_id = parent_node[0]
        child_nodes = linked_list[parent_node_id]
        assert isinstance(child_nodes, list)

        if len(child_nodes) > 1:
            # Fork: close the current branch, then start one new branch per
            # child, each rooted at the fork node.
            save_branch_as_swc(branch, branch_name)
            for child_node in child_nodes:
                child_node_id = child_node[0]
                # str.join takes a single iterable, so the parts go in a list.
                child_branch_name = "_".join(
                    [branch_name, "grandchild{}".format(child_node_id)]
                )
                grow_branch(linked_list, parent_node, child_node, child_branch_name)
            return

        if len(child_nodes) == 0:
            # No more children: end of branch.
            save_branch_as_swc(branch, branch_name)
            return

        # Exactly one child: extend the branch and keep walking.
        child_node = child_nodes[0]
        branch.append(child_node)
        assert child_node != parent_node  # guard against a self-loop
        parent_node = child_node
                

Finally, save each branch as its own SWC file. Reminder, the column order convention is 

`node_id type x_coordinate y_coordinate z_coordinate radius parent_node`

In [5]:
# moved to separate script

Now, we can go through the list of clean files and actually split everything.

In [6]:
# Directory holding the SWC files that already had their header comments removed.
CLEAN_SWC_DIR = "../data/02_human_clean/"
clean_fnames, clean_files_abspath = get_fnames_and_abspath_from_dir(CLEAN_SWC_DIR)
# Sanity check before pairing the two lists; the message reports the lengths
# of the two lists that are actually being compared.
assert len(clean_files_abspath) == len(clean_fnames), "# file paths and fnames don't match: {} and {}".format(len(clean_files_abspath), len(clean_fnames))

# Walk the clean files themselves rather than indexing by the raw-file count,
# so the loop stays correct when the raw and clean directories differ in size.
for clean_fname, clean_abspath in zip(clean_fnames, clean_files_abspath):
    # Parse the SWC file, then split the reconstruction into branches.
    linked_list = swc_to_linked_list(clean_abspath)
    chop_graph(clean_fname, linked_list)