# Initial data processing

## Cleaning the data

We are using human-generated neural reconstructions as our ground truth. The first step is to strip the comment lines from the header of each SWC file.

In [64]:
import glob
import os

def get_human_trajectories(data_dir="../data/human/"):
    """Return the paths of all human-traced SWC files in ``data_dir``.

    Args:
        data_dir: Directory to search (non-recursively) for ``*.swc`` files.
            Defaults to the notebook's raw-data folder.

    Returns:
        A sorted list of matching paths; empty if the directory does not
        exist or holds no ``.swc`` files.
    """
    # glob returns entries in arbitrary (filesystem) order; sort so that
    # repeated runs process the files in the same order.
    return sorted(glob.glob(os.path.join(data_dir, "*.swc")))

def remove_comments(fpaths, fnames, outdir="../data/human_clean/"):
    """Copy SWC files into ``outdir`` with their comment lines removed.

    SWC files start with ``#`` comment lines; every line whose first
    non-whitespace character is ``#`` is dropped, all other lines are copied
    unchanged.

    Args:
        fpaths: Paths of the input files.
        fnames: Output file names, one per entry of ``fpaths`` (same order).
        outdir: Directory that receives the cleaned files. It is created
            (including missing parents) if it does not exist.

    Raises:
        ValueError: If ``fpaths`` and ``fnames`` differ in length.
    """
    if len(fpaths) != len(fnames):
        raise ValueError(
            "got {} paths but {} file names".format(len(fpaths), len(fnames))
        )

    outdir = os.path.abspath(outdir)
    # makedirs also creates missing parent folders and tolerates an existing
    # directory, unlike the exists()/mkdir() pair.
    os.makedirs(outdir, exist_ok=True)

    for fpath, fname in zip(fpaths, fnames):
        outfile = os.path.join(outdir, fname)
        # Context managers close both handles even if reading/writing fails.
        with open(fpath, "r") as source, open(outfile, "w") as cleaned:
            cleaned.writelines(
                line for line in source if not line.lstrip().startswith("#")
            )

def main():
    """Collect every file under ../data/human/ and strip its comments."""
    names = []
    paths = []
    # Walk the raw-data tree, recording each file name together with its
    # absolute path (both lists stay in the same order).
    for folder, _subdirs, files in os.walk("../data/human/"):
        names += files
        paths += [os.path.abspath(os.path.join(folder, name)) for name in files]

    remove_comments(paths, names)


# Run the cleaning step over every file found under ../data/human/.
main()

Next, we need to split each file into its component branches. The SWC files have the following columns:

`node_id type x_coordinate y_coordinate z_coordinate radius parent_node`

First, I convert the SWC file into a linked list: a dictionary that maps each parent node ID to the list of its child nodes.

In [75]:
from collections import defaultdict

def file_to_linked_list(fpath: str):
    """Parse an SWC file into a parent-id -> child-nodes mapping.

    Each data line must hold the seven whitespace-separated SWC columns
    ``node_id type x y z radius parent_node``. Blank lines and ``#`` comment
    lines are skipped.

    Args:
        fpath: Path of the SWC file to read.

    Returns:
        A ``defaultdict(list)`` keyed by parent node id (a string). Each value
        is the list of that parent's children as
        ``(node_id, x, y, z)`` tuples of strings, in file order. Root nodes
        are found under the key ``"-1"``.

    Raises:
        ValueError: If a data line does not have exactly seven columns.
    """
    # https://stackoverflow.com/a/17756005/4212158
    linked_list = defaultdict(list)

    with open(fpath, "r") as swc_file:
        for line_number, line in enumerate(swc_file, start=1):
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue
            if len(fields) != 7:
                raise ValueError(
                    "faulty node on line {} of {}: {!r}".format(line_number, fpath, line)
                )
            # note: if the parent node is -1, then the child_node_id is the
            # true identity of the root node
            child_node_id, _type, x_coord, y_coord, z_coord, _radius, parent_node = fields
            linked_list[parent_node].append((child_node_id, x_coord, y_coord, z_coord))

    return linked_list


## Chopping reconstructions into individual branches

I trace the neuron starting from each root node, which by convention has a parent_node_id of -1. From each root node, I grow one branch per child of that root.

In [76]:
def chop_graph(fname: str, linked_list: dict):
    """Split one reconstruction into branches, starting at its root nodes.

    Root nodes are the entries stored under the parent id "-1". For every
    child of every root, a branch named
    ``<fname>_root<root id>_child<child id>`` is handed to ``grow_branch``.
    """
    assert isinstance(linked_list, dict)

    for root in linked_list["-1"]:
        root_id, _rx, _ry, _rz = root
        assert(len(root) == 4 and isinstance(root, tuple))

        for child in linked_list[root_id]:
            child_id, _cx, _cy, _cz = child
            assert(len(child) == 4 and isinstance(child, tuple))

            name_parts = [
                fname,
                "root{}".format(root_id),
                "child{}".format(child_id),
            ]
            grow_branch(linked_list, root, child, "_".join(name_parts))
            
            

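I grow each branch until I encounter a fork. Forks are easy to find because the parent node will have more than one child. When a fork is found, the current branch is saved and a new branch is started for each child of the fork node.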

In [80]:
def grow_branch(linked_list: dict, root_node: tuple, first_child_node: tuple, branch_name: str):
    """Grow a branch until it forks or ends, saving every fork-free segment.

    Starting from ``[root_node, first_child_node]`` the branch is extended one
    node at a time while the current node has exactly one child. When the
    current node has no children the branch is saved. When it has several
    (a fork) the branch is saved and a new branch, beginning at the fork node,
    is grown for each child under the name
    ``<branch_name>_grandchild<child id>``.

    Args:
        linked_list: Mapping of parent node id -> list of child node tuples
            ``(node_id, x, y, z)``.
        root_node: Node the branch starts from.
        first_child_node: Child of ``root_node`` that the branch follows.
        branch_name: Name used for the saved SWC file of this branch.

    Raises:
        Exception: If a branch grows longer than ``len(linked_list)``, which
            indicates a cycle in the parent/child links.
    """
    assert isinstance(linked_list, dict)
    assert isinstance(root_node, tuple)
    assert isinstance(first_child_node, tuple)

    # Sub-branches are queued on an explicit stack rather than handled by
    # recursive calls, so deeply nested forks cannot hit the recursion limit.
    pending = [(root_node, first_child_node, branch_name)]
    while pending:
        start_node, parent_node, name = pending.pop()
        branch = [start_node, parent_node]

        while True:
            if len(branch) % 1500 == 0:
                print("{} nodes processed".format(len(branch)))

            # num nodes in a branch should never be > #nodes in whole tree
            if len(branch) > len(linked_list):
                print(branch)
                raise Exception("this is growing out of control")

            # .get() avoids inserting empty entries into a defaultdict (which
            # would loosen the guard above) and also works on plain dicts.
            child_nodes = linked_list.get(parent_node[0], [])
            assert isinstance(child_nodes, list)

            if len(child_nodes) == 1:  # no fork: keep extending this branch
                child_node = child_nodes[0]
                assert child_node != parent_node
                branch.append(child_node)
                parent_node = child_node
                continue

            if child_nodes:  # fork found, chop some more
                print("fork found after {} nodes".format(len(branch)))
            else:  # no more children, end of branch
                print("end of branch {} found after {} nodes".format(name, len(branch)))
            save_branch_as_swc(branch, name)

            # Push in reverse so children are processed in their listed order
            # (depth-first), as nested recursive calls would do.
            # NOTE(review): names gain one "_grandchild<id>" suffix per fork
            # level; very deep trees may exceed filesystem name limits.
            for child_node in reversed(child_nodes):
                child_branch_name = "_".join([name, "grandchild{}".format(child_node[0])])
                pending.append((parent_node, child_node, child_branch_name))
            break
                

Finally, save each branch as its own SWC file. As a reminder, the column order convention is:

`node_id type x_coordinate y_coordinate z_coordinate radius parent_node`

In [81]:
def save_branch_as_swc(branch: list, branch_name: str, outdir: str = "../data/human_splitted/"):
    """Write one branch to ``<outdir>/<branch_name>.swc``.

    Each node becomes one line in SWC column order
    ``node_id type x y z radius parent_node``. The type is always "3" and the
    radius always "1.0". The first node is written with parent "-1"; every
    later node's parent is the node written just before it.

    Args:
        branch: Ordered list of ``(node_id, x, y, z)`` nodes.
        branch_name: Output file name without the ``.swc`` extension.
        outdir: Directory that receives the file. It is created (including
            missing parents) if it does not exist.

    Raises:
        ValueError: If a node does not unpack into exactly four items.
    """
    assert(isinstance(branch, list))
    outdir = os.path.abspath(outdir)
    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, branch_name + ".swc")

    default_radius = "1.0"
    default_type = "3"

    parent_node_id = "-1"
    # The context manager closes the file even if a node turns out faulty.
    with open(outfile, "w") as output:
        for node in branch:
            try:
                child_node_id, x, y, z = node
            except ValueError as err:
                raise ValueError(
                    "faulty node in branch {}: {!r}".format(branch_name, node)
                ) from err
            # this is the SWC file convention
            swc_items = [child_node_id, default_type, x, y, z, default_radius, parent_node_id]
            # One node per line: without the newline all nodes run together
            # and the file is not valid SWC.
            output.write(" ".join(str(item) for item in swc_items) + "\n")
            parent_node_id = child_node_id

Now, we can go through the list of clean files and actually split everything.

In [82]:
# Gather every cleaned file: its name and its absolute path, in matching order.
clean_files = []
fnames = []
for root, dirs, fnames_ in os.walk("../data/human_clean/"):
    for f in fnames_:
        fnames.append(f)
        clean_files.append(os.path.abspath(os.path.join(root, f)))
assert len(clean_files) == len(fnames), "# file paths and fnames don't match: {} and {}".format(len(clean_files), len(fnames))

# Parse each cleaned file and chop it into per-branch SWC files.
for i, (clean_file, fname) in enumerate(zip(clean_files, fnames)):
    linked_list = file_to_linked_list(clean_file)
    print("#{} splitting {} to branches".format(i, fname))
    chop_graph(fname, linked_list)

#0 parsing 110_ZWX_LijLiu_06072018.ano.swc to linked list
#0 splitting 110_ZWX_LijLiu_06072018.ano.swc to branches
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5497882_child5497881 found after 6 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5501825_child5501824 found after 13 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5498858_child5498857 found after 48 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5496748_child5496747 found after 56 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5502730_child5502729 found after 31 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5496692_child5496691 found after 47 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5496804_child5496803 found after 56 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5496937_child5496936 found after 54 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504420_child5504419 found after 29 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5487621_

end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503336_child5503335 found after 63 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5489608_child5489607 found after 20 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504786_child5504785 found after 26 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504760_child5504759 found after 29 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503246_child5503245 found after 11 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503358_child5503357 found after 22 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5490273_child5490272 found after 59 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503498_child5503497 found after 38 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5490214_child5490213 found after 25 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503613_child5503612 found after 45 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5490285_child5490284 found aft

end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5499443_child5499442 found after 21 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5501348_child5501347 found after 9 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5499641_child5499640 found after 27 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503182_child5503181 found after 17 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5497576_child5497575 found after 13 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5486804_child5486803 found after 44 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5494274_child5494273 found after 4 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5503235_child5503234 found after 12 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5498536_child5498535 found after 21 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5491339_child5491338 found after 33 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5500476_child5500475 found after

end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5494978_child5494977 found after 2 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5494282_child5494281 found after 8 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5498965_child5498964 found after 21 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5489021_child5489020 found after 14 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5493895_child5493894 found after 8 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5499231_child5499230 found after 26 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5486980_child5486979 found after 14 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5488008_child5488007 found after 13 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5500531_child5500530 found after 27 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5493595_child5493594 found after 51 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5489543_child5489542 found after 

end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5497965_child5497964 found after 9 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5500784_child5500783 found after 55 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5494976_child5494975 found after 57 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504180_child5504179 found after 12 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5488127_child5488126 found after 94 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5486442_child5486441 found after 45 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5491213_child5491212 found after 12 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5494004_child5494003 found after 46 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504491_child5504490 found after 8 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5505350_child5505349 found after 34 nodes
end of branch 110_ZWX_LijLiu_06072018.ano.swc_root5504483_child5504482 found after