In [1]:
# EFO identifiers of the study types under consideration. They are written in
# lower case here and normalised to the upper-case "EFO_" prefix in one pass.
# Entries that are commented out are deliberately excluded from the list.
study_type_terms = [
    term_id.replace("efo", "EFO")
    for term_id in (
        'efo_0002768',
        'efo_0002761',
        'efo_0003738',
        'efo_0000749',
        'efo_0002767',
        # 'efo_0002692',
        'efo_0000753',
        'efo_0003737',
        'efo_0002759',
        'efo_0002693',
        'efo_0007045',
        'efo_0002762',
        'efo_0005684',
        'efo_0002760',
        'efo_0004428',
        'efo_0002765',
        'efo_0002769',
        # 'efo_0000747',
        'efo_0005310',
        # 'efo_0003143',
        'efo_0002771',
        'efo_0001030',
        'efo_0002896',
        'efo_0004160',
        'efo_0001033',
        'efo_0007690',
        'efo_0002764',
        'efo_0003751',
        'efo_0003749',
        'efo_0008891',
        'efo_0009653',
        'efo_0002766',
        'efo_0003753',
        'efo_0005517',
        'efo_0007693',
        'efo_0030005',
        'efo_0007691',
        'efo_0001031',
        'efo_0010891',
        'efo_0002770',
        'efo_0005227',
        # 'efo_0008994',
    )
]

In [2]:
import owlready2 as o2

# Read the EFO ontology from the local OWL file. `load()` hands back the
# ontology object, so it can be chained; the trailing expression keeps the
# cell's displayed result (the ontology repr).
file = 'efo.owl'
onto = o2.get_ontology(file).load()
onto

get_ontology("http://www.ebi.ac.uk/efo/efo.owl#")

In [6]:
class O2_LeastCommonAncestor:
    """Find the least common ancestor (LCA) of a set of ontology terms.

    A candidate is any ancestor of the start term whose descendants contain
    every term name in ``terms_to_include``. Candidates are ranked by their
    number of descendants; the one with the fewest is stored in ``self.lca``.

    Attributes:
        onto: ontology object; must provide ``search_one(iri=...)``.
        prefix: IRI prefix prepended to a term name to build its IRI.
        terms_to_include: list of term names the ancestor must cover.
        lca: ``(node, descendant_count)`` tuple set by ``iterate_from``,
            ``None`` until a search has succeeded.
    """

    def __init__(self, onto):
        self.onto = onto
        self.prefix = "http://www.ebi.ac.uk/efo/"
        self.terms_to_include = []
        self.lca = None

    def set_terms_to_include(self, terms_to_include):
        """Store the term names that a common ancestor has to cover.

        The terms are kept in ``self.terms_to_include`` (not under the
        method's own name), so this method stays callable and the same
        instance can be reused for several term sets.
        """
        self.terms_to_include = list(terms_to_include)

    def iterate_from(self, node_str=None):
        """Search the ancestors of ``node_str`` for common ancestors.

        Prints all common ancestors as ``(label, descendant_count)`` pairs,
        ordered by ascending descendant count, and stores the first one in
        ``self.lca``.

        Args:
            node_str: name of the term whose ancestors are examined.
                Defaults to the first entry of ``terms_to_include``.

        Raises:
            ValueError: if no start term is available, the start term is not
                found in the ontology, or no ancestor covers all terms.
        """
        if node_str is None:
            if not self.terms_to_include:
                raise ValueError(
                    "no start term: call set_terms_to_include() with at least "
                    "one term or pass node_str"
                )
            node_str = self.terms_to_include[0]

        node_obj = self.onto.search_one(iri=self.prefix + node_str)
        if node_obj is None:
            raise ValueError(
                "term %r not found in ontology (IRI %s)"
                % (node_str, self.prefix + node_str)
            )

        # Built once; the original rebuilt this set for every ancestor.
        required = set(self.terms_to_include)

        solutions = []
        for node in node_obj.ancestors():
            descendants = node.descendants()  # all descendants, as node objects
            # Subset test: this ancestor qualifies if it covers every required term.
            if required <= {d.name for d in descendants}:
                solutions.append((node, len(descendants)))

        solutions = sorted(solutions, key=lambda x: x[1])
        print("all commmon ancestors:", [(s[0].label, s[1]) for s in solutions])

        if not solutions:
            raise ValueError(
                "no common ancestor of %r covers all terms in %r"
                % (node_str, self.terms_to_include)
            )
        self.lca = solutions[0]

In [7]:
# Compute the least common ancestor of all study-type terms.
# `lca` is read again by the per-term statistics cell further down.
lca = O2_LeastCommonAncestor(onto)
lca.set_terms_to_include(study_type_terms)
# Prints every common ancestor (ordered by descendant count) and sets lca.lca.
lca.iterate_from()
# lca.lca is a (node, descendant_count) tuple for the smallest common ancestor.
print(lca.lca)

all commmon ancestors: [(['assay by instrument'], 149), (['assay', locstr('assay', 'en')], 596), (['experimental process'], 804), (['planned process'], 854), (['process'], 1941), (['experimental factor'], 51414), ([], 57807)]
(efo.EFO_0002773, 149)


In [8]:
# Print some numbers for every study-type term, relative to the least common
# ancestor (LCA) stored in `lca.lca`:
#   desc.    number of descendants of the term
#   overlap  how many *other* study-type terms are among its descendants
#   ancs.    ancestor count of the term minus that of the LCA, plus one
#   name     label and name of the term
# The second line per term shows the ancestor ranked directly after the LCA
# when the term's ancestors below the LCA are ordered by ancestor count.

print("desc.\toverlap\tancs.\tname\tsubnode under assay")

# Loop invariants, computed once instead of once per term.
study_type_term_set = set(study_type_terms)
lca_node = lca.lca[0]
lca_ancestor_count = len(lca_node.ancestors())

# `node` is kept as the loop variable name because later cells read it.
for node_str in study_type_terms:
    node = onto.search_one(iri = "http://www.ebi.ac.uk/efo/"+node_str)
    descendants = node.descendants()
    node_ancestors = node.ancestors()
    # Start from -1 so the term itself is not counted as overlap.
    overlap = sum(d.name in study_type_term_set for d in descendants) - 1
    print(len(descendants),"\t", overlap, "\t", len(node_ancestors)-lca_ancestor_count+1, "\t", node.label, node.name)
    # Ancestors of the term that have the LCA among their own ancestors.
    internal_ancestors = []
    for a in node_ancestors:
        a_ancestors = a.ancestors()
        if lca_node in a_ancestors:
            internal_ancestors.append((a, len(a_ancestors)))
    internal_ancestors = sorted(internal_ancestors, key = lambda x : x[1])
    if len(internal_ancestors) > 1:
        print("\t\t\t\t", internal_ancestors[1][0].label)
    else:
        # Fewer than two entries, e.g. when the term is the LCA itself.
        print("\t\t\t\t", "(no subnode below the common ancestor)")

desc.	overlap	ancs.	name	subnode under assay
1 	 0 	 6 	 ['transcription profiling by array'] EFO_0002768
				 ['assay by array']
3 	 0 	 7 	 ['methylation profiling by high throughput sequencing'] EFO_0002761
				 ['assay by sequencer']
2 	 1 	 8 	 ['RNA-seq of coding RNA'] EFO_0003738
				 ['assay by sequencer']
1 	 0 	 5 	 ['comparative genomic hybridization by array'] EFO_0000749
				 ['assay by array']
1 	 0 	 6 	 ['genotyping by array'] EFO_0002767
				 ['assay by array']
1 	 0 	 5 	 ['microRNA profiling by array'] EFO_0000753
				 ['assay by array']
3 	 1 	 8 	 ['RNA-seq of non coding RNA'] EFO_0003737
				 ['assay by sequencer']
1 	 0 	 6 	 ['methylation profiling by array'] EFO_0002759
				 ['assay by array']
10 	 2 	 6 	 ['DNA-seq'] EFO_0002693
				 ['assay by sequencer']
8 	 1 	 8 	 ['ATAC-seq'] EFO_0007045
				 ['assay by sequencer']
1 	 0 	 7 	 ['ChIP-chip by tiling array'] EFO_0002762
				 ['assay by array']
1 	 0 	 11 	 ['RNA-seq of coding RNA from single cells'] EFO_000

In [11]:
# Print the label and the IAO_0000115 annotation of every descendant of `node`.
# NOTE(review): `node` is not assigned in this cell; it appears to be the loop
# variable left over from the per-term statistics loop, i.e. the last entry of
# study_type_terms. Confirm, and consider looking the term up explicitly.
for d in node.descendants():
    print(d.label, d.IAO_0000115)
    print("")

['GRO-seq'] ['GRO-seq or Genomic run-on sequencing or sometimes Global run-on sequencing is a sequencing assay based on traditional nuclear run-on assays, but instead of looking at a few loci at a time, this is a high-throughput method that allows for the calculation of transcription rates of all genes across the genome. Please note that the starting materials for sequencing are RNAs, not genomic DNA because nascent transcripts are studied.']



In [12]:
# Display the IAO_0000115 annotation of `node` itself (presumably its textual
# definition; `node` is carried over from an earlier cell).
node.IAO_0000115

['GRO-seq or Genomic run-on sequencing or sometimes Global run-on sequencing is a sequencing assay based on traditional nuclear run-on assays, but instead of looking at a few loci at a time, this is a high-throughput method that allows for the calculation of transcription rates of all genes across the genome. Please note that the starting materials for sequencing are RNAs, not genomic DNA because nascent transcripts are studied.']

In [9]:
# Sanity check of the LCA search on two small, hand-picked term sets.
# A fresh finder instance is created for each set. Note that this rebinds the
# notebook-level name `lca`; after the loop it refers to the last finder.
for check_terms in (
    ["EFO_0030035", "EFO_0010941"],
    ["EFO_0030035", "EFO_0010941", "EFO_0009655"],
):
    lca = O2_LeastCommonAncestor(onto)
    lca.set_terms_to_include(check_terms)
    lca.iterate_from()


all commmon ancestors: [(['neoplastic sample'], 5), (['abnormal sample'], 6), (['material sample'], 8), (['case control design'], 8), (['specimen', locstr('specimen', 'en')], 23), (['biological variation design'], 38), (['study design'], 64), (['protocol', locstr('protocol', 'en')], 117), (['information entity'], 8694), (['material entity'], 14234), (['experimental factor'], 51414), ([], 57807)]
all commmon ancestors: [(['abnormal sample'], 6), (['material sample'], 8), (['case control design'], 8), (['specimen', locstr('specimen', 'en')], 23), (['biological variation design'], 38), (['study design'], 64), (['protocol', locstr('protocol', 'en')], 117), (['information entity'], 8694), (['material entity'], 14234), (['experimental factor'], 51414), ([], 57807)]
