Turn the [Jaspar](http://jaspar.genereg.net) experiments at  https://github.com/TomConlin/Jaspar_FA 
into [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) to load into a [triplestore](https://en.wikipedia.org/wiki/Triplestore)  
This work is done for the Monarch's Translator project

In [1]:
import csv
import hashlib
import re
import yaml
import os, stat

The triples to generate may be found by extracting them from the  
graphical model ```jaspar_target_model.gv```, which encodes the relations  
in the [GraphViz](https://graphviz.org) [dot](https://graphviz.org/doc/info/lang.html) format


an image is generated with   
    ```dot -T png jaspar_target_model.gv > jaspar_target_model.png```

![Jaspar Target model](jaspar_target_model.png)

Extract the pseudo RDF triples to use as templates for what needs to be generated.  

```sed 's|//.*||g' jaspar_target_model.gv |\
awk -F'"' '/.* -> .*/ {print "# <" $2 "><" $6 "><" $4 ">"}'```

A few convenience functions from Monarch's dipper

In [2]:
# return a deterministic digest of input
# the 'b' is an experiment forcing the first char to be
# non numeric but valid hex
# which is in no way required for RDF
# but can help when using the identifier in other contexts
# where the first char of an identifier 
# is expected to be non numeric (such as 'C')
def digest_id(wordage):
    return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]

In [3]:
# regular expression to limit what is found in the CURIE identifier
# it is ascii centric and may(will) not pass some valid utf8 curies
CURIERE = re.compile(r'^.*:[A-Za-z0-9_][A-Za-z0-9_.]*[A-Za-z0-9_]*$')

In [4]:
with open('translation_tables/curie_map.yaml') as f:
    CURIEMAP = yaml.load(f) 

In [5]:
# Translate labels found in ontologies
# to the terms they are for
with open('translation_tables/translation_table.yaml') as f:
    TT = yaml.load(f)           

In [6]:
# Translate external strings found in datasets
# to specific labels found in ontologies
with open('translation_tables/jaspar.yaml') as f:
    LT = yaml.load(f)

In [7]:
def resolve(label):  # , local_tt):
    '''
    composed mapping 
    given x, f() and g(), here Local & Global 
    translation tables respectivly
    return g(x) | g(f(x)) | f(x) in order of preference
    (TODO consider returning x on fall through)

    : return as close to a global mapping  x as we can manage

    '''
    
    if label is not None and label in TT:
        term_id = TT[label]
    elif label is not None and label in LT:
        label = LT[label]
        if label in TT:
            term_id = TT[label]
        else:
            # print('Translated but do not have a term_id for label: ' + label) 
            term_id = label
    else:
        
        #print('Do not have any mapping for label: ' + label)
        term_id = label
    return term_id

In [8]:
def make_spo(sub, prd, obj):
    '''
    Decorates the three given strings as a line of ntriples

    '''
    # To establish string as a curi and expand we use a global curie_map(.yaml)
    # sub are allways uri  (unless a bnode)
    # prd are allways uri (unless prd is 'a')
    # should fail loudly if curie does not exist
    if prd == 'a':
        prd = 'rdf:type'

    (subcuri, subid) = re.split(r':', sub)
    (prdcuri, prdid) = re.split(r':', prd)
    objt = ''

    # object is a curie or bnode or literal [string|number]
    match = re.match(CURIERE, obj)
    objcuri = None
    if match is not None:
        try:
            (objcuri, objid) = re.split(r':', obj)
        except ValueError:
            match = None
    if match is not None and objcuri in CURIEMAP:
        objt = CURIEMAP[objcuri] + objid
        # allow unexpanded bnodes in object
        if objcuri != '_' or CURIEMAP[objcuri] != '_:':
            objt = '<' + objt + '>'
    elif obj.isnumeric():
        objt = '"' + obj + '"'
    else:
        # Literals may not contain the characters ", LF, CR '\'
        # except in their escaped forms. internal quotes as well.
        obj = obj.strip('"').replace('\\', '\\\\').replace('"', '\'')
        obj = obj.replace('\n', '\\n').replace('\r', '\\r')
        objt = '"' + obj + '"'

    # allow unexpanded bnodes in subject
    if subcuri is not None and subcuri in CURIEMAP and \
            prdcuri is not None and prdcuri in CURIEMAP:
        subjt = CURIEMAP[subcuri] + subid
        if subcuri != '_' or CURIEMAP[subcuri] != '_:':
            subjt = '<' + subjt + '>'

        return subjt + ' <' + CURIEMAP[prdcuri] + prdid + '> ' + objt + ' .'
    else:
        print('Cant work with: ', subcuri, subid,  prdcuri, prdid, objt)
        return None


In [9]:
def write_triple(sub, prd, obj):
    triples.append(make_spo(sub,prd, obj))

In [10]:
# gene start regions have a centroid and contain a set of motifs
# sets of motifs are taken as signatures 
# which may be shared by different regions
regions = {
    '1k' : {'f1' : 'gene_motifsetsig_1k.tab'}, 
    '2k' : {'f1' : 'gene_motifsetsig_2k.tab'},
    '5k' : {'f1' : 'gene_motifsetsig_5k.tab'}
}

The triples to produce from these ```gene_motifsetsig_?k.tab``` files are:

    # <NCBIGene:123><SO:adjacent_to><BNODE:gene_upstream_region>
    # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
    # <BNODE:gene_upstream_region><GENO:has_extent><1000 (region extent bp)>
    # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
    # <BNODE:motif_set><rdfs:label><motif_set_sig>
    # <BNODE:motif_set><rdf:type><SIO:collection>


In [11]:
triples = []

for extent in regions:
    # print(extent)
    # print(regions[extent])
    for fname in regions[extent]:
       # print(fname)
       # print(regions[extent][fname])    
        with open(regions[extent][fname], 'r') as tabfile:
            filereader = csv.reader(tabfile, delimiter='\t')
            for row in filereader:   
                (geneid, centroid, motifsetsig, count) = row
                gene = 'NCBIGene:' + str(geneid)
                region_label = gene + "_up"  + extent + "_@" + centroid
                region = '_:' + digest_id(region_label)
                motifset = '_:' + digest_id(motifsetsig)
    
                # using the triple templates extracted from the GraphViz target model
    
                # <NCBIGene_123><SO:adjacent_to><BNODE:gene_upstream_region>
                write_triple(gene, resolve('SO:adjacent_to'), region)
                # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
                write_triple(region, resolve('rdfs:label'), region_label)
                # <BNODE:gene_upstream_region><GENO:has_extent><1000>
                write_triple(region, resolve('GENO:has_extent'), extent)
                # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
                write_triple(region, 'rdf:type', resolve('SO:five_prime_flanking_region'))
                # <BNODE:motif_set><rdfs:label><motif_set_sig>
                write_triple(motifset, 'rdfs:label', motifsetsig)
                # <BNODE:motif_set><rdf:type><SIO:collection>
                write_triple(motifset, 'rdf:type', resolve('SIO:collection'))

In [12]:
len(triples)

278862

In [13]:
print(triples[1])
print(triples[len(triples)-1])

<https://127.0.0.1/.well-known/genid/b605d866a434b979359a> <http://www.w3.org/2000/01/rdf-schema#label> "NCBIGene:100037417_up2k_@+24308025" .
<https://127.0.0.1/.well-known/genid/bab65abf77bc97165a69> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> "SIO:000616" .


splitting the size of the motifset out b/c it is independent of the gene start regions

    # <BNODE:motif_set><rdf:value><3>
    
in `motifsetsig_count.tab`

In [14]:
with open('motifsetsig_count.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:   
        (motifsetsig, count) = row
        motifset = '_:' + digest_id(motifsetsig)
        # <BNODE:motif_set><rdf:value><3>
        write_triple(motifset, 'rdf:value', count)

In [15]:
print(len(triples))
print(triples[len(triples)-1])

284861
<https://127.0.0.1/.well-known/genid/ba4b9237bacccdf19c07> <http://www.w3.org/1999/02/22-rdf-syntax-ns#value> "1" .


these may not turn out to be relevant, but will keep them for now  

    # <BNODE:motif_set><OIO:subset><BNODE:motif_set>

comes from `motifsetsig_subset.tab`  

In [16]:
with open('motifsetsig_subset.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:     
        (motifsetsig, subsetsig) = row
        motifset = '_:' + digest_id(motifsetsig)
        subset = '_:' + digest_id(subsetsig)
        # <BNODE:motif_set><OIO:subset><BNODE:motif_set>
        write_triple(motifset,  resolve('OIO:subset'), subset)

In [17]:
print(len(triples))
print(triples[len(triples)-1])

295223
<https://127.0.0.1/.well-known/genid/b1d5781111d84f7b3fe4> <http://www.geneontology.org/formats/oboInOwl#subset> <https://127.0.0.1/.well-known/genid/be5dbbcea5ce7e2988b8> .


    # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
or      
    # <BNODE:motif_set><RO:has member><JASPAR:motif> 
    
will just do the first with data from `motifsetsig_motif.tab`

In [18]:
with open('motifsetsig_motif.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:     
        (motifsetsig, motif) = row
        motifset = '_:' + digest_id(motifsetsig)
        # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
        write_triple(motifset, 'OIO:hasDbXref', "JASPAR:" + motif)

In [19]:
print(len(triples))
print(triples[len(triples)-1])

344019
<https://127.0.0.1/.well-known/genid/b093da02f1d652201da3> <http://www.geneontology.org/formats/oboInOwl#hasDbXref> <http://fantom.gsc.riken.jp/5/sstar/JASPAR_motif:MA0940.1> .


    # <BNODE:gene_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    # <BNODE:gene2_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    
    # <BNODE:gene_jaccard_value><rdfs:label><gene1_region gene2_region>
    # <BNODE:gene_jaccard_value><SWO:Similarity score>	<0.73>
    # <BNODE:gene_jaccard_value><rdf:type><SWO:Jaccard’s index> 
    
 previously created    
    gene = 'NCBIGene:' + str(geneid)  
    region_label = gene + "_up"  + extent + "_@" + centroid  
    region = '_:' + digest_id(region_label)  
    
 will need   
    (extent)    gene1 region1    gene2 region2   jaccard  
    
 maybe also  
    
    (extent)    gene    dimotifsig  
    
    
    
    
    

In [20]:
dimotif_regions = {    
    '1k' : {'f1' : 'gene_pair_dimotif_jaccard_1k.tab'}, 
    '2k' : {'f1' : 'gene_pair_dimotif_jaccard_2k.tab'},
    '5k' : {'f1' : 'gene_pair_dimotif_jaccard_5k.tab'}
}

for extent in dimotif_regions:
    # print(extent)
    # print(regions[extent])
    for fname in dimotif_regions[extent]:
       # print(fname)
       # print(regions[extent][fname])    
        with open(dimotif_regions[extent][fname], 'r') as tabfile:
            filereader = csv.reader(tabfile, delimiter='\t')
            for row in filereader:    
                (gene1id, centroid1, gene2id, centroid2, jaccard) = row
                gene1 = 'NCBIGene:' + str(gene1id)
                region1_label = gene1 + "_up"  + extent + "_@" + centroid1
                gene2 = 'NCBIGene:' + str(gene2id)
                region2_label = gene2 + "_up"  + extent + "_@" + centroid2
                pairwise_label = region1_label + " & " + region2_label
                pairwise_id =  "_:" + digest_id(pairwise_label)
                # <BNODE:gene_jaccard_value><rdf:type><SWO:Jaccard’s index>
                write_triple(pairwise_id, 'rdf:type', resolve("SWO:Jaccard's index"))
                # <BNODE:gene_jaccard_value><rdfs:label><gene1_region gene2_region>
                write_triple(pairwise_id, 'rdfs:label', pairwise_label)
                # <BNODE:gene_jaccard_value><SWO:Similarity score>	<0.73>
                write_triple(pairwise_id,  resolve('SWO:Similarity score'), jaccard) 
                # <BNODE:gene1_upstream_region><RO:member of><BNODE:gene_jaccard_value>
                write_triple ("_:" + digest_id(region1_label),  resolve('RO:member of'), pairwise_id)     
                # <BNODE:gene2_upstream_region><RO:member of><BNODE:gene_jaccard_value>
                write_triple ("_:" + digest_id(region2_label),  resolve('RO:member of'), pairwise_id)
        print(len(triples))
        print(triples[len(triples)-1])
   
                
                

566639
<https://127.0.0.1/.well-known/genid/be5d2a55c4233584cb7d> <http://purl.obolibrary.org/obo/RO_0002350> <https://127.0.0.1/.well-known/genid/b4275d1c825047bba10b> .
749874
<https://127.0.0.1/.well-known/genid/b5c5785d54897c75a149> <http://purl.obolibrary.org/obo/RO_0002350> <https://127.0.0.1/.well-known/genid/b7ceb735bad75d8b2d19> .
1019589
<https://127.0.0.1/.well-known/genid/b2fe455e351721a178cb> <http://purl.obolibrary.org/obo/RO_0002350> <https://127.0.0.1/.well-known/genid/b74270061446a824056b> .


In [21]:
fh = open('jaspar.nt', 'w')
print('\n'.join(list(set(triples))), file=fh)
fh.close()

# a readable copy for blazegraph 
fh = open('/tmp/jaspar.nt', 'w')
print('\n'.join(list(set(triples))), file=fh)
os.chmod('/tmp/jaspar.nt', 644)
fh.close()