Turn the [Jaspar](http://jaspar.genereg.net) experiments at  https://github.com/TomConlin/Jaspar_FA 
into [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) to load into a [triplestore](https://en.wikipedia.org/wiki/Triplestore)  
This work is done for the Monarch's Translator project

In [1]:
import csv
import hashlib

The triples to generate may be found by extracting them from the  
graphical model ```jaspar_target_model.gv```, which encodes the relations  
in the [GraphViz](https://graphviz.org) [dot](https://graphviz.org/doc/info/lang.html) format


an image is generated with   
    ```dot -T png jaspar_target_model.gv > jaspar_target_model.png```

![Jaspar Target model](jaspar_target_model.png)

```sed 's|//.*||g' jaspar_target_model.gv|\
awk -F'"' '/.* -> .*/ {print "# <" $2 "><" $6 "><" $4 ">"}'```

A few convenience functions from Monarch's dipper

In [3]:
def digest_id(wordage):
    """Return a deterministic 20-character digest of ``wordage``.

    The input is UTF-8 encoded and hashed with SHA-1.  The first hex digit
    of the digest is swapped for a literal 'b' -- an experiment that forces
    the leading character to be non-numeric while still being valid hex.
    RDF does not require this, but it can help when the identifier is used
    in other contexts.
    """
    sha1_hex = hashlib.sha1(wordage.encode('utf-8')).hexdigest()
    # keep hex digits 1..19 so the result is 'b' plus 19 characters
    return ''.join(('b', sha1_hex[1:20]))

In [4]:
# stand-in until the CURIEs are expanded to full IRIs
def write_triple(sub, prd, obj):
    """Append the statement ``sub prd obj .`` to the global ``triples`` list.

    The three terms are written exactly as given, separated by single
    spaces and terminated with ' .'.
    """
    triples.append(' '.join([sub, prd, obj, '.']))


In [5]:
# Gene start regions have a centroid and contain a set of motifs.
# A set of motifs is taken as a signature,
# and one signature may be shared by different regions.
# Maps each upstream extent to the tab file(s) holding its rows.
regions = {
    extent: {'f1': 'gene_motifsetsig_' + extent + '.tab'}
    for extent in ('1k', '2k', '5k')
}

The triples to produce from these ```gene_motifsetsig_?k.tab``` files are:

    # <NCBIGene:123><SO:adjacent_to><BNODE:gene_upstream_region>
    # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
    # <BNODE:gene_upstream_region><GENO:has_extent><1000 (region extent bp)>
    # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
    # <BNODE:gene_upstream_region><rdf:comment><Candidate SO:TF_binding_sites>
    # <BNODE:motif_set><rdfs:label><motif_set_sig>
    # <BNODE:motif_set><rdf:type><SIO:collection>


In [6]:
# Accumulator for every generated statement; write_triple() appends to it.
triples = []

# For each upstream extent, read its tab file(s) and emit the triples that
# describe the gene's upstream region and the motif set found in it.
for extent in regions:
    for fname in regions[extent]:
        with open(regions[extent][fname], 'r') as tabfile:
            filereader = csv.reader(tabfile, delimiter='\t')
            for row in filereader:
                # four tab-separated columns are expected; the last one is
                # unpacked only to validate the row width and is not used here
                (geneid, centroid, motifsetsig, _count) = row
                gene = 'NCBIGene:' + str(geneid)
                region_label = gene + "_up" + extent + "_@" + centroid
                # blank node ids are derived from the labels, so the same
                # region / motif set always maps to the same identifier
                region = '_:' + digest_id(region_label)
                motifset = '_:' + digest_id(motifsetsig)

                # using the triple templates extracted from the GraphViz target model

                # <NCBIGene_123><SO:adjacent_to><BNODE:gene_upstream_region>
                # (predicate spelling corrected from 'SO:adjecent_to' to match the template)
                write_triple(gene, 'SO:adjacent_to', region)
                # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
                write_triple(region, 'rdfs:label', region_label)
                # <BNODE:gene_upstream_region><GENO:has_extent><1000>
                write_triple(region, 'GENO:has_extent', extent)
                # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
                write_triple(region, 'rdf:type', 'SO:five_prime_flanking_region')
                # <BNODE:motif_set><rdfs:label><motif_set_sig>
                write_triple(motifset, 'rdfs:label', motifsetsig)
                # <BNODE:motif_set><rdf:type><SIO:collection>
                write_triple(motifset, 'rdf:type', 'SIO:collection')

In [7]:
# number of triples accumulated so far (shown as the cell's result)
len(triples)

278862

In [8]:
# spot check: the second triple and the most recently appended one
print(triples[1])
print(triples[-1])

_:be3bb7a7360e7ee5a2f9 rdfs:label NCBIGene:10002_up5k_@+72100387 .
_:bab65abf77bc97165a69 rdf:type SIO:collection .


Splitting the size of the motif set out because it is independent of the gene start regions

    # <BNODE:motif_set><rdf:value><3>
    
in `motifsetsig_count.tab`

In [9]:
# each row pairs a motif set signature with the value recorded for it
with open('motifsetsig_count.tab', 'r') as tabfile:
    for motifsetsig, count in csv.reader(tabfile, delimiter='\t'):
        # <BNODE:motif_set><rdf:value><3>
        write_triple('_:' + digest_id(motifsetsig), 'rdf:value', count)

In [10]:
# running total and the triple appended last
print(len(triples))
print(triples[-1])

284861
_:ba4b9237bacccdf19c07 rdf:value 1 .


    # <BNODE:motif_set><OIO:subset><BNODE:motif_set>

comes from `motifsetsig_subset.tab`

In [11]:
# each row pairs a motif set signature with the signature of one of its subsets
with open('motifsetsig_subset.tab', 'r') as tabfile:
    for motifsetsig, subsetsig in csv.reader(tabfile, delimiter='\t'):
        superset_node = '_:' + digest_id(motifsetsig)
        subset_node = '_:' + digest_id(subsetsig)
        # <BNODE:motif_set><OIO:subset><BNODE:motif_set>
        write_triple(superset_node, 'OIO:subset', subset_node)

In [12]:
# running total and the triple appended last
print(len(triples))
print(triples[-1])

295223
_:b1d5781111d84f7b3fe4 OIO:subset _:be5dbbcea5ce7e2988b8 .


    # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
    # <BNODE:motif_set><RO:has member><JASPAR:motif> 
    
will just do the first with data from `motifsetsig_motif.tab`

In [13]:
# each row pairs a motif set signature with one JASPAR motif identifier
with open('motifsetsig_motif.tab', 'r') as tabfile:
    for motifsetsig, motif in csv.reader(tabfile, delimiter='\t'):
        set_node = '_:' + digest_id(motifsetsig)
        # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
        write_triple(set_node, 'OIO:hasdbxref', "JASPAR:" + motif)

In [14]:
# running total and the triple appended last
print(len(triples))
print(triples[-1])

344019
_:b093da02f1d652201da3 OIO:hasdbxref JASPAR:MA0940.1 .


    # <BNODE:gene_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    # <BNODE:gene2_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    
    # <BNODE:gene_jaccard_value><rdfs:label><gene1_region gene2_region>
    # <BNODE:gene_jaccard_value><SWO:Similarity score>	<0.73>
    # <BNODE:gene_jaccard_value><rdf:type><SWO:Jaccard’s index> 
    
 previously created    
    gene = 'NCBIGene:' + str(geneid)  
    region_label = gene + "_up"  + extent + "_@" + centroid  
    region = '_:' + digest_id(region_label)  
    
 will need   
    (extent)    gene1 region1    gene2 region2   jaccard  
    
 maybe also  
    
    (extent)    gene    dimotifsig  
    
    
    
    
    

In [15]:
# Maps each upstream extent to the tab file(s) of pairwise Jaccard rows.
dimotif_regions = {
    extent: {'f1': 'gene_pair_dimotif_jaccard_' + extent + '.tab'}
    for extent in ('1k', '2k', '5k')
}

for extent, sources in dimotif_regions.items():
    for path in sources.values():
        with open(path, 'r') as tabfile:
            pair_rows = csv.reader(tabfile, delimiter='\t')
            for gene1id, centroid1, gene2id, centroid2, jaccard in pair_rows:
                # rebuild the region labels exactly as they were built when
                # the regions were created, so the digests match
                region_labels = [
                    'NCBIGene:' + str(geneid) + "_up" + extent + "_@" + centroid
                    for geneid, centroid in (
                        (gene1id, centroid1),
                        (gene2id, centroid2),
                    )
                ]
                pairwise_label = " & ".join(region_labels)
                pairwise_id = "_:" + digest_id(pairwise_label)

                # <BNODE:gene_jaccard_value><rdf:type><SWO:Jaccard’s index>
                write_triple(pairwise_id, 'rdf:type', "SWO:Jaccard’s index")
                # <BNODE:gene_jaccard_value><rdfs:label><gene1_region gene2_region>
                write_triple(pairwise_id, 'rdfs:label', pairwise_label)
                # <BNODE:gene_jaccard_value><SWO:Similarity score>	<0.73>
                write_triple(pairwise_id, 'SWO:Similarity score', jaccard)
                # <BNODE:gene1_upstream_region><RO:member of><BNODE:gene_jaccard_value>
                # <BNODE:gene2_upstream_region><RO:member of><BNODE:gene_jaccard_value>
                for region_label in region_labels:
                    write_triple("_:" + digest_id(region_label), 'RO:member of', pairwise_id)
        # progress report once each file has been consumed
        print(len(triples))
        print(triples[-1])
   
                
                

527254
_:b5c5785d54897c75a149 RO:member of _:b7ceb735bad75d8b2d19 .
749874
_:be5d2a55c4233584cb7d RO:member of _:b4275d1c825047bba10b .
1019589
_:b2fe455e351721a178cb RO:member of _:b74270061446a824056b .


In [16]:
# Write the distinct triples, one per line, to the output file.
# - the context manager closes the file even if writing fails
# - UTF-8 is requested explicitly because some terms contain non-ASCII
#   characters (e.g. the typographic apostrophe in "SWO:Jaccard’s index")
# - sorting makes the file reproducible from run to run; plain set
#   iteration order is arbitrary
with open('jaspar_curies.nt', 'w', encoding='utf-8') as fh:
    print('\n'.join(sorted(set(triples))), file=fh)