In [1]:
import pandas as pd
from pathlib import Path
from metapaths.tools.processing import regularize_colnames, head, expand_col_on_char, combine_group_rows_on_char
from hetnet_ml.src import graph_tools as gt

load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
network_dir = load_dir.parent.parent.joinpath('01_Building_a_network_from_CTD/out')

In [2]:
all_nodes = gt.remove_colons(pd.read_csv(network_dir.joinpath('nodes_all.csv'), dtype=str))
edges = gt.remove_colons(pd.read_csv(network_dir.joinpath('edges.csv'), dtype=str))

In [3]:
abv, et = gt.get_abbrev_dict_and_edge_tuples(all_nodes, edges)

In [4]:
go_ids = set(all_nodes[all_nodes['id'].str.startswith('GO:')]['id'])

In [5]:
new_nodes = []
new_edges = []

## Start loading .gaf files.

In [6]:
go_cols = ['db', 'db_object_id', 'db_object_symbol','qualifier', 'go_id', 'db_reference', 'evidence_code',
 'with_or_from', 'aspect', 'db_object_name', 'db_object_synonym', 'db_object_type', 'taxon', 'date', 'assigned_by',
 'annotation_extension', 'gene_product_form_id']

In [7]:
go_prot = pd.read_csv(load_dir.joinpath('goa_human.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, comment='!', dtype=str)
go_prot.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0003924,GO_REF:0000002,IEA,InterPro:IPR001770,F,Guanine nucleotide-binding protein subunit gamma,DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20190504,InterPro,,
1,UniProtKB,A0A024RBG1,NUDT4B,,GO:0003723,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20190504,UniProt,,


In [8]:
go_prot['db'].value_counts()

UniProtKB    479438
Name: db, dtype: int64

In [9]:
go_complex = pd.read_csv(load_dir.joinpath('goa_human_complex.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_complex.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0030155,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
1,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0030334,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,


In [10]:
go_complex.query('aspect == "C"').head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
2,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0062023,PMID:19884327,IDA,,C,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
13,ComplexPortal,CPX-1032,snurportin_human,,GO:0005654,PMID:9670026,IDA,,C,"Importin complex, Snurportin variant","Snurportin complex|Importin complex, SNUPN var...",protein_complex,taxon:9606,20170608,ComplexPortal,,


In [11]:
go_complex['db'].value_counts()

ComplexPortal    1470
Name: db, dtype: int64

In [12]:
go_isoform = pd.read_csv(load_dir.joinpath('goa_human_isoform.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_isoform.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,UniProtKB,A0A087WTH5,KCNE1B,,GO:0005249,GO_REF:0000002,IEA,InterPro:IPR000369|InterPro:IPR005424,F,Potassium voltage-gated channel subfamily E me...,KCNE1B,protein,taxon:9606,20190504,InterPro,,UniProtKB:A0A087WU88
1,UniProtKB,A0A087WTH5,KCNE1B,,GO:0005249,GO_REF:0000002,IEA,InterPro:IPR000369|InterPro:IPR005424,F,Potassium voltage-gated channel subfamily E me...,KCNE1B,protein,taxon:9606,20190504,InterPro,,UniProtKB:A0A087WWU3


In [13]:
go_isoform['db'].value_counts()

UniProtKB    101596
Name: db, dtype: int64

In [14]:
go_rna = pd.read_csv(load_dir.joinpath('goa_human_rna.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_rna.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0006412,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0030533,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,


In [15]:
go_rna['db'].value_counts()

RNAcentral    43498
Name: db, dtype: int64

## Add GO to protein annotiaions....

Will also be able to add complex and rna info

In [16]:
type_conversion = {'P': 'involved_in_GinBP',
                   'F': 'enables_GeMF',
                   'C': 'part_of_GpoCC'}

edges_go_prot = go_prot.rename(columns={'db_object_symbol': 'start_id', 'go_id': 'end_id'})
edges_go_prot['type'] = edges_go_prot['aspect'].map(type_conversion)

In [17]:
def fix_col_order_edge(df):
    cols = ['start_id', 'end_id', 'type']
    cols = cols + [c for c in df.columns if c not in cols]
    return cols


In [18]:
edges_go_prot = edges_go_prot[fix_col_order_edge(edges_go_prot)]
edges_go_prot.head(2)

Unnamed: 0,start_id,end_id,type,db,db_object_id,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,DNAJC25-GNG10,GO:0003924,enables_GeMF,UniProtKB,A0A024R161,,GO_REF:0000002,IEA,InterPro:IPR001770,F,Guanine nucleotide-binding protein subunit gamma,DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20190504,InterPro,,
1,NUDT4B,GO:0003723,enables_GeMF,UniProtKB,A0A024RBG1,,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20190504,UniProt,,


Remove relations that have qualifiers that contain NOT

In [19]:
edges_go_prot = edges_go_prot[~edges_go_prot['qualifier'].str.contains('NOT').fillna(False)]

In [20]:
# if 0 then no Pipe Characters in symnbol, so all only have 1 associated symbol
print('GeneIDs with more than one symbol in same row: {:,}'.format(all_nodes['gene_symbol']
                                                                   .str
                                                                   .contains('|', regex=False)
                                                                   .sum()))
# if 0 then all GeneID to Symbol mappings are 1 to 1
print('Gene Symbols mapped to multiple GeneIDs: {:,}'.format((all_nodes.dropna(subset=['gene_symbol'])
                                                                       .groupby('gene_symbol')['id']
                                                                       .nunique() > 1).sum()))

GeneIDs with more than one symbol in same row: 0
Gene Symbols mapped to multiple GeneIDs: 0


In [21]:
gene_sym_to_id = all_nodes.set_index('gene_symbol')['id'].dropna().to_dict()
edges_go_prot['start_id'] = edges_go_prot['start_id'].map(gene_sym_to_id)

In [22]:
print('GO Gene annotations')
print('Total:   {:10,}'.format(len(edges_go_prot)))
print('Mapped:  {:10,}'.format(edges_go_prot.query('end_id in @go_ids')['start_id'].count()))
print('Unmapped:{:10,}'.format(len(edges_go_prot) - edges_go_prot.query('end_id in @go_ids')['start_id'].count()))

GO Gene annotations
Total:      478,227
Mapped:     475,214
Unmapped:     3,013


In [23]:
edges_go_prot['db_object_id'].nunique()

19834

In [24]:
edges_go_prot['start_id'].nunique()

19095

There are fewer Gene IDs than Protein IDs, so there may be 1 to many relationships going on...

In [25]:
print('Number of UniProt IDs that map to more than one Gene ID: {:,}'.format(
    (edges_go_prot[['start_id', 'db_object_id']].groupby('db_object_id')['start_id'].nunique() > 1).sum()))
print('Number of Gene IDs that map to more than 1 UniProt ID: {:,}'.format(
    (edges_go_prot[['start_id', 'db_object_id']].groupby('start_id')['db_object_id'].nunique() > 1).sum()))

Number of UniProt IDs that map to more than one Gene ID: 0
Number of Gene IDs that map to more than 1 UniProt ID: 51


51 Genes that map to multiple Uniprots... it should be fine to map them to all the edges for all the correspoding uniprots

In [26]:
# we will do a pipe, separated mapping and add to the node Infos.
gene_to_uniprot = combine_group_rows_on_char(edges_go_prot[['start_id', 'db_object_id']], 'start_id', 
                                             ['db_object_id'], '|')
gene_to_uniprot = gene_to_uniprot.set_index('start_id')['db_object_id'].to_dict()

In [27]:
all_nodes['uniprot_id'] = all_nodes['id'].map(gene_to_uniprot)
all_nodes.dropna(subset=['uniprot_id']).head(10)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id
199609,1,alpha-1-B glycoprotein,Gene,,,,A1BG,100064369|100354232|100400383|100438958|100482...,106523,PA24356,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...,P04217
199611,29974,APOBEC1 complementation factor,Gene,,,,A1CF,100013290|100071844|100155074|100174508|100231...,119004,PA162375098,A0A024QZI9|A0A024QZJ5|A0A024QZM7|A0A091CWX6|A0...,Q9NQ94
199618,2,alpha-2-macroglobulin,Gene,,,,A2M,100061692|100173946|100390764|100543551|100657...,106524|231245|246347|679499,PA24357,A0A1U7TC46|A0A2K5E7U5|A0A2K5KBI2|A0A2K6N6A9|A0...,P01023
199622,144568,alpha-2-macroglobulin like 1,Gene,,,,A2ML1,100061421|100127688|100152492|100347314|100407...,126860,PA142670460,A0A0D9RAZ7|A0A1S3AMH8|A0A1U7S0T5|A0A1U7SWH3|A0...,A8K2U0
200727,127550,"alpha 1,3-galactosyltransferase 2",Gene,,,,A3GALT2,100339429|100414894|100431438|100479995|100586...,,PA142670461,A0A087Y8Z2|A0A1S2ZRA6|A0A1S3FHK7|A0A1U7R263|A0...,U3KPV4
201201,53947,"alpha 1,4-galactosyltransferase (P blood group)",Gene,,,,A4GALT,100017110|100090950|100172807|100347783|100399...,119825,PA143485570|PA24359,A0A096N8Q2|A0A0D9SD56|A0A0S2Z5J1|A0A1S2ZRQ3|A0...,Q9NPC4
201203,51146,"alpha-1,4-N-acetylglucosaminyltransferase",Gene,,,,A4GNT,100024221|100033872|100091228|100224446|100351...,119330,PA134960042,A0A091DKX1|A0A096MVI4|A0A0D9REY6|A0A1S3APW6|A0...,Q9UNA3
204630,8086,aladin WD repeat nucleoporin,Gene,,,,AAAS,100063811|100154333|100218436|100356621|100405...,113759|230214,PA24361,A0A087XX44|A0A091CTK4|A0A096NFX9|A0A0P7XRK7|A0...,Q9NRG9
204635,65985,acetoacetyl-CoA synthetase,Gene,,,,AACS,100020461|100061661|100076424|100156545|100231...,122434|219689,PA134940696,A0A024RBV2|A0A087XT43|A0A0D9S5D0|A0A0F8CR28|A0...,Q86V21
204640,13,arylacetamide deacetylase,Gene,,,,AADAC,100056411|100148912|100217836|100393374|100468...,,PA24363,A0A087QIL4|A0A091QEJ0|A0A091SA32|A0A093J0F4|A0...,P22760


In [28]:
# Add all mapped edges
new_edges.append(edges_go_prot.dropna(subset=['start_id', 'end_id']))

### Complex to GO

In [29]:
type_conversion = {k: v.replace('G', 'X') for k, v in type_conversion.items()}

edges_go_cpx = go_complex.rename(columns={'db_object_id': 'start_id', 'go_id': 'end_id'})
edges_go_cpx['type'] = edges_go_cpx['aspect'].map(type_conversion)
edges_go_cpx = edges_go_cpx[fix_col_order_edge(edges_go_cpx)]
edges_go_cpx.head()

Unnamed: 0,start_id,end_id,type,db,db_object_symbol,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,CPX-1012,GO:0030155,involved_in_XinBP,ComplexPortal,tenascin-w_human,,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
1,CPX-1012,GO:0030334,involved_in_XinBP,ComplexPortal,tenascin-w_human,,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
2,CPX-1012,GO:0062023,part_of_XpoCC,ComplexPortal,tenascin-w_human,,PMID:19884327,IDA,,C,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
3,CPX-1012,GO:1903672,involved_in_XinBP,ComplexPortal,tenascin-w_human,,PMID:19884327,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
4,CPX-1014,GO:0007160,involved_in_XinBP,ComplexPortal,tenascin-x_human,,GO_REF:0000108,IEA,GO:0098639,P,Tenascin-X complex,TNXB trimer|Tenascin X complex|TNX complex|TN-...,protein_complex,taxon:9606,20190504,GOC,,


In [30]:
edges_go_cpx = edges_go_cpx[~edges_go_cpx['qualifier'].str.contains('NOT').fillna(False)]

In [31]:
# Add all mapped edges
new_edges.append(edges_go_cpx.dropna(subset=['start_id', 'end_id']))

#### Need more to anchor complexes into the network... Gene to Complex, and Complex Identifiers...

In [32]:
cplx = pd.read_csv(load_dir.joinpath('homo_sapiens.tsv'), sep='\t')
cplx.columns = regularize_colnames(cplx.columns)
cplx.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,neural progenitor-specific BAF ATP-dependent c...,9606,P51531(0)|O14497(0)|Q969G3(0)|Q6STE5(0)|Q8WUB8...,ECO:0005547(biological system reconstruction e...,-,GO:2000045(regulation of G1/S transition of mi...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [33]:
cpx_nodes = cplx.rename(columns={'complex_ac': 'id', 'recommended_name': 'name'})
cpx_nodes['label'] = 'Protein Complex'
cpx_nodes[['id', 'name', 'label']].head()

Unnamed: 0,id,name,label
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Protein Complex
1,CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,Protein Complex
2,CPX-1194,Muscle cell-specific SWI/SNF ATP-dependent chr...,Protein Complex
3,CPX-1282,Laminin211-nidogen complex,Protein Complex
4,CPX-1285,Laminin221-nidogen complex,Protein Complex


In [34]:
new_nodes.append(cpx_nodes[['id', 'name', 'label']])

In [35]:
cpx_exp = expand_col_on_char(cplx, 'identifiers_and_stoichiometry_of_molecules_in_complex', '|')
cpx_exp.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0),ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q68CP9(0),ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [36]:
cpx_exp['uniprot_id'] = cpx_exp['identifiers_and_stoichiometry_of_molecules_in_complex'].str.split('(', expand=True)[0]

In [37]:
# Uniprot to gene is one to many, so mapping this way should be fine
uniprot_to_geneid_a = expand_col_on_char(all_nodes[['id', 'uni_prot_ids']].dropna(), 'uni_prot_ids', '|')
uniprot_to_geneid_a = uniprot_to_geneid_a.set_index('uni_prot_ids')['id'].to_dict()

uniprot_to_geneid_b = expand_col_on_char(all_nodes[['id', 'uniprot_id']].dropna(), 'uniprot_id', '|')
uniprot_to_geneid_b = uniprot_to_geneid_b.set_index('uniprot_id')['id'].to_dict()

uniprot_to_geneid = {**uniprot_to_geneid_a, **uniprot_to_geneid_b}

In [38]:
edges_cpx_gene = cpx_exp[['complex_ac', 'uniprot_id']].rename(columns={'complex_ac': 'end_id'})
edges_cpx_gene['start_id'] = edges_cpx_gene['uniprot_id'].map(uniprot_to_geneid)
edges_cpx_gene['type'] = 'part_of_GpoX'
edges_cpx_gene[['start_id', 'end_id', 'type']].head()

Unnamed: 0,start_id,end_id,type
0,55193,CPX-1196,part_of_GpoX
1,196528,CPX-1196,part_of_GpoX
2,55274,CPX-1196,part_of_GpoX
3,60,CPX-1196,part_of_GpoX
4,51412,CPX-1196,part_of_GpoX


In [39]:
new_edges.append(edges_cpx_gene[['start_id', 'end_id', 'type']].dropna())

### RNA to GO

In [40]:
go_rna.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0006412,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0030533,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,


In [41]:
type_conversion = {k: v.replace('X', 'N') for k, v in type_conversion.items()}
edges_go_rna = go_rna.rename(columns={'db_object_id': 'start_id', 'go_id': 'end_id'})
edges_go_rna['type'] = edges_go_rna['aspect'].map(type_conversion)
edges_go_rna = edges_go_rna[fix_col_order_edge(edges_go_rna)]
edges_go_rna.head()

Unnamed: 0,start_id,end_id,type,db,db_object_symbol,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,URS0000001346_9606,GO:0006412,involved_in_NinBP,RNAcentral,URS0000001346_9606,,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,URS0000001346_9606,GO:0030533,enables_NeMF,RNAcentral,URS0000001346_9606,,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,
2,URS000000192A_9606,GO:0035068,part_of_NpoCC,RNAcentral,URS000000192A_9606,,GO_REF:0000115,IEA,Rfam:RF00951,C,Homo sapiens (human) MIR1302-2 host gene (MIR1...,,lnc_RNA,taxon:9606,20190504,RNAcentral,,
3,URS000000192A_9606,GO:0035195,involved_in_NinBP,RNAcentral,URS000000192A_9606,,GO_REF:0000115,IEA,Rfam:RF00951,P,Homo sapiens (human) MIR1302-2 host gene (MIR1...,,lnc_RNA,taxon:9606,20190504,RNAcentral,,
4,URS00000019BC_9606,GO:0000244,involved_in_NinBP,RNAcentral,URS00000019BC_9606,,GO_REF:0000115,IEA,Rfam:RF00026,P,Homo sapiens (human) snRNA-U6-related,,snRNA,taxon:9606,20190504,RNAcentral,,


In [42]:
edges_go_rna = edges_go_rna[~edges_go_rna['qualifier'].str.contains('NOT').fillna(False)]

In [43]:
new_edges.append(edges_go_rna)

#### Need some more info to add RNA as node type, specifically names

Some other DBs with interesting MicroRNA related edges use the name as an identifier rather than an RNA Central ID

In [44]:
rna_colnames = ['db', 'db_object_id', 'db_object_symbol', 'db_object_name', 'db_object_synonyms', 'db_object_type',
 'taxon', 'parent_object_id', 'db_xrefs', 'properties']

rna_c = pd.read_csv(load_dir.joinpath('rnacentral.gpi.gz'), header=None, names=rna_colnames, 
                    dtype=str, comment='!', sep='\t')
rna_c.head(10)

Unnamed: 0,db,db_object_id,db_object_symbol,db_object_name,db_object_synonyms,db_object_type,taxon,parent_object_id,db_xrefs,properties
0,RNAcentral,URS00006753F8_4081,,Solanum lycopersicum (tomato) tRNA-Tyr for ant...,,tRNA,taxon:4081,,,
1,RNAcentral,URS0000675402_7159,,Aedes aegypti tRNA,,tRNA,taxon:7159,,,
2,RNAcentral,URS0000675413_6945,,Ixodes scapularis tRNA,,tRNA,taxon:6945,,,
3,RNAcentral,URS0000675414_59463,,Myotis lucifugus (little brown bat) snRNA U6 s...,,snRNA,taxon:59463,,,
4,RNAcentral,URS000067541A_60711,,Chlorocebus sabaeus Small nucleolar RNA U13,,snoRNA,taxon:60711,,,
5,RNAcentral,URS000067541A_9544,,Macaca mulatta Small nucleolar RNA U13,,snoRNA,taxon:9544,,,
6,RNAcentral,URS000067541C_4558,,Sorghum bicolor Plant small nucleolar RNA R71,,snoRNA,taxon:4558,,,
7,RNAcentral,URS000067541F_1696176,,Pelagibacteraceae bacterium GOM-A4 bablM sRNA,,ncRNA,taxon:1696176,,,
8,RNAcentral,URS0000675420_15368,,Brachypodium distachyon microRNA MIR1122,,primary_transcript,taxon:15368,,,
9,RNAcentral,URS0000675421_9483,,Callithrix jacchus (white-tufted-ear marmoset)...,,snRNA,taxon:9483,,,


In [45]:
len(rna_c)

14483979

Human microRNA names start with `hsa-`.  However, this DB seems to have a lot of other text in the name than the smile `hsa-` value

In [46]:
hsa_lines = rna_c['db_object_name'].str.contains('hsa-')
rna_c['rna_name'] = rna_c[hsa_lines]['db_object_name'].apply(lambda s: s[s.index('hsa-'):])

In [47]:
rna_c.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,db_object_name,db_object_synonyms,db_object_type,taxon,parent_object_id,db_xrefs,properties,rna_name
0,RNAcentral,URS00006753F8_4081,,Solanum lycopersicum (tomato) tRNA-Tyr for ant...,,tRNA,taxon:4081,,,,
1,RNAcentral,URS0000675402_7159,,Aedes aegypti tRNA,,tRNA,taxon:7159,,,,


In [48]:
rna_nodes = rna_c.dropna(subset=['rna_name'])
rna_nodes = rna_nodes.rename(columns={'rna_name': 'name', 'db_object_id': 'id'})
rna_nodes['label'] = 'Micro RNA'
rna_nodes[['id', 'name', 'label']].head()

Unnamed: 0,id,name,label
2548,URS0000676F0F_9606,hsa-mir-891b precursor,Micro RNA
4431,URS0000678203_9606,hsa-mir-633 precursor,Micro RNA
8879,URS000067B227_9606,hsa-mir-644a precursor,Micro RNA
13287,URS000067E0B9_9606,hsa-mir-920 precursor,Micro RNA
13806,URS000067E604_9606,hsa-mir-422a precursor,Micro RNA


In [49]:
new_nodes.append(rna_nodes[['id', 'name', 'label']])

### Now get some interesting RNA to Gene Endges

In [50]:
rna_names = rna_c[hsa_lines][~rna_c[hsa_lines]['rna_name'].str.contains('precursor')]['rna_name'].values

In [51]:
mti = pd.read_excel(load_dir.joinpath('hsa_MTI.xlsx'))
mti.head(2)

Unnamed: 0,miRTarBase ID,miRNA,Species (miRNA),Target Gene,Target Gene (Entrez Gene ID),Species (Target Gene),Experiments,Support Type,References (PMID)
0,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


In [52]:
mti.columns = regularize_colnames(mti.columns)
mti.head(2)

Unnamed: 0,mirtarbase_id,mirna,species_mirna,target_gene,target_gene_entrez_gene_id,species_target_gene,experiments,support_type,references_pmid
0,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


In [53]:
gene_ids = all_nodes.query('label == "Gene"')['id']
print('MicroRNA Gene Annotations mappable to RNACentral')
print('  Total:    {:9,}'.format(len(mti)))
print('  Mapped:   {:9,}'.format(len(mti.query('mirna in @rna_names'))))
print('  Unmapped: {:9,}'.format(
    len(mti) - len(mti.query('mirna in @rna_names and target_gene_entrez_gene_id in @gene_ids'))))


MicroRNA Gene Annotations mappable to RNACentral
  Total:      502,652
  Mapped:     493,614
  Unmapped:     9,125


In [54]:
rna_map = rna_nodes.set_index('name')['id'].to_dict()

In [55]:
edges_rna_gene = mti.query('mirna in @rna_names and target_gene_entrez_gene_id in @gene_ids').reset_index(drop=True)
edges_rna_gene = edges_rna_gene.rename(columns={'target_gene_entrez_gene_id': 'end_id'}) 
edges_rna_gene['start_id'] = edges_rna_gene['mirna'].map(rna_map)
edges_rna_gene['end_id'] = edges_rna_gene['end_id'].astype(str)
edges_rna_gene['type'] = 'regulates_NrG'
edges_rna_gene = edges_rna_gene[fix_col_order_edge(edges_rna_gene)]
edges_rna_gene['references_pmid'] = edges_rna_gene['references_pmid'].astype(str)
edges_rna_gene.head(2)

Unnamed: 0,start_id,end_id,type,mirtarbase_id,mirna,species_mirna,target_gene,species_target_gene,experiments,support_type,references_pmid
0,URS0000574A2C_9606,3091,regulates_NrG,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,URS0000574A2C_9606,3091,regulates_NrG,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


In [56]:
new_edges.append(edges_rna_gene[['start_id', 'end_id', 'type', 'experiments', 'support_type', 'references_pmid']])

# Putting it together

In [57]:
new_nodes_df = pd.concat(new_nodes, sort=False)

In [58]:
new_nodes_df.head(2)

Unnamed: 0,id,name,label
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Protein Complex
1,CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,Protein Complex


In [59]:
new_edges_df = pd.concat(new_edges, sort=False)

In [60]:
keep_cols = ['start_id', 'end_id', 'type', 'qualifier',
 'db_reference', 'evidence_code', 'with_or_from', 'date',
 'assigned_by',  'experiments', 'support_type', 'references_pmid']

In [61]:
new_edges_df[keep_cols].head(2)

Unnamed: 0,start_id,end_id,type,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type,references_pmid
0,552891,GO:0003924,enables_GeMF,,GO_REF:0000002,IEA,InterPro:IPR001770,20190504,InterPro,,,
1,440672,GO:0003723,enables_GeMF,,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,20190504,UniProt,,,


In [63]:
print('Total number of new edges: {:,}'.format(len(new_edges_df)))
print('Number of unique new edges: {:,}'.format(len(new_edges_df.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

Total number of new edges: 1,015,886
Number of unique new edges: 697,401


In [64]:
%%time
# Need to combine those that have multiple of whatever... 
new_edges_df = combine_group_rows_on_char(new_edges_df, ['start_id', 'end_id', 'type'], ['evidence_code', 'references_pmid'])

CPU times: user 3min 11s, sys: 733 ms, total: 3min 12s
Wall time: 3min 9s


In [66]:
new_edges_df = new_edges_df[keep_cols]
new_edges_df['abbv'] = new_edges_df['type'].apply(lambda s: s.split('_')[-1])
new_edges_df = new_edges_df.rename(columns={'references_pmid': 'pub_med_ids'})

In [67]:
new_edges_df.head(2)

Unnamed: 0,start_id,end_id,type,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type,pub_med_ids,abbv
0,1,GO:0002576,involved_in_GinBP,,Reactome:R-HSA-114608,TAS,,20181122,Reactome,,,,GinBP
1,1,GO:0003674,enables_GeMF,,GO_REF:0000015,ND,,20070222,UniProt,,,,GeMF


In [68]:
all_nodes_out = pd.concat([all_nodes, new_nodes_df], sort=False)
all_nodes_out.head(2)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id
0,MESH:C089250,(0.017ferrocene)amylose,Compound,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,,,,,,,
1,MESH:C114385,001-C8-NBD,Compound,D03.383.129.462.580/C114385|D12.644.456/C114385,,,,,,,,


In [69]:
len(all_nodes_out)

753050

In [70]:
all_edges_out = pd.concat([edges, new_edges_df], sort=False)
all_edges_out.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,direct_evidence,corrected_pvalue,inference_gene_symbol,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,,,,,,,,,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,,,,,,,,,,,


In [71]:
node_ids = all_nodes_out['id'].unique()

In [72]:
len(all_nodes_out), all_nodes_out['id'].nunique()

(753050, 753050)

In [73]:
# Double check that all of the edges have a startID and End ID contained in the Nodes...
print(len(all_edges_out))
all_edges_out_filt = all_edges_out.query('start_id in @node_ids and end_id in @node_ids')
print(len(all_edges_out_filt))

6052217
6013920


In [74]:
# Lests see how many of what kinds of edges had to be dropped...
all_edges_out.query('start_id not in @node_ids or end_id not in @node_ids')['type'].value_counts()

part_of_NpoCC        16320
involved_in_NinBP    13177
enables_NeMF          8799
involved_in_GinBP        1
Name: type, dtype: int64

These are all RNAs that could not be mapped properly to a name... they also have very minimal information when accessed on RNACentral, and would increase the number of miRNA nodes by a factor of 5.

In [75]:
all_edge_ids = all_edges_out_filt[['start_id', 'end_id']].stack().unique()

In [76]:
filt_nodes_out = all_nodes_out.query('id in @all_edge_ids')
print(len(all_nodes_out))
print(len(filt_nodes_out))

753050
97006


In [77]:
filt_nodes_out['label'].value_counts()

Gene                  46777
Compound              16347
Biological Process    14391
Disease                7201
Molecular Function     4384
Micro RNA              3064
Pathway                2363
Cellular Component     1835
Protein Complex         644
Name: label, dtype: int64

In [78]:
all_edges_out_filt['type'].value_counts()

associated_with_BPawD        1798362
associated_with_CawPW        1215923
associated_with_DawPW         563169
increases_expression_CieG     479825
decreases_expression_CdeG     410983
regulates_NrG                 373942
associated_with_MFawD         199530
affects_expression_CaeG       137093
involved_in_GinBP             135869
part_of_GpoPW                 135809
associated_with_CCawD         126497
part_of_GpoCC                  78258
enables_GeMF                   64281
marker_or_mechanism_CmD        62912
increases_phenotype_CipBP      35984
therapeutic_CtD                34573
marker_or_mechanism_GmD        29342
increases_activity_CiaG        29001
decreases_reaction_CdrBP       26730
decreases_activity_CdaG        21926
decreases_phenotype_CdpBP      16993
affects_phenotype_CapBP        11221
increases_reaction_CirBP        6694
affects_reaction_CarBP          2419
affects_activity_CaaG           2382
involved_in_NinBP               2313
part_of_GpoX                    2047
t

# Save

In [79]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
// hack to get the filename for this notebook

<IPython.core.display.Javascript object>

In [80]:
out_dir = Path('../2_pipeline/').joinpath(nb_name.split('.')[0]).joinpath('out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [81]:
gt.add_colons(new_nodes_df, id_name='identifier').to_csv(out_dir.joinpath('new_nodes.csv'), index=False)
gt.add_colons(new_edges_df).to_csv(out_dir.joinpath('new_edges.csv'), index=False)

In [82]:
gt.add_colons(all_nodes_out, id_name='identifier').to_csv(out_dir.joinpath('nodes_all.csv'), index=False)
gt.add_colons(filt_nodes_out, id_name='identifier').to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

gt.add_colons(all_edges_out_filt).to_csv(out_dir.joinpath('edges.csv'), index=False)

![Metagraph](../2_pipeline/03_Adding_GO_Annotations/out/CTD_GO_MTIR_metagraph.png)