# Adding Enhancers to AnnoQ

## Loading the data as is and formatting it
Add and rename column headers, and set indexes for lookups and table joins.

In [None]:
# IPython line magic: list the contents of the current working directory.
%ls

In [1]:
import pandas as pd
import json

# All inputs are tab-separated files under ../resources/test/.

# Enhancer coordinates: column names are supplied here and rows are keyed by
# the "enhancer" column.
enhs = pd.read_csv(
    "../resources/test/CREbedDBenhancers_10092018",
    sep="\t",
    names=["chr", "start", "end", "enhancer"],
    index_col="enhancer",
)

# Enhancer-to-gene links: column names come from the file; the "enhancer"
# column is forced to string.
enh_gene_links = pd.read_csv(
    "../resources/test/enh_gene_link_tissue_pval_snp_hg19-sample",
    sep="\t",
    dtype={"enhancer": str},
)

# Source of each enhancer, keyed by "enhancer".
enh_source = pd.read_csv(
    "../resources/test/PEREGRINEenhancersources",
    sep="\t",
    names=["enhancer", "source"],
    index_col="enhancer",
)

# Tissue lookup table, read with whatever column names the file provides.
tissues = pd.read_csv("../resources/test/tissues.txt", sep="\t")

# Assay lookup table, keyed by "id".
assays = pd.read_csv(
    "../resources/test/assaytable.txt",
    sep="\t",
    names=["id", "assay_label"],
    index_col="id",
)


## Mapping tissues to friendlier headers
For later, when we need the tissue details.

In [None]:
 # Friendlier, "tissue_"-prefixed column names for the tissues lookup table.
fields = [
    "tissue_id",
    "tissue_name",
    "tissue_source",
    "tissue_external_id",
    "tissue_organ_and_tissue",
    "tissue_cell_type",
    "tissue_disease",
    "tissue_sample",
    "tissue_brenda_tissue_ontology",
    "tissue_brenda_term",
    "tissue_url",
]

# Keep this cell safe to re-run. If an earlier run already moved "tissue_id"
# into the index, put it back as a column first so the 11 names in `fields`
# line up with the columns again. Renaming in place after an in-place
# set_index fails on the second run with a length-mismatch ValueError.
tissues = tissues.reset_index() if tissues.index.name == "tissue_id" else tissues.copy()
tissues.columns = fields

# Index by tissue id so the table can be used as the right side of a join.
tissues = tissues.set_index("tissue_id")

## Some Stats for our data 

In [None]:
# Size of the enhancer-gene link table as a (rows, columns) tuple.
enh_gene_links.shape


## Left join the tables
- enhancer detail
- enhancer source
- tissue id
- assay id

In [None]:
# Start from the enhancer-gene links and left-join each lookup table in turn,
# so every link row is kept even when a lookup has no matching entry.
agg_table = (
    enh_gene_links
    .merge(enhs, how="left", left_on="enhancer", right_on="enhancer")        # coordinates
    .merge(tissues, how="left", left_on="tissue", right_on="tissue_id")      # tissue details
    .merge(assays, how="left", left_on="assay", right_on="id")               # assay label
    .merge(enh_source, how="left", left_on="enhancer", right_on="enhancer")  # enhancer source
)
agg_table

## Filter unlinked enhancers

Get the enhancers that are not in the enh_gene_links table. There will be many because the sample file is tiny.

In [None]:
# Mark which enhancers (by index label) already appear in the joined link table.
is_linked = enhs.index.isin(agg_table["enhancer"])

# Keep only the enhancers without a link and attach their source.
unlinked = enhs.loc[~is_linked].merge(
    enh_source,
    how="left",
    left_on="enhancer",
    right_on="enhancer",
)
unlinked

## Concatenate the two tables (many NaNs)

Ideally you could do an outer join instead; if you are not interested in unlinked enhancers, then this step can be skipped.

In [None]:
# `agg_table` holds the enhancer id in an "enhancer" column, but `unlinked`
# may carry it as its index, because both frames it was merged from are
# indexed by "enhancer". Move the id back into a column before stacking.
# Otherwise the unlinked rows get NaN in "enhancer" and their ids are hidden
# in the row labels.
needs_reset = "enhancer" in unlinked.index.names and "enhancer" not in unlinked.columns
unlinked_rows = unlinked.reset_index() if needs_reset else unlinked

# ignore_index gives the combined table a single clean 0..n-1 row index
# instead of a mix of the two inputs' labels.
all_enhs = pd.concat([agg_table, unlinked_rows], ignore_index=True)
all_enhs

## Save the files

In [None]:
# Write the joined link table (not the concatenated `all_enhs`) as an indented
# JSON array with one object per row.
out_path = "../../annoq-data/enhancer/test-out.json"
agg_table.to_json(out_path, orient="records", indent=2)

## Load the sample AnnoQ annotations before (WGSA) and after (functions)

In [2]:
# Read both chr21 annotation tables as tab-separated text: first the
# slim-hrc file, then the slim-hrc-res file.
wgsa_anno, all_anno = (
    pd.read_csv(vcf_path, sep="\t")
    for vcf_path in (
        "../../annoq-data/slim-hrc/chr21.vcf",
        "../../annoq-data/slim-hrc-res/chr21.vcf",
    )
)
all_anno

Unnamed: 0,chr,pos,ref,alt,ANNOVAR_ensembl_Effect,ANNOVAR_ensembl_Transcript_ID,ANNOVAR_ensembl_Gene_ID,ANNOVAR_ensembl_Closest_gene(intergenic_only),ANNOVAR_ensembl_HGVSc,ANNOVAR_ensembl_HGVSp,...,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list_id,enhancer_linked_PANTHER_GO_SLIM_biological_process_list,enhancer_linked_PANTHER_GO_SLIM_biological_process_list_id,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list_id,enhancer_linked_REACTOME_pathway_list,enhancer_linked_REACTOME_pathway_list_id,enhancer_linked_PANTHER_pathway_list,enhancer_linked_PANTHER_pathway_list_id
0,21,15442715,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.
1,21,9527214,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
2,21,16128295,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,ATP binding;ATPase activity;heat shock protein...,GO:0016887;GO:0051082;GO:0005524;GO:0031072,chaperone-mediated protein folding;'de novo' p...,GO:0006458;GO:0061077;GO:0034620,cytoplasm,GO:0005737,Cellular response to heat stress;Cellular resp...,R-HSA-3371453;R-HSA-3371556;R-HSA-2262752;R-HS...,Parkinson disease,P00049
3,21,9527233,T,C,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
4,21,9527242,A,G,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
5,21,15442714,T,G,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.


## How many annotations were added

In [3]:
# Number of columns in all_anno beyond those in wgsa_anno.
all_anno.shape[1] - wgsa_anno.shape[1]

181

In [4]:
# All rows, columns from position 448 onward.
# NOTE(review): 448 is a hard-coded offset, presumably where the added
# annotation columns begin. Confirm against the input files.
all_anno.iloc[:, 448:]

Unnamed: 0,flanking_0_GO_molecular_function_complete_list,flanking_0_GO_molecular_function_complete_list_id,flanking_0_GO_biological_process_complete_list,flanking_0_GO_biological_process_complete_list_id,flanking_0_GO_cellular_component_complete_list,flanking_0_GO_cellular_component_complete_list_id,flanking_0_PANTHER_GO_SLIM_molecular_function_list,flanking_0_PANTHER_GO_SLIM_molecular_function_list_id,flanking_0_PANTHER_GO_SLIM_biological_process_list,flanking_0_PANTHER_GO_SLIM_biological_process_list_id,...,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list_id,enhancer_linked_PANTHER_GO_SLIM_biological_process_list,enhancer_linked_PANTHER_GO_SLIM_biological_process_list_id,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list_id,enhancer_linked_REACTOME_pathway_list,enhancer_linked_REACTOME_pathway_list_id,enhancer_linked_PANTHER_pathway_list,enhancer_linked_PANTHER_pathway_list_id
0,.,.,.,.,.,.,.,.,.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.
1,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
2,.,.,.,.,.,.,.,.,.,.,...,ATP binding;ATPase activity;heat shock protein...,GO:0016887;GO:0051082;GO:0005524;GO:0031072,chaperone-mediated protein folding;'de novo' p...,GO:0006458;GO:0061077;GO:0034620,cytoplasm,GO:0005737,Cellular response to heat stress;Cellular resp...,R-HSA-3371453;R-HSA-3371556;R-HSA-2262752;R-HS...,Parkinson disease,P00049
3,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
4,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
5,.,.,.,.,.,.,.,.,.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.


In [8]:
# Show a single added annotation column for every row.
all_anno.loc[:, "enhancer_linked_PANTHER_GO_SLIM_biological_process_list"]

0    acylglycerol catabolic process;triglyceride me...
1                                                    .
2    chaperone-mediated protein folding;'de novo' p...
3                                                    .
4                                                    .
5    acylglycerol catabolic process;triglyceride me...
Name: enhancer_linked_PANTHER_GO_SLIM_biological_process_list, dtype: object