# Adding Enhancers to AnnoQ

## Loading data as it is and formatting it accordingly 
Add and rename column headers, and set indexes for lookups and table joins.

In [1]:
# List the current working directory so the files sitting next to this notebook are visible.
%ls

 Volume in drive C is Windows
 Volume Serial Number is A051-E45B

 Directory of C:\work\p101\annoq\annoq-data-builder\wgsa_add

05/31/2023  12:07 PM    <DIR>          .
05/31/2023  12:24 PM    <DIR>          ..
06/16/2021  11:36 AM    <DIR>          .ipynb_checkpoints
12/25/2022  08:43 PM    <DIR>          __pycache__
06/09/2021  08:03 AM             3,002 add_annotations.py
05/31/2023  12:06 PM             3,485 add_enhancer_anno.py
06/08/2021  08:23 PM             5,156 add_panther_anno.py
04/22/2021  10:54 PM               806 base.py
04/15/2021  05:37 PM    <DIR>          config
06/10/2021  12:38 PM               500 create_sbatch.py
09/22/2020  01:30 PM               453 dbSNP.sbatch
10/11/2022  01:50 PM               618 gunzip.sbatch.sh
09/22/2020  01:30 PM               352 gz.sbatch
09/22/2020  01:30 PM               466 hrc.sbatch
04/26/2021  02:05 PM               234 hrc_add.sh
06/21/2021  06:06 PM               286 hrc_add_all.sh
06/08/2021  06:23 PM               245 hrc_

In [2]:
import pandas as pd
import json

# All inputs are tab-separated files under ../resources/test.

# Enhancer coordinates. Column names are supplied via `names`, and the
# enhancer id is used as the index for later lookups/joins.
enhs = pd.read_csv(
    "../resources/test/CREbedDBenhancers_10092018",
    sep='\t',
    names=["chr", "start", "end", "enhancer"],
    index_col="enhancer",
)

# Enhancer -> gene links. Column names come from the file itself; the
# enhancer id is explicitly read as a string.
enh_gene_links = pd.read_csv(
    "../resources/test/enh_gene_link_tissue_pval_snp_hg19-sample",
    sep='\t',
    dtype={'enhancer': str},
)

# Source of each enhancer, indexed by enhancer id.
enh_source = pd.read_csv(
    "../resources/test/PEREGRINEenhancersources",
    sep='\t',
    names=["enhancer", "source"],
    index_col="enhancer",
)

# Tissue table, loaded with whatever header the file provides.
tissues = pd.read_csv(
    "../resources/test/tissues.txt",
    sep='\t',
)

# Assay id -> label lookup, indexed by assay id.
assays = pd.read_csv(
    "../resources/test/assaytable.txt",
    sep='\t',
    names=["id", "assay_label"],
    index_col="id",
)


## Mapping tissues to more friendly headers  
These are for later use, when tissue details are needed.

In [3]:
 # Friendlier, "tissue_"-prefixed names for the columns of the tissues table,
 # in the same order as the columns appear in the loaded frame.
 fields = [
     "tissue_id",
     "tissue_name",
     "tissue_source",
     "tissue_external_id",
     "tissue_organ_and_tissue",
     "tissue_cell_type",
     "tissue_disease",
     "tissue_sample",
     "tissue_brenda_tissue_ontology",
     "tissue_brenda_term",
     "tissue_url",
 ]

 # Rename the columns and index the table by tissue id.
 # The guard keeps this cell safe to re-run: once "tissue_id" has been moved
 # into the index the frame has one column fewer than `fields`, so assigning
 # the names a second time would raise a length-mismatch ValueError.
 if tissues.index.name != "tissue_id":
     tissues.columns = fields
     # Rebind instead of using inplace=True so the step reads as a plain assignment.
     tissues = tissues.set_index("tissue_id")

## Some Stats for our data 

In [4]:
# (rows, columns) of the enhancer-gene link table loaded above.
enh_gene_links.shape


(9999, 7)

## Left join the table 
- enhancer detail
- enhancer source
- tissue id
- assay id

In [5]:
# Enrich every enhancer-gene link with its lookup data. Each step is a left
# join, so every row of enh_gene_links is kept even when a lookup has no match.
agg_table = (
    enh_gene_links
    # enhancer coordinates (chr / start / end)
    .merge(enhs, how="left", left_on="enhancer", right_on="enhancer")
    # tissue details, matched on the tissue id
    .merge(tissues, how="left", left_on="tissue", right_on="tissue_id")
    # assay label, matched on the assay id
    .merge(assays, how="left", left_on="assay", right_on="id")
    # enhancer source
    .merge(enh_source, how="left", left_on="enhancer", right_on="enhancer")
)
agg_table

Unnamed: 0,enhancer,gene,linkID,assay,tissue,p-value,eQTL_SNP_ID,chr,start,end,...,tissue_external_id,tissue_organ_and_tissue,tissue_cell_type,tissue_disease,tissue_sample,tissue_brenda_tissue_ontology,tissue_brenda_term,tissue_url,assay_label,source
0,1,HUMAN|HGNC=15846|UniProtKB=Q9NP74,1,3,64,,,chr1,100000188,100000393,...,CCL-185,Lung,epithelial,carcinoma,Cell line,BTO:0000018,A-549 cell,https://www.atcc.org/products/all/CCL-185.aspx,Topologically Associated Domain,FANTOM
1,1,HUMAN|HGNC=15846|UniProtKB=Q9NP74,1,3,65,,,chr1,100000188,100000393,...,HTB-47,Kidney,epithelial,carcinoma,Cell line,BTO:0006204,Caki-2 cell,https://www.atcc.org/products/all/HTB-47.aspx,Topologically Associated Domain,FANTOM
2,1,HUMAN|HGNC=15846|UniProtKB=Q9NP74,1,3,66,,,chr1,100000188,100000393,...,CRL-1441,Kidney,epithelial,rhabdoid tumor,Cell line,BTO:0002586,G-401 cell,https://www.atcc.org/Products/All/CRL-1441.aspx,Topologically Associated Domain,FANTOM
3,1,HUMAN|HGNC=15846|UniProtKB=Q9NP74,1,3,67,,,chr1,100000188,100000393,...,CRL-1740,Prostate,epithelial,carcinoma,Cell line,BTO:0002398,prostate epithelium cell line,https://www.atcc.org/products/all/CRL-1740.aspx,Topologically Associated Domain,FANTOM
4,1,HUMAN|HGNC=15846|UniProtKB=Q9NP74,1,3,68,,,chr1,100000188,100000393,...,HTB-177,Lung,epithelial,carcinoma,Cell line,BTO:0002207,NCI-H460 cell,https://www.atcc.org/Products/All/HTB-177.aspx,Topologically Associated Domain,FANTOM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,10730,HUMAN|HGNC=24836|UniProtKB=Q6ZVL6,530,3,74,,,chr11,33708893,33709097,...,HTB-10,Brain,epithelial,neuroepithelioma,Cell line,BTO:0002914,SK-N-MC cell,https://www.atcc.org/products/all/HTB-10.aspx,Topologically Associated Domain,FANTOM
9995,10730,HUMAN|HGNC=24836|UniProtKB=Q6ZVL6,530,3,75,,,chr11,33708893,33709097,...,HTB-133,Mammary gland,epithelial,carcinoma,Cell line,BTO:0001248,T-47D cell,https://www.atcc.org/products/all/HTB-133.aspx,Topologically Associated Domain,FANTOM
9996,10730,HUMAN|HGNC=24836|UniProtKB=Q6ZVL6,530,3,76,,,chr11,33708893,33709097,...,CRL-1611,Kidney,epithelial,renal cell adenocarcinoma,Cell line,BTO:0003846,ACHN cell,https://www.atcc.org/products/all/CRL-1611.aspx,Topologically Associated Domain,FANTOM
9997,10730,HUMAN|HGNC=24836|UniProtKB=Q6ZVL6,530,3,77,,,chr11,33708893,33709097,...,CCL-221,Colon,epithelial,colorectal adenocarcinoma,Cell line,BTO:0000391,DLD-1 cell,https://www.atcc.org/products/all/CCL-221.aspx,Topologically Associated Domain,FANTOM


## Filter unlinked enhancers

Get the enhancers that are not in the `enh_gene_links` table. There will be many of them because the sample file is tiny.

In [None]:
# Flag the enhancers whose id shows up in the aggregated link table ...
is_linked = enhs.index.isin(agg_table['enhancer'])

# ... keep only the remaining ones and attach their source.
unlinked = enhs.loc[~is_linked].merge(
    enh_source,
    how="left",
    left_on="enhancer",
    right_on="enhancer",
)
unlinked

## Concatenate the two tables (expect many NaNs)

Ideally you would do an outer join, but if you are not interested in unlinked enhancers then this step can be skipped.

In [None]:
# `agg_table` keeps the enhancer id in an "enhancer" column with a plain row
# index, while `unlinked` is derived from `enhs`, which was loaded with the
# enhancer id as its index. Concatenating them as-is would leave the
# "enhancer" column empty (NaN) for every unlinked row and mix enhancer ids
# with row numbers in the index, so move the id into a column first.
if "enhancer" in unlinked.index.names and "enhancer" not in unlinked.columns:
    unlinked_rows = unlinked.reset_index("enhancer")
else:
    unlinked_rows = unlinked

# NOTE(review): enh_gene_links reads "enhancer" as str; confirm the ids coming
# from `enhs` have the same dtype, otherwise the column ends up mixed-type.

# ignore_index gives the combined table one clean 0..n-1 row index.
# Columns that exist only in agg_table are NaN for the unlinked rows.
all_enhs = pd.concat([agg_table, unlinked_rows], ignore_index=True)
all_enhs

## Save the files

In [None]:
# Write the linked-enhancer table as indented JSON, one record (object) per row.
# NOTE(review): only agg_table is saved here; all_enhs (which also holds the
# unlinked enhancers) is not written anywhere in this notebook. Confirm that is intended.
agg_table.to_json("../../annoq-data/enhancer/test-out.json", orient="records", indent=2)

## Load the sample AnnoQ annotations before (WGSA) and after (functions)

In [2]:
# Read the same chr21 sample twice, in this order:
#   1. slim-hrc      -> wgsa_anno (the "before" table)
#   2. slim-hrc-res  -> all_anno  (the "after" table, with the added annotations)
wgsa_anno, all_anno = (
    pd.read_csv(vcf_path, sep='\t')
    for vcf_path in (
        "../../annoq-data/slim-hrc/chr21.vcf",
        "../../annoq-data/slim-hrc-res/chr21.vcf",
    )
)
all_anno

Unnamed: 0,chr,pos,ref,alt,ANNOVAR_ensembl_Effect,ANNOVAR_ensembl_Transcript_ID,ANNOVAR_ensembl_Gene_ID,ANNOVAR_ensembl_Closest_gene(intergenic_only),ANNOVAR_ensembl_HGVSc,ANNOVAR_ensembl_HGVSp,...,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list_id,enhancer_linked_PANTHER_GO_SLIM_biological_process_list,enhancer_linked_PANTHER_GO_SLIM_biological_process_list_id,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list_id,enhancer_linked_REACTOME_pathway_list,enhancer_linked_REACTOME_pathway_list_id,enhancer_linked_PANTHER_pathway_list,enhancer_linked_PANTHER_pathway_list_id
0,21,15442715,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.
1,21,9527214,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
2,21,16128295,T,A,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,ATP binding;ATPase activity;heat shock protein...,GO:0016887;GO:0051082;GO:0005524;GO:0031072,chaperone-mediated protein folding;'de novo' p...,GO:0006458;GO:0061077;GO:0034620,cytoplasm,GO:0005737,Cellular response to heat stress;Cellular resp...,R-HSA-3371453;R-HSA-3371556;R-HSA-2262752;R-HS...,Parkinson disease,P00049
3,21,9527233,T,C,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
4,21,9527242,A,G,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,.,.,.,.,.,.,.,.,.,.
5,21,15442714,T,G,intergenic,.,.,"NONE:NONE(dist=NONE),ENSG00000238411:ENST00000...",.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.


## How many annotations were added

In [3]:
# Number of columns in the "after" table minus the number in the "before" table.
all_anno.shape[1] - wgsa_anno.shape[1]

181

In [4]:
# All rows, columns from position 448 onwards.
# NOTE(review): 448 is a hard-coded positional offset; presumably it marks where
# the added annotation columns start in this file. TODO confirm.
all_anno.iloc[:, 448:]

Unnamed: 0,flanking_0_GO_molecular_function_complete_list,flanking_0_GO_molecular_function_complete_list_id,flanking_0_GO_biological_process_complete_list,flanking_0_GO_biological_process_complete_list_id,flanking_0_GO_cellular_component_complete_list,flanking_0_GO_cellular_component_complete_list_id,flanking_0_PANTHER_GO_SLIM_molecular_function_list,flanking_0_PANTHER_GO_SLIM_molecular_function_list_id,flanking_0_PANTHER_GO_SLIM_biological_process_list,flanking_0_PANTHER_GO_SLIM_biological_process_list_id,...,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list,enhancer_linked_PANTHER_GO_SLIM_molecular_function_list_id,enhancer_linked_PANTHER_GO_SLIM_biological_process_list,enhancer_linked_PANTHER_GO_SLIM_biological_process_list_id,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list,enhancer_linked_PANTHER_GO_SLIM_cellular_component_list_id,enhancer_linked_REACTOME_pathway_list,enhancer_linked_REACTOME_pathway_list_id,enhancer_linked_PANTHER_pathway_list,enhancer_linked_PANTHER_pathway_list_id
0,.,.,.,.,.,.,.,.,.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.
1,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
2,.,.,.,.,.,.,.,.,.,.,...,ATP binding;ATPase activity;heat shock protein...,GO:0016887;GO:0051082;GO:0005524;GO:0031072,chaperone-mediated protein folding;'de novo' p...,GO:0006458;GO:0061077;GO:0034620,cytoplasm,GO:0005737,Cellular response to heat stress;Cellular resp...,R-HSA-3371453;R-HSA-3371556;R-HSA-2262752;R-HS...,Parkinson disease,P00049
3,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
4,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
5,.,.,.,.,.,.,.,.,.,.,...,phospholipase activity;triglyceride lipase act...,GO:0004806;GO:0004620,acylglycerol catabolic process;triglyceride me...,GO:0046464;GO:0006641;GO:0006633,extracellular space,GO:0005615,Synthesis of PA;Glycerophospholipid biosynthes...,R-HSA-556833;R-HSA-1483257;R-HSA-1430728;R-HSA...,.,.


In [8]:
# Inspect one of the enhancer-linked annotation columns (rows from label 0 onwards).
go_slim_bp_col = 'enhancer_linked_PANTHER_GO_SLIM_biological_process_list'
all_anno.loc[0:, go_slim_bp_col]

0    acylglycerol catabolic process;triglyceride me...
1                                                    .
2    chaperone-mediated protein folding;'de novo' p...
3                                                    .
4                                                    .
5    acylglycerol catabolic process;triglyceride me...
Name: enhancer_linked_PANTHER_GO_SLIM_biological_process_list, dtype: object