In [1]:
# Load IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before each cell is executed.
%autoreload 2

In [1]:
import sys

# Make the sibling source directory importable from this notebook.
# The entry is a relative path, so it is resolved against the kernel's
# current working directory when imports are performed.
SRC_PATH = '../src'

# Only register the path once: this cell may be executed repeatedly in the
# same kernel, and an unconditional append would keep growing sys.path with
# duplicate entries.
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

In [4]:
import os
from enhancer3d import Enhancer3dProject, ChromatinRegion, run_distance_calculation_for_region, load_enhancer_atlas_dataset_from_filesystem, load_gencode_annotation_dataset_from_filesystem
from chromatin_model import load_chromatin_model_ensemble_from_filesystem

# Resolve the input directories once up front; every loader below is handed an absolute path.
data_dir = os.path.abspath('../data')
packed_models_dir = os.path.abspath('../data/packed')

# --- Reference ensemble: its genomic region descriptor and the packed model ensemble ---
reference_ensemble_region = ChromatinRegion.from_string('chr1:74280-3521135')
reference_ensemble = load_chromatin_model_ensemble_from_filesystem(
    data_path=packed_models_dir,
    model_name='results_GM12878_Deni_chr1_74280_3521135',
)

# --- Modification ensemble: same region string as the reference, different model name ---
modification_ensemble_region = ChromatinRegion.from_string('chr1:74280-3521135')
modification_ensemble = load_chromatin_model_ensemble_from_filesystem(
    data_path=packed_models_dir,
    model_name='results_GM12878_Deni_mod_chr1_74280_3521135',
)

# --- Project descriptor tying both regions to a species / cell line ---
project = Enhancer3dProject(
    name='test-project',
    species='hg38',
    cell_line='GM12878',
    reference_ensemble_region=reference_ensemble_region,
    modification_ensemble_region=modification_ensemble_region,
)

# --- Annotation inputs: enhancer table and GENCODE annotation, both read from the data directory ---
enhancer_atlas_dataset = load_enhancer_atlas_dataset_from_filesystem(
    data_path=data_dir,
    dataset_name='enchancers.GM12878.bed',
)
gencode_annotation_dataset = load_gencode_annotation_dataset_from_filesystem(
    data_path=data_dir,
    dataset_name='gencode.v40.basic.annotation',
)

In [16]:
# Preview the enhancer table loaded earlier; as the cell's last expression it is rendered by the notebook.
enhancer_atlas_dataset

Unnamed: 0,chromosome,start,end,score
0,chr1,773300,774100,7.866088
1,chr1,778980,779450,6.472419
2,chr1,800100,802000,11.010675
3,chr1,825670,826410,6.114487
4,chr1,839470,842590,8.848865
...,...,...,...,...
49667,chrX,154444850,154445660,9.207927
49668,chrX,154457730,154458660,5.836427
49669,chrX,154459120,154459990,8.231231
49670,chrX,154841610,154842450,7.930245


In [18]:
# Show only the annotation rows whose 'Feature' column equals 'gene'.
is_gene_row = gencode_annotation_dataset['Feature'] == 'gene'
gencode_annotation_dataset[is_gene_row]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11868,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,,,,,,,,,,
12,chr1,HAVANA,gene,14403,29570,.,-,.,ENSG00000227232.5,unprocessed_pseudogene,...,,,,,,,,,,
25,chr1,ENSEMBL,gene,17368,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,,,,,,
28,chr1,HAVANA,gene,29553,31109,.,+,.,ENSG00000243485.5,lncRNA,...,,,,ncRNA_host,,,,,,
36,chr1,ENSEMBL,gene,30365,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1926897,chrM,ENSEMBL,gene,14148,14673,.,-,.,ENSG00000198695.2,protein_coding,...,,,,,,,,,,
1926902,chrM,ENSEMBL,gene,14673,14742,.,-,.,ENSG00000210194.1,Mt_tRNA,...,,,,,,,,,,
1926905,chrM,ENSEMBL,gene,14746,15887,.,+,.,ENSG00000198727.2,protein_coding,...,,,,,,,,,,
1926910,chrM,ENSEMBL,gene,15887,15953,.,+,.,ENSG00000210195.2,Mt_tRNA,...,,,,,,,,,,


In [7]:
# HACK: DATA_PATH has to be set in the environment for the stub transformers to work.
os.environ['DATA_PATH'] = os.path.abspath('../data')

# Gather everything the distance calculation consumes, then hand it over by keyword.
distance_inputs = dict(
    project=project,
    enhancer_atlas_dataset=enhancer_atlas_dataset,
    gencode_annotation_dataset=gencode_annotation_dataset,
    reference_ensemble=reference_ensemble,
    modification_ensemble=modification_ensemble,
)
distances_dataset = run_distance_calculation_for_region(**distance_inputs)

INFO:enhancer3d.flows:Starting Enhancer3D project: name='test-project' species='hg38' cell_line='GM12878' reference_ensemble_region=chr1:74280-3521135 modification_ensemble_region=chr1:74280-3521135
INFO:enhancer3d.flows:Extracting genes and enhancers for the reference and modification ensembles
INFO:enhancer3d.flows:Extracting full genes and enhancers for the reference and modification ensembles
INFO:enhancer3d.flows:Joining enhancers and genes together
INFO:enhancer3d.flows:Calculating distances for potential enhancer-gene pairs


In [8]:
# Inspect the table returned by the distance calculation; as the cell's last expression it is rendered by the notebook.
distances_dataset

Unnamed: 0,region_chr_ref,region_start_ref,region_end_ref,region_chr_mod,region_start_mod,region_end_mod,gene_chr_ref,gene_start_ref,gene_end_ref,gene_strand,...,enh_center_pos,enh_loci,enh_tSS_distance,avg_dist_ref,avg_dist_mut,avg_dist_sub,mwh_pvalue,mwh_statistic,number_bins_ref,number_bins_mod
0,chr1,74280,3521135,chr1,74280,3521135,chr1,873291,874349,+,...,890660,chr1:890290-891030,17369,5.592618,6.174317,0.581699,3.736146e-05,3312.0,100,100
1,chr1,74280,3521135,chr1,74280,3521135,chr1,873291,874349,+,...,905650,chr1:904090-907210,32359,3.118164,3.003339,0.114826,6.734013e-01,5173.0,100,100
2,chr1,74280,3521135,chr1,74280,3521135,chr1,873291,874349,+,...,966660,chr1:966620-966700,93369,7.369675,8.201164,0.831490,1.101111e-02,3959.0,100,100
3,chr1,74280,3521135,chr1,74280,3521135,chr1,873291,874349,+,...,967245,chr1:966800-967690,93954,7.369675,8.201164,0.831490,1.101111e-02,3959.0,100,100
4,chr1,74280,3521135,chr1,74280,3521135,chr1,873291,874349,+,...,992640,chr1:991620-993660,119349,6.004310,10.485207,4.480896,1.137889e-23,895.0,100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9069,chr1,74280,3521135,chr1,74280,3521135,chr1,3487245,3487627,+,...,2550861,chr1:2550731-2550991,936384,10.159252,14.153038,3.993786,2.447708e-07,2887.0,100,100
9070,chr1,74280,3521135,chr1,74280,3521135,chr1,3487245,3487627,+,...,2556851,chr1:2556731-2556971,930394,9.815758,10.082376,0.266618,5.133653e-01,4732.0,100,100
9071,chr1,74280,3521135,chr1,74280,3521135,chr1,3487245,3487627,+,...,2567801,chr1:2566601-2569001,919444,10.976965,12.050615,1.073650,7.231776e-02,4264.0,100,100
9072,chr1,74280,3521135,chr1,74280,3521135,chr1,3487245,3487627,+,...,2577351,chr1:2575351-2579351,909894,10.559278,11.237013,0.677735,3.627369e-01,4627.0,100,100
