# Get DNase-seq dataset file accessions from ENCODE

We're limiting our training to DNase-seq datasets, as there are only a handful of ATAC-seq, MNase-seq, or FAIRE-seq experiments on ENCODE that provide the same data types (signal + peak annotations).

In [171]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

#### Imports

In [218]:
import io
import json
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode, urljoin

## Search for experiments

In [169]:
# Query the ENCODE search endpoint for every released human experiment of the
# chosen assay and collect the experiment accessions in `experiments`.
assay_type = 'DNase-seq'
params = {
    'type': 'Experiment',
    'status': 'released',
    'assay_slims': 'DNA+accessibility',
    'replicates.library.biosample.donor.organism.scientific_name': 'Homo+sapiens',
    # Single source of truth for the assay: the same value is used in the
    # summary message below.
    'assay_title': assay_type,
    'limit': 'all',
    'format': 'json',
}
# The values above contain literal '+' characters (presumably standing for
# spaces in the query string). urlencode() escapes them as '%2B', so restore
# them before sending the request.
param_str = urlencode(params, doseq=True).replace('%2B', '+')
url = 'https://www.encodeproject.org/search/'
headers = {
    'Accept': 'application/json',
}

# `response` holds the decoded JSON document, not the HTTP response object.
response = requests.get(url, headers=headers, params=param_str).json()

experiments = [x['accession'] for x in response['@graph']]

# With 'limit': 'all' every hit should be part of '@graph'; fail loudly if the
# result list was truncated.
assert len(experiments) == response['total'], (
    'Number of returned experiments ({}) should equal the reported total ({})'
    .format(len(experiments), response['total'])
)

print('Found {} human {} experiments'.format(len(experiments), assay_type))

Found 727 human DNase-seq experiments


#### Selection

In [173]:
# Endpoint pieces for the ENCODE metadata download; they are joined into the
# final URL in the next step.
base_url = 'https://www.encodeproject.org/metadata'
file_type = 'metadata.tsv'

# Restrict the metadata to experiments and to the file types we need:
# broad peaks, narrow peaks, and signal tracks.
selection = {
    'type': 'Experiment',
    'files.file_type': [
        'bigBed+broadPeak',
        'bigBed+narrowPeak',
        'bigWig',
    ],
}

# Ask for a TSV response while sending a JSON request body.
headers = {'Accept': 'text/tsv', 'Content-Type': 'application/json'}

#### Construct the URL and parameters

In [174]:
# Encode the selection as a path segment. urlencode() escapes the literal '+'
# characters in the values as '%2B', so they are restored afterwards.
selection_path = urlencode(selection, doseq=True).replace('%2B', '+')

# <base_url>/<encoded selection>/<file name>
url = '{}/{}/{}'.format(base_url, selection_path, file_type)

# Request body: one '/experiments/<accession>/' path per experiment found by
# the search.
experiment_paths = ['/experiments/{}/'.format(e) for e in experiments]
data = {'elements': experiment_paths}

#### Execute and parse the request

In [175]:
# Download the metadata table; the experiment list travels as the JSON body.
metadata_response = requests.get(url, headers=headers, json=data)
urlData = metadata_response.content

# The payload is tab-separated text, so decode it and let pandas parse it.
metadata_text = urlData.decode('utf-8')
metaData = pd.read_csv(io.StringIO(metadata_text), sep='\t')

In [176]:
# Print every distinct output type together with the file format of the first
# row that has this output type.
outputTypes = metaData['Output type'].unique()

for output_type in outputTypes:
    has_output_type = metaData['Output type'] == output_type
    first_format = metaData.loc[has_output_type, 'File format'].iloc[0]
    print(output_type, first_format)

peaks bigBed narrowPeak
hotspots bigBed broadPeak
read-depth normalized signal bigWig
base overlap signal bigWig
signal bigWig
signal of unique reads bigWig
raw signal bigWig


In [177]:
# Boolean row masks over `metaData`, combined further down to select files.

# Which files are eligible at all: assembly, release status, and assay.
is_grch38 = metaData['Assembly'].eq('GRCh38')
is_released = metaData['File Status'].eq('released')
is_dnase_seq = metaData['Assay'].eq('DNase-seq')

# Which of the three required data types a file provides.
output_type_col = metaData['Output type']
is_rdn_signal = output_type_col.eq('read-depth normalized signal')
is_narrow_peaks = output_type_col.eq('peaks')
is_broad_peaks = output_type_col.eq('hotspots')

In [216]:
# For every experiment, collect the file accessions of the three data types we
# need (read-depth normalized signal, narrow peaks, broad peaks) per
# biological replicate. The result has the shape
#     datasets[experiment accession][replicate] = {data type: file accession}
# and only contains replicates for which all three data types are available.
datasets = {}
# If True, keep only the first complete replicate of each experiment.
use_only_one_bio_replicate = True

# Output-type masks, keyed by the name under which the accession is stored.
required_files = (
    ('rdn_signal', is_rdn_signal),
    ('narrow_peaks', is_narrow_peaks),
    ('broad_peaks', is_broad_peaks),
)
is_candidate = is_grch38 & is_released & is_dnase_seq

k = 0  # number of replicates skipped because a data type is missing
for exp in metaData.loc[is_candidate, 'Experiment accession'].unique():
    is_exp = is_candidate & (metaData['Experiment accession'] == exp)

    for sample in metaData.loc[is_exp, 'Biological replicate(s)'].unique():
        if use_only_one_bio_replicate and exp in datasets:
            # A complete replicate has already been stored for this experiment.
            break

        is_sample = is_exp & (metaData['Biological replicate(s)'] == sample)

        # Take the first matching file of every required data type.
        files = {}
        for key, is_output_type in required_files:
            accessions = metaData.loc[is_sample & is_output_type, 'File accession']
            if accessions.empty:
                break
            files[key] = accessions.iloc[0]

        if len(files) < len(required_files):
            # Incomplete replicate: skip it, but leave any complete replicate
            # that was already stored for this experiment untouched.
            k += 1
            continue

        datasets.setdefault(exp, {})[sample] = files

dnum = [ds for exp in datasets for ds in datasets[exp]]
print('Found {} experiments comprising {} datasets'.format(len(datasets.keys()), len(dnum)))

Found 706 experiments comprising 706 datasets


#### Save JSON

In [219]:
# Persist the selected file accessions. `datasets` is the result of the
# selection above; `data` is only the request payload sent to ENCODE.
with open('datasets-dnase-grch38.json', 'w') as f:
    json.dump(
        {
            # JSON object keys are strings; converting explicitly also covers
            # numpy integer replicate ids, which json.dump() rejects as keys.
            exp: {str(rep): files for rep, files in reps.items()}
            for exp, reps in datasets.items()
        },
        f,
    )