# Get histone mod. ChIP-seq datasets from Roadmap Epigenomics

We limit training to histone modification ChIP-seq datasets that target the following modifications:

- H3K4me1
- H3K4me3
- H3K27ac
- H3K9ac
- H3K27me3
- H3K9me3
- H3K36me3

The list of all datasets is available at https://egg2.wustl.edu/roadmap/data/byFileType/signal/consolidated/macs2signal/foldChange/

In [14]:
# Enable autoreload so that edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Imports

In [2]:
import io
import json
import numpy as np
import os
import pandas as pd
import requests
import sys
from urllib.parse import urlencode, urljoin

# Append the parent directory and its `experiments` sub-directory to the module
# search path (only when not already present) so modules located there can be
# imported from this notebook.
for relative_dir in ('..', '../experiments'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

In [3]:
# Notebook configuration: base directory, name of the dataset listing file,
# and the histone modifications to select.
base = '..'
datasets_file = 'roadmap-epigenomics.tsv'
mods = [
    'H3K4me1',
    'H3K4me3',
    'H3K27ac',
    'H3K27me3',
    'H3K9ac',
    'H3K9me3',
    'H3K36me3',
]

In [63]:
# Load the tab-separated dataset listing. The file is read without a header
# row, so the three columns are named explicitly.
datasets_path = os.path.join(base, 'data', datasets_file)
roadmap_datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=('id', 'target', 'size'),
)

In [46]:
from functools import reduce

# One boolean mask per histone modification (same order as `mods`), matching
# the modification name case-insensitively. The table is loaded with the
# columns 'id', 'target' and 'size', so the match runs on 'target'; indexing a
# 'filename' column raises a KeyError on a fresh kernel.
# na=False keeps the masks strictly boolean even if a target is missing.
target_lower = roadmap_datasets['target'].str.lower()
filters = [target_lower.str.contains(mod.lower(), na=False) for mod in mods]

# Rows that match at least one of the selected modifications.
all_filters = reduce(lambda a, b: a | b, filters)

In [112]:
# Keep only the rows flagged by the combined boolean mask `all_filters`.
selected_datasets = roadmap_datasets.loc[all_filters]

In [136]:
# Select experiments that feature all targets: count the selected rows per
# experiment id and keep the ids whose count equals the number of requested
# modifications. The expected count is derived from `mods` instead of a
# hard-coded 7 so the selection stays correct when the list changes.
targets_per_id = selected_datasets.groupby('id')['target'].count()
complete_ids = targets_per_id.index[(targets_per_id == len(mods)).values]

datasets_with_all_targets = selected_datasets[selected_datasets['id'].isin(complete_ids)]
e_ids = datasets_with_all_targets['id'].unique().tolist()

#### Estimate size

In [61]:
# Estimate the total size of a random subsample of the datasets.
N = len(roadmap_datasets)
# Number of datasets to draw for each entry of `filters` (same order).
subsample = [72, 72, 72, 54, 54, 54, 54]

# Seeded generator so that re-running the cell reproduces the same estimate.
rng = np.random.RandomState(0)

# Draw row positions without replacement for each filter. `choice` raises a
# ValueError if a filter matches fewer rows than requested.
selected = np.concatenate([
    rng.choice(np.flatnonzero(np.asarray(f)), size, replace=False)
    for f, size in zip(filters, subsample)
])

# Summed 'size' of the sampled rows, divided by 1000.
# NOTE(review): the unit of the 'size' column is not stated in this notebook;
# presumably this converts to the next larger unit -- confirm.
roadmap_datasets.iloc[selected]['size'].sum() / 1000

205.707

#### Save JSON

In [126]:
# Write the selected ids (`e_ids`) to a JSON file in the parent directory.
ids_path = '../datasets-chip-histone-mod-hg19.json'
with open(ids_path, 'w') as out_file:
    json.dump(e_ids, out_file)

## Download data

In [18]:
from download import download_roadmap_epigenomics


def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Load the previously saved ids and the settings from the parent directory.
e_ids = _read_json('../datasets-chip-histone-mod-hg19.json')
settings = _read_json('../settings-hg19-chip-histone-mod-12kb.json')

download_roadmap_epigenomics(e_ids, settings, base='..', check=True)

HBox(children=(IntProgress(value=0, description='Dataset', max=49, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Targets', max=7, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Targets', max=7, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Targets', max=7, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Targets', max=7, style=ProgressStyle(description_width='initi…

KeyboardInterrupt: 