In [1]:
import pandas as pd
import numpy as np
from subprocess import run
import requests
from gzip import decompress
from pathlib import Path

In [57]:
# Per-gene RPKM expression table (tab-separated). `index_col=False` keeps the
# first column as a regular column instead of promoting it to the index.
#
# The earlier `gene_address = pd.read_csv("../data/ensg.csv", ...)` load was
# removed: `gene_address` is rebuilt from the Ensembl gene_info file in the next
# cell before anything reads it, so that read was dead work and made a fresh
# run depend on a file that is never used.
gene_expresson = pd.read_csv(
    "../data/57epigenomes.RPKM.pc",
    sep="\t",
    index_col=False,
)

In [59]:
# Read the headerless, tab-separated gene annotation table and keep only its
# first five columns (integer labels 0..4 because header=None).
gene_address = pd.read_csv(
    "../data/Ensembl_v65.Gencode_v10.ENSG.gene_info",
    sep="\t",
    header=None,
    index_col=False,
)[list(range(5))]

# Give the retained columns descriptive names.
gene_address.columns = ["gene", "chrom", "start", "end", "strand"]

In [61]:
def _chr_prefixed(name):
    """Return `name` as text with a single leading "chr".

    Values are converted with `str()` first so chromosome names that were
    parsed as integers do not raise on concatenation, and names that already
    start with "chr" are returned unchanged so re-running this cell does not
    produce "chrchr...".
    """
    # NOTE(review): a missing value would become the text "chrnan"; this
    # assumes the chrom column has no missing entries - confirm.
    name = str(name)
    return name if name.startswith("chr") else "chr" + name


gene_address["chrom"] = gene_address["chrom"].map(_chr_prefixed)

In [64]:
# Gene ids present in both the annotation table and the expression table.
# np.intersect1d already returns the common values sorted and de-duplicated,
# so no additional in-place sort is needed.
available_genes = np.intersect1d(gene_address.gene, gene_expresson.gene_id)

# Generate interest bed file

In [65]:
# Keep only the rows for genes in `available_genes`, in that order, then turn
# the gene id back into an ordinary column.
address_by_gene = gene_address.set_index("gene")
gene_address = address_by_gene.loc[available_genes].reset_index()

In [66]:
# Order rows by chromosome, then gene id, then start coordinate.
sort_keys = ["chrom", "gene", "start"]
gene_address = gene_address.sort_values(sort_keys)

In [67]:
# Keep the first row of every (chrom, gene) pair and take an independent copy
# so the column assignments that follow write to this frame directly.
gene_address = gene_address.drop_duplicates(["chrom", "gene"], keep="first").copy()

In [69]:
# Window around each gene's `start`: 5000 before (clamped to the range
# [0, 10000000000]) up to 5000 after.
# NOTE(review): `start` is used regardless of `strand` - confirm that this is
# the intended anchor for every gene.
gene_address = gene_address.assign(
    calStart=lambda frame: (frame["start"] - 5000).clip(lower=0, upper=10000000000),
    calEnd=lambda frame: frame["start"] + 5000,
)

In [70]:
# Split every full-length (exactly 10000 wide) window into consecutive
# 100-wide bins, emitted as (chrom, bin_start, bin_end, gene). Windows of any
# other width - e.g. those whose start was clamped at 0 - are skipped.
arr = [
    (chrom, bin_start, bin_start + 100, gene)
    for gene, chrom, window_start, window_end in zip(
        gene_address["gene"],
        gene_address["chrom"],
        gene_address["calStart"],
        gene_address["calEnd"],
    )
    if window_end - window_start == 10000
    for bin_start in range(window_start, window_end, 100)
]
    

In [71]:
# Write the bins as a tab-separated file with neither a header row nor an
# index column.
interest_bins = pd.DataFrame(arr)
interest_bins.to_csv("../data/interest.v2.bed", sep="\t", index=False, header=False)

# Get ChIP-seq data

In [5]:
# Histone marks to fetch for every epigenome.
histones = [
    "H3K27me3",
    "H3K36me3",
    "H3K4me1",
    "H3K4me3",
    "H3K9me3",
]

# Every column of the expression table from the third one onward is used as an
# epigenome identifier.
epigenomes = gene_expresson.columns[2:]

In [6]:
def _fetch_gunzipped(url, target, label):
    """Download `url`, gunzip the payload and store it at `target`.

    Nothing is done when `target` already exists. A failed request (connection
    error, timeout or non-200 status) is reported with a printed message and
    skipped so that the remaining downloads still run.

    Parameters
    ----------
    url : str
        Address of the gzip-compressed file.
    target : str or Path
        Destination of the decompressed bytes.
    label : str
        Remote file name used in the failure message.
    """
    target = Path(target)
    if target.is_file():
        return

    try:
        # (connect, read) timeouts so a stalled server cannot hang the loop.
        resp = requests.get(url, timeout=(10, 300))
    except requests.RequestException as exc:
        print(f"Get {label} failed: {exc}")
        return

    if resp.status_code != 200:
        print(f"Get {label} failed")
        return

    # Decompress before touching the filesystem, then write to a side file and
    # rename it into place: a failure part-way through can no longer leave an
    # empty or truncated `target` that the is_file() check above would treat
    # as a finished download on the next run.
    payload = decompress(resp.content)
    partial = target.with_name(target.name + ".part")
    partial.write_bytes(payload)
    partial.replace(target)


# Fetch the alignment file and its index for every epigenome/histone pair.
for epigenome in epigenomes:
    for histone in histones:
        remote_name = f"{epigenome}-{histone}.tagAlign.gz"
        dataaddr = f"http://egg2.wustl.edu/roadmap/data/byFileType/alignments/consolidated/{remote_name}"
        indexaddr = f"{dataaddr}.tbi"

        datatarget = f"../data/{epigenome}-{histone}.bed"
        indextarget = f"../data/{epigenome}-{histone}.bed.tbi"

        _fetch_gunzipped(dataaddr, datatarget, remote_name)
        # NOTE(review): the .tbi is gunzipped and stored next to the
        # decompressed .bed exactly as before; an index built for the
        # compressed file is presumably not usable with the decompressed
        # copy - confirm whether this file is needed at all.
        _fetch_gunzipped(indexaddr, indextarget, f"{remote_name}.tbi")