In [2]:
import pandas as pd
from subprocess import run, PIPE
from pathlib import Path
from joblib import Parallel, delayed
import numpy as np
from pickle import dump

In [3]:
# Tab-separated expression table; index_col=False keeps the first column as
# ordinary data instead of promoting it to the index.
gene_expresson = pd.read_csv(
    "../data/57epigenomes.RPKM.pc",
    sep="\t",
    index_col=False,
)

# Every column from the third one onward is used as an epigenome identifier.
epigenomes = gene_expresson.columns[2:]

# Histone marks combined with each epigenome to form "<epigenome>-<histone>"
# file base names in the cells below.
histones = [
    "H3K27me3",
    "H3K36me3",
    "H3K4me1",
    "H3K4me3",
    "H3K9me3",
]

In [4]:
# Worker count passed as `n_jobs` to the joblib `Parallel(...)` calls in this notebook.
NTHREADS = 16

# Convert BED to BAM

In [5]:
def do(filename):
    """Convert ``../data/{filename}.bed`` to ``../data/{filename}.bam``.

    Runs ``bedtools bedtobam`` with ``../data/hg19chrom.sizes`` as the genome
    file and stores the command's stdout as the BAM file.

    Parameters
    ----------
    filename : str
        Base name (no directory, no extension) of the BED file to convert.

    Returns
    -------
    None
        On a non-zero exit status the captured stdout/stderr are printed and
        no output file is written.
    """
    # An explicit argv list keeps the base name a single argument even if it
    # ever contains whitespace.
    command = [
        "bedtools", "bedtobam",
        "-i", f"../data/{filename}.bed",
        "-g", "../data/hg19chrom.sizes",
    ]
    result = run(command, stdout=PIPE, stderr=PIPE)

    if result.returncode != 0:
        print("Command fail")
        print(result.stdout, result.stderr)
        # Stop here: writing the (empty) stdout would leave a bogus .bam that
        # the is_file() skip check in the driver loop treats as already done.
        return

    with open(f"../data/{filename}.bam", "wb") as f:
        f.write(result.stdout)

# Queue every "<epigenome>-<histone>" base name that has no .bam on disk yet.
filenames = [
    f"{epigenome}-{histone}"
    for epigenome in epigenomes
    for histone in histones
    if not Path(f"../data/{epigenome}-{histone}.bam").is_file()
]

# Fan the conversions out over NTHREADS joblib workers; return values are discarded.
_ = Parallel(n_jobs=NTHREADS)(delayed(do)(pending) for pending in filenames)


Command fail
Command fail
Command fail
Command fail
Command fail
Command fail
Command fail
b'' b'Error: The requested file (../data/E003-H3K4me1.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
Command fail
Command fail
Command fail
b'' b'Error: The requested file (../data/E004-H3K9me3.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
Command fail
Command fail
b'' b'Error: The requested file (../data/E003-H3K4me3.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
Command fail
Command fail
b'' b'Error: The requested file (../data/E004-H3K27me3.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
b'' b'Error: The requested file (../data/E004-H3K4me1.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
b'' b'Error: The requested file (../data/E005-H3K4me1.bed) could not be opened. Error message: (No such file or directory). Exiting!\n'
b

# Sort BAM files

In [25]:
%%time
def do(filename):
    """Sort ``../data/{filename}.bam`` with ``samtools sort``.

    The second positional argument ``../data/{filename}.sort`` is passed as
    the output prefix; the driver loop expects ``../data/{filename}.sort.bam``
    to exist afterwards.

    Parameters
    ----------
    filename : str
        Base name (no directory, no extension) of the BAM file to sort.

    Returns
    -------
    None
        On a non-zero exit status the captured stdout/stderr are printed.
    """
    # NOTE(review): this is the legacy `samtools sort <in.bam> <out.prefix>`
    # form; newer samtools releases appear to require `-o <out.bam>` instead.
    # Confirm against the installed samtools version.
    command = [
        "samtools", "sort",
        f"../data/{filename}.bam",
        f"../data/{filename}.sort",
    ]
    result = run(command, stdout=PIPE, stderr=PIPE)

    # Report failures the same way the bedtools helpers in this notebook do,
    # instead of silently discarding the exit status.
    if result.returncode != 0:
        print("Command fail")
        print(result.stdout, result.stderr)
# Collect the base names whose sorted BAM (<name>.sort.bam) is still missing.
filenames = [
    f"{epigenome}-{histone}"
    for epigenome in epigenomes
    for histone in histones
    if not Path(f"../data/{epigenome}-{histone}.sort.bam").is_file()
]
# Sort the pending files on NTHREADS joblib workers; return values are discarded.
_ = Parallel(n_jobs=NTHREADS)(delayed(do)(pending) for pending in filenames)

CPU times: user 36 ms, sys: 316 ms, total: 352 ms
Wall time: 516 ms


# Index sorted BAM files

In [24]:
def do(filename):
    """Index ``../data/{filename}.sort.bam`` with ``samtools index``.

    The driver loop expects ``../data/{filename}.sort.bam.bai`` to exist
    afterwards.

    Parameters
    ----------
    filename : str
        Base name (no directory, no extension) of the sorted BAM file.

    Returns
    -------
    None
        On a non-zero exit status the captured stdout/stderr are printed.
    """
    command = ["samtools", "index", f"../data/{filename}.sort.bam"]
    result = run(command, stdout=PIPE, stderr=PIPE)

    # Report failures the same way the bedtools helpers in this notebook do,
    # instead of silently discarding the exit status.
    if result.returncode != 0:
        print("Command fail")
        print(result.stdout, result.stderr)
# Collect the base names whose index file (<name>.sort.bam.bai) is still missing.
filenames = [
    f"{epigenome}-{histone}"
    for epigenome in epigenomes
    for histone in histones
    if not Path(f"../data/{epigenome}-{histone}.sort.bam.bai").is_file()
]
# Index the pending files on NTHREADS joblib workers; return values are discarded.
_ = Parallel(n_jobs=NTHREADS)(delayed(do)(pending) for pending in filenames)

# Generate X (read counts per epigenome)

In [None]:
def do(files, epigenome):
    """Run ``bedtools multicov`` for one epigenome and save its stdout.

    The BAM files in ``files`` are counted against ``../data/interest.v2.bed``
    and the command's stdout is written to ``../data/{epigenome}.csv``.

    Parameters
    ----------
    files : str
        Whitespace-separated BAM paths, passed after ``-bams``.
    epigenome : str
        Epigenome identifier; used only to name the output file.

    Returns
    -------
    None
        On a non-zero exit status the captured stdout/stderr are printed and
        no output file is written.
    """
    command = [
        "bedtools", "multicov",
        "-bams", *files.split(),
        "-bed", "../data/interest.v2.bed",
    ]
    result = run(command, stdout=PIPE, stderr=PIPE)

    if result.returncode != 0:
        print("Command fail")
        print(result.stdout, result.stderr)
        # Stop here: an empty .csv would be skipped as "done" by the driver
        # loop's is_file() check and then break the later read of that file.
        return

    with open(f"../data/{epigenome}.csv", "wb") as f:
        f.write(result.stdout)
        
# Build one (bam_paths, epigenome) job per epigenome that has no .csv yet.
# NOTE(review): the BAMs are read from ../data/bams/ while the sort/index
# cells above operate on ../data/ directly — confirm the files are moved
# there between steps.
filenames = []
for epigenome in epigenomes:
    if Path(f"../data/{epigenome}.csv").is_file():
        continue
    # One sorted BAM per histone mark, joined into a single space-separated string.
    files = " ".join(
        f"../data/bams/{epigenome}-{histone}.sort.bam" for histone in histones
    )
    filenames.append((files, epigenome))

# Run the jobs on NTHREADS joblib workers; return values are discarded.
_ = Parallel(n_jobs=NTHREADS)(delayed(do)(*job) for job in filenames)

# Generate XY (count tensors and binary expression labels)

In [16]:
# Each gene occupies this many consecutive rows of a per-epigenome count table.
BINS_PER_GENE = 100
# Columns of the count table that are stacked into the feature tensor.
# Presumably the five per-histone counts appended by `bedtools multicov`
# (one per BAM) — TODO confirm against the layout of interest.v2.bed.
COUNT_COLUMNS = [4, 5, 6, 7, 8]
# Column of the count table holding the gene identifier looked up in the
# expression table.
NAME_COLUMN = 3

# Index the expression table once instead of on every loop iteration.
expression_by_gene = gene_expresson.set_index("gene_id")

Xs = []
Ys = []
for epigenome in epigenomes:
    csv_path = Path(f"../data/{epigenome}.csv")
    if not csv_path.is_file():
        # Epigenomes without a count table are skipped.
        continue
    print(f"Processing {epigenome}.csv")
    df = pd.read_csv(csv_path, delimiter="\t", header=None, index_col=None)

    # Only whole groups of BINS_PER_GENE rows are used; trailing rows are dropped.
    n_genes = len(df) // BINS_PER_GENE
    used = df.iloc[: n_genes * BINS_PER_GENE]

    # The gene identifier is taken from the first row of each group.
    names = used[NAME_COLUMN].iloc[::BINS_PER_GENE].tolist()
    # Shape: (genes, bins per gene, count columns).
    X = used[COUNT_COLUMNS].values.reshape(
        n_genes, BINS_PER_GENE, len(COUNT_COLUMNS)
    )

    # Label each gene by whether its expression in *this* epigenome is above
    # that epigenome's median (previously the E003 column was used for every
    # epigenome, giving identical labels throughout).
    rawY = expression_by_gene.loc[names, epigenome]
    # Convert to a plain integer array: pandas Series do not support the
    # multi-dimensional indexing the old stacking step relied on.
    Y = np.asarray(rawY > rawY.median()).astype(np.int64)

    Xs.append(X)
    Ys.append(Y)

# Stack along a new leading epigenome axis.
Xs = np.stack(Xs)
Ys = np.stack(Ys)

Processing E003.csv
Processing E004.csv
Processing E005.csv
Processing E006.csv
Processing E007.csv
Processing E011.csv
Processing E012.csv
Processing E013.csv
Processing E016.csv
Processing E024.csv
Processing E027.csv
Processing E028.csv
Processing E037.csv
Processing E038.csv
Processing E047.csv
Processing E050.csv
Processing E053.csv
Processing E054.csv
Processing E055.csv
Processing E056.csv
Processing E057.csv
Processing E058.csv
Processing E059.csv
Processing E061.csv
Processing E062.csv
Processing E065.csv
Processing E066.csv
Processing E070.csv
Processing E071.csv
Processing E079.csv
Processing E082.csv
Processing E084.csv
Processing E085.csv
Processing E087.csv
Processing E094.csv
Processing E095.csv
Processing E096.csv
Processing E097.csv
Processing E098.csv
Processing E100.csv
Processing E104.csv
Processing E105.csv
Processing E106.csv
Processing E109.csv
Processing E112.csv
Processing E113.csv
Processing E114.csv
Processing E116.csv
Processing E117.csv
Processing E118.csv


In [24]:
# Persist the stacked arrays in two pickles: the first 28 entries along the
# leading axis go to input.pkl, the remainder to input2.pkl.
for target, part in (
    ("../data/input.pkl", slice(None, 28)),
    ("../data/input2.pkl", slice(28, None)),
):
    with open(target, "wb") as f:
        dump((Xs[part], Ys[part]), f)