In [1]:
### make the parent directory importable so that config_sing can be found
import sys
sys.path.append('../')
### NOTE(review): the star import presumably provides the names used below without
### an explicit import (np, plt, os, gzip, FD_RES) -- confirm against config_sing
from config_sing import *
### print the environment summary (project paths and imported libraries)
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [2]:
import sqlite3
import itertools as it
from collections import defaultdict
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import make_axes_locatable
### "seaborn-whitegrid" was renamed to "seaborn-v0_8-whitegrid" in matplotlib 3.6
### and the old name was removed later; use whichever name this matplotlib provides
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid")

import math
import random
import json
from sklearn.manifold import TSNE
from functools import partial, reduce
print = partial(print, flush=True)

### Set Samples
### np.char.add is the public alias of np.core.defchararray.add; going through
### np.core is deprecated in NumPy 2, so the public name is used here.
### It concatenates strings element-wise, so reduce() glues prefix + index + suffix.
fun = np.char.add
idx = np.arange(1,6).astype("str")

### sample names, e.g. "Input1", "Input1_20x", "TFX2_DMSO", "TFX2_Dex"
### (the TFX samples start from index 2)
INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])
SAMPLES_OUT = np.concatenate([TFX_DMSO, TFX_DEX])
GROUPS   = ["Input", "Input_20x", "TFX_DMSO", "TFX_Dex"]

### database file (opened with sqlite3 in get_depth) under the results directory
fdiry, fname = os.path.join(FD_RES, 'database'), "fragment_chr17.db"
FPATH_DB = os.path.join(fdiry, fname)

## Test streaming fragments

In [3]:
def prep_line(line):
    """Turn one raw (bytes) line of the fragment/region table into two labels.

    The line is decoded as ASCII, stripped and split on tabs; the fields are
    matched by position to the column names below.

    Returns a tuple (fragment, region), each formatted as "chrom_start_end".
    Raises KeyError when the line has too few fields for a needed column.
    """
    header = (
        "Frag_Chrom",
        "Frag_Start",
        "Frag_End",
        "Frag_Count",
        "Region_Chrom",
        "Region_Start",
        "Region_End",
        "Region_Name",
        "Region_Score",
        "Region_Strand",
        "Overlap")
    fields = line.decode('ASCII').strip().split('\t')
    record = dict(zip(header, fields))

    ### only the coordinate columns are used to build the labels
    frag_keys   = ("Frag_Chrom",   "Frag_Start",   "Frag_End")
    region_keys = ("Region_Chrom", "Region_Start", "Region_End")
    fragment = "_".join(record[key] for key in frag_keys)
    region   = "_".join(record[key] for key in region_keys)
    return fragment, region

### helper function to get a chunk of file
def get_chunks(gen, rows=10):
    """Yield consecutive lists of up to `rows` items taken from `gen`.

    The last list may be shorter; nothing is yielded for an empty input.
    """
    source = iter(gen)

    def take():
        return list(it.islice(source, rows))

    ### iter(callable, sentinel) keeps calling take() until it returns an empty list
    yield from iter(take, [])

In [4]:
fdiry = os.path.join(FD_RES, "count_fragment", "TFX2_DMSO")
fname = "region_dex_GR_P300_dnase_chr17.bed.gz"
fpath = os.path.join(fdiry, fname)
print(fpath)

### read only the first chunk: echo each raw line and keep its fragment label
frgs = []
with gzip.open(fpath, 'rb') as file:
    for line in next(get_chunks(file)):
        print(line)
        frgs.append(prep_line(line)[0])

frgs

/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
b'chr17\t6001570\t6002624\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001571\t6002624\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001762\t6002691\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t130\n'
b'chr17\t6001762\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001763\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001765\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001969\t6002895\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t334\n'
b'chr17\t6001969\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6001970\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6002054\t6002943\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t382\

['chr17_6001570_6002624',
 'chr17_6001571_6002624',
 'chr17_6001762_6002691',
 'chr17_6001762_6002692',
 'chr17_6001763_6002692',
 'chr17_6001765_6002692',
 'chr17_6001969_6002895',
 'chr17_6001969_6002896',
 'chr17_6001970_6002896',
 'chr17_6002054_6002943']

## Annotation: depth

In [5]:
### get the location (midpoint) of a fragment
def get_locs(frg):
    """Return the integer midpoint of a fragment label "chrom_start_end".

    The label is split from the right so that chromosome names which contain
    underscores themselves (e.g. "chr1_KI270706v1_random") are handled.
    Raises ValueError when the label has fewer than three "_"-separated parts
    or when start/end are not integers.
    """
    _, start, end = frg.rsplit("_", 2)
    return (int(start) + int(end)) // 2

def get_depth(frgs, fpath_db = FPATH_DB):
    """Yield the coverage rows stored in the database at the fragment midpoints.

    Parameters
    ----------
    frgs : iterable of str
        Fragment labels "chrom_start_end"; each is reduced to its midpoint
        with get_locs and looked up in Coverage.location.
    fpath_db : str
        Path of the SQLite database holding the Coverage and Sample tables.

    Yields
    ------
    dict
        One dict per joined row with the keys "loc", "sam", "depth", "trt"
        and "size" (Cov.location, Cov.sample, Cov.depth, Sam.treatment,
        Sam.size), ordered by location.

    Notes
    -----
    One "?" placeholder is bound per fragment, so keep the batches small
    enough for SQLite's limit on the number of host parameters.
    """
    ### get location
    locs = list(map(get_locs, frgs))
    if not locs:
        ### nothing to look up; skip opening the database
        return

    ### set query
    txt   = ', '.join('?' for _ in locs)
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """

    ### query out from database
    ### (a sqlite3 connection used as a context manager only commits/rolls back,
    ###  it does not close; close it explicitly once the rows are consumed)
    grp  = "loc", "sam", "depth", "trt", "size"
    conn = sqlite3.connect(fpath_db)
    try:
        cursor = conn.cursor().execute(query, locs)

        ### generate each row
        for row in cursor:
            yield dict(zip(grp, row))
    finally:
        conn.close()

In [6]:
### show the samples that are processed below (TFX_DMSO followed by TFX_DEX)
SAMPLES_OUT

array(['TFX2_DMSO', 'TFX3_DMSO', 'TFX4_DMSO', 'TFX5_DMSO', 'TFX2_Dex',
       'TFX3_Dex', 'TFX4_Dex', 'TFX5_Dex'], dtype='<U29')

In [7]:
%%time

for sample in SAMPLES_OUT:

    ### input file of the sample
    fdiry = os.path.join(FD_RES, "count_fragment", sample)
    fname = "region_dex_GR_P300_dnase_chr17.bed.gz"
    fpath_inp = os.path.join(fdiry, fname)

    ### output file: one line per fragment -> fragment, location, JSON of values per group
    fdiry = os.path.join(FD_RES, "scratch", "region_dex_GR_P300_dnase_chr17")
    fname = f"{sample}_depth.tsv"
    fpath_out = os.path.join(fdiry, fname)

    ### make sure the output directory exists before opening the file for writing
    os.makedirs(fdiry, exist_ok=True)

    ### show file IO
    print(fpath_inp)
    print(fpath_out)

    with gzip.open(fpath_inp, 'rb') as finp, open(fpath_out, 'w') as fout:

        ### loop through each chunk (default chunk size of get_chunks)
        for chunk in get_chunks(finp):
            ### get fragments (the region label of each line is not needed here)
            frgs = np.sort([prep_line(line)[0] for line in chunk])

            ### get depth from database and accumulate depth * 10^6 / size
            ### for each location and treatment group
            ### (presumably counts per million with size = library size -- TODO confirm).
            ### The totals start from 0.0 (float): an integer 0 would become a
            ### numpy integer after rounding, which json.dumps cannot serialize
            ### when a group has no row for a location.
            dct = defaultdict(lambda: {k: 0.0 for k in GROUPS})
            for row in get_depth(frgs):
                cpm = float(row["depth"]) * (10**6) / float(row["size"])
                dct[row["loc"]][row["trt"]] += cpm

            ### get depth foreach fragment
            for frg in frgs:
                loc = get_locs(frg)

                ### round the values; float() converts the numpy scalars
                ### into builtin floats before they are dumped as JSON
                tmp = {key: float(np.round(val, decimals=5)) for key, val in dct[loc].items()}

                ### output to a line
                line = "\t".join([str(frg), str(loc), json.dumps(tmp)])
                fout.write(line + "\n")

/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX2_DMSO_depth.tsv
/mount/work/out/proj_combeffect/count_fragment/TFX3_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX3_DMSO_depth.tsv
/mount/work/out/proj_combeffect/count_fragment/TFX4_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX4_DMSO_depth.tsv
/mount/work/out/proj_combeffect/count_fragment/TFX5_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX5_DMSO_depth.tsv
/mount/work/out/proj_combeffect/count_fragment/TFX2_Dex/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX2_Dex_depth.tsv
/mount/work/out/proj_combeffect/count_fragment/TFX3_Dex/region_dex_GR_P3

In [16]:
### inspect the JSON encoding of `tmp`, which is left over from the loop above
### NOTE(review): this cell was executed out of order (In [16]); the displayed
### output depends on the kernel state at that time
json.dumps(tmp)

'{"Input": 6.134, "Input_20x": 6.45, "TFX_DMSO": 2.598, "TFX_Dex": 6.497}'