**set environment**

In [1]:
import sys
# Extend the import search path with the parent directory before importing the
# project configuration module below.
sys.path.append('../')
# Star import of the project configuration. Later cells use np, plt, os and
# FD_RES without importing them, so they presumably come from here — TODO confirm.
from config_sing import *
# Print the environment summary (see the cell output that follows).
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [2]:
import sqlite3
import itertools as it
from collections import defaultdict
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import make_axes_locatable
### the seaborn style sheets were renamed to "seaborn-v0_8-*" in matplotlib 3.6
### and the old names were removed later; use whichever name this install provides
_STYLE_WHITEGRID = "seaborn-v0_8-whitegrid"
plt.style.use(_STYLE_WHITEGRID if _STYLE_WHITEGRID in plt.style.available else "seaborn-whitegrid")

import math
import random
from sklearn.manifold import TSNE
from functools import partial, reduce
print = partial(print, flush=True)

### Set Samples
### element-wise string concatenation; np.char.add is the public spelling of
### np.core.defchararray.add (the np.core namespace is private/deprecated in NumPy 2.0)
fun = np.char.add
idx = np.arange(1,6).astype("str")

### reduce(fun, [a, b, c]) == fun(fun(a, b), c): prefix + replicate index (+ suffix)
INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
### the TFX samples use idx[1:], i.e. replicates 2..5 only
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
### INPUT (without the "_20x" suffix) is deliberately not part of SAMPLES
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])

### location of the SQLite fragment database: <FD_RES>/database/<fname>
fname    = "fragment_chr17.db"
fdiry    = os.path.join(FD_RES, "database")
FPATH_DB = os.path.join(fdiry, fname)

## Helper functions

In [3]:
def get_frag(sample, fpath_db = FPATH_DB):
    """Lazily yield the rows of the Count table that belong to one sample.

    Parameters
    ----------
    sample : str
        Value matched against ``Count.sample``.
    fpath_db : str
        Path of the SQLite database file (defaults to ``FPATH_DB``).

    Yields
    ------
    tuple
        ``(fragment, count, sample)`` for every matching row, in the order
        SQLite returns them.

    Notes
    -----
    This is a generator: the database is opened on the first ``next()`` and
    closed when the generator is exhausted, closed, or garbage-collected.
    """
    from contextlib import closing

    ### set query; the sample is bound as a parameter instead of being
    ### pasted into the SQL text (safe for quotes / special characters)
    query = """
        SELECT Cnt.fragment, Cnt.count, Cnt.sample
        FROM   Count Cnt
        WHERE  Cnt.sample = ?
        """

    ### query out fragments from the sample
    ### `with sqlite3.connect(...)` alone only wraps a transaction and leaves
    ### the connection open; closing() releases it once iteration ends
    with closing(sqlite3.connect(fpath_db)) as conn:
        cursor = conn.cursor()
        ### str() mirrors the original f-string formatting (e.g. numpy strings)
        for row in cursor.execute(query, (str(sample),)):
            yield row

In [6]:
def get_frag_astarr(sample, start=None, end=None, fpath_db = FPATH_DB):
    """Yield ATAC-STARR-seq fragment rows of one sample, repeated by count.

    Rows come from ``Count`` joined to ``Fragment`` on the fragment id. Each
    row is yielded ``count`` times in a row, so the output stream weights
    every fragment by its count (no random sampling happens here).

    Parameters
    ----------
    sample : str
        Value matched against ``Count.sample``.
    start : int, optional
        If given, keep only fragments with ``Fragment.start >= start``.
    end : int, optional
        If given, keep only fragments with ``Fragment.end <= end``.
        ``start`` and ``end`` may be supplied independently.
    fpath_db : str
        Path of the SQLite database file (defaults to ``FPATH_DB``); it is
        opened read-only.

    Yields
    ------
    tuple
        ``(fragment, sample, pct_gc, count)``, emitted ``count`` times.

    Notes
    -----
    This is a generator: the database is opened on the first ``next()`` and
    closed when the generator is exhausted, closed, or garbage-collected.
    """
    from contextlib import closing

    ### set query: build the WHERE clause only from the filters actually given,
    ### so passing just one of start/end no longer compares against 'None'
    conditions = ["Cnt.sample = ?"]
    params     = [str(sample)]
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(int(start))  # int() also accepts numpy integers
    if end is not None:
        conditions.append("Frg.end   <= ?")
        params.append(int(end))

    ### only fixed SQL fragments are interpolated; all values are bound parameters
    where = " AND ".join(conditions)
    query = f"""
        SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  {where}
        """

    ### open the database read-only through a URI
    uri = "file:" + fpath_db + "?mode=ro"
    ### `with sqlite3.connect(...)` alone would leave the connection open;
    ### closing() releases it once iteration ends
    with closing(sqlite3.connect(uri, uri=True)) as conn:
        ### query the fragment annotations
        cursor = conn.cursor()

        ### generate fragments
        for row in cursor.execute(query, params):
            ### last column is the count (see SELECT order above)
            count = row[3]

            ### repeat each fragment based on its count
            for _ in range(count):
                yield row

In [9]:
%%time
### preview: first N expanded rows of the first sample, no coordinate filter
N, sam = 10, SAMPLES[0]
gen = get_frag_astarr(sample=sam)
lst = [row for row in it.islice(gen, N)]
print(lst)

[('chr17_107410_108464', 'Input1_20x', 0.487666, 1), ('chr17_159026_160040', 'Input1_20x', 0.493097, 1), ('chr17_159426_160303', 'Input1_20x', 0.508552, 1), ('chr17_159510_160362', 'Input1_20x', 0.502347, 1), ('chr17_159977_160849', 'Input1_20x', 0.541284, 1), ('chr17_174938_176057', 'Input1_20x', 0.531725, 1), ('chr17_175103_176248', 'Input1_20x', 0.531878, 1), ('chr17_177613_178563', 'Input1_20x', 0.545263, 1), ('chr17_180289_181411', 'Input1_20x', 0.776292, 1), ('chr17_197552_198593', 'Input1_20x', 0.569645, 1)]
CPU times: user 2.63 ms, sys: 1.09 ms, total: 3.72 ms
Wall time: 5.23 ms


In [10]:
%%time
### preview: first N expanded rows of the first sample inside a coordinate window
N, sam = 10, SAMPLES[0]
gen = get_frag_astarr(sample=sam, start=6002561, end=6003866)
lst = [row for row in it.islice(gen, N)]
print(lst)

[('chr17_6002565_6003491', 'Input1_20x', 0.475162, 1), ('chr17_6002566_6003603', 'Input1_20x', 0.486982, 1), ('chr17_6002570_6003514', 'Input1_20x', 0.474576, 1), ('chr17_6002571_6003514', 'Input1_20x', 0.47508, 1), ('chr17_6002587_6003726', 'Input1_20x', 0.489025, 2), ('chr17_6002587_6003726', 'Input1_20x', 0.489025, 2), ('chr17_6002589_6003507', 'Input1_20x', 0.476035, 1), ('chr17_6002591_6003595', 'Input1_20x', 0.484064, 1), ('chr17_6002597_6003593', 'Input1_20x', 0.482932, 1), ('chr17_6002603_6003647', 'Input1_20x', 0.478927, 1)]
CPU times: user 1.48 s, sys: 1.58 s, total: 3.07 s
Wall time: 3min 1s


In [4]:
%%time
### seed kept for the planned random sampling; the loop below is deterministic as written
random.seed(123)
N = 10

lst_frg = []
for sam in SAMPLES:
    print(sam)

    ### take the first N rows of this sample (not a random selection yet)
    gen = get_frag(sample=sam)
    lst = [*it.islice(gen, N)]
    print(lst)
    ### TODO: random sampling is not wired in; the disabled steps were
    ###   lst = iter_sample_fast(gen, N)
    ###   lst = np.sort(lst, axis=0)
    ###   lst_frg.append(lst)
    ### (iter_sample_fast is not defined in the cells shown above)

Input1_20x
[('chr17_107410_108464', 1, 'Input1_20x'), ('chr17_159026_160040', 1, 'Input1_20x'), ('chr17_159426_160303', 1, 'Input1_20x'), ('chr17_159510_160362', 1, 'Input1_20x'), ('chr17_159977_160849', 1, 'Input1_20x'), ('chr17_174938_176057', 1, 'Input1_20x'), ('chr17_175103_176248', 1, 'Input1_20x'), ('chr17_177613_178563', 1, 'Input1_20x'), ('chr17_180289_181411', 1, 'Input1_20x'), ('chr17_197552_198593', 1, 'Input1_20x')]
Input2_20x
[('chr17_158043_159067', 1, 'Input2_20x'), ('chr17_159299_160311', 1, 'Input2_20x'), ('chr17_159510_160362', 1, 'Input2_20x'), ('chr17_170418_172531', 1, 'Input2_20x'), ('chr17_180525_181456', 1, 'Input2_20x'), ('chr17_182039_183141', 1, 'Input2_20x'), ('chr17_186344_187357', 1, 'Input2_20x'), ('chr17_197528_198605', 1, 'Input2_20x'), ('chr17_197546_198559', 1, 'Input2_20x'), ('chr17_197583_198596', 1, 'Input2_20x')]
Input3_20x
[('chr17_158183_159063', 1, 'Input3_20x'), ('chr17_159026_160041', 1, 'Input3_20x'), ('chr17_159027_160041', 1, 'Input3_20x')