In [2]:
import sys
sys.path.append('../')
from config_sing import *
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [10]:
### import tools
import math
import random
import pickle
import sqlite3
import itertools as it
from functools import reduce
from collections import defaultdict
from sklearn.manifold import TSNE

### plotting
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import make_axes_locatable
plt.style.use("seaborn-whitegrid")

### Set Samples
fun = np.core.defchararray.add
idx = np.arange(1,6).astype("str")

INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])

### file path of database
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment_test_chr17.db"
FPATH_DB = os.path.join(fdiry, fname)

In [11]:
### set query
query_out = """
    SELECT name 
    FROM sqlite_master 
    WHERE type = 'table';
    """

### get table names
fpath_db = FPATH_DB
with sqlite3.connect(fpath_db) as conn:
    query  = query_out
    cursor = conn.cursor()
    cursor = cursor.execute(query)
    rows   = cursor.fetchall()
    rows   = np.sort([row[0] for row in rows])

for row in rows:
    print(row)

Annotation
Count
Coverage
Fragment
Motif
Sample


## Helper function

In [35]:
def get_frag(sample, fpath_db=FPATH_DB):
    ### set query
    query_out = f"""
        SELECT Cnt.fragment, Cnt.count, Cnt.sample
        FROM   Count Cnt
        WHERE  Cnt.sample = '{sample}'
        """
    
    ### query out fragments from the sample
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        for row in rows:
            yield row

def get_pct(frags, fpath_db=FPATH_DB):
    """..."""
    ###
    txt = ', '.join('?' for _ in frags)
    query_out = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        """
    
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query, frags)
    return rows

def get_depth(frag):
    """..."""
    ###
    chrom, start, end = frag.split("_")
    mid = math.ceil((int(start) + int(end)) / 2)
    query_out = f"""
        SELECT Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM   Coverage Cov
        JOIN   Sample Sam ON Cov.sample = Sam.sample
        WHERE  Cov.location = {mid};
        """
    ###
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        cursor.execute(query)
        rows   = cursor.fetchall()
    
    ###
    cnames = ["Sample", "Depth", "Trt", "Size"]
    dat = pd.DataFrame(rows, columns=cnames)
    dat = dat.assign(LogNorm_Depth=lambda x: np.log1p(x.Depth) - np.log(x.Size))
    res1 = pd.Series(dat["LogNorm_Depth"].values, index=dat["Sample"])
    
    ###
    dat = dat.groupby("Trt")["LogNorm_Depth"].mean()
    dat["TFX_DMSO_vs_Inp20x"] = dat["TFX_DMSO"] - dat["Input_20x"]
    dat["TFX_Dex_vs_Inp20x"]  = dat["TFX_Dex"]  - dat["Input_20x"]
    dat["TFX_Dex_vs_DMSO"]    = dat["TFX_Dex"]  - dat["TFX_DMSO"]
    res2 = dat
    
    return pd.concat([res1, res2], axis=0)

def get_annot(frag, fpath_db=FPATH_DB, verbose=False):
    ### set query
    query_out = f"""
        SELECT Mtf.motif, Mtf.score
        FROM Annotation Ant
        JOIN Motif      Mtf ON Ant.binding = Mtf.binding
        WHERE Ant.fragment = '{frag}'
        """
    
    ### query out motif bindings on the fragment
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        
    ### summarize the motif scores
    dct = defaultdict(lambda: 0)
    for idx, row in enumerate(rows):
        ### parse info
        motif, score = row

        ### sum the scores with same motifs
        dct[motif] += score
        
    if verbose:
        print(f"Fragment: {frg}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
            
    return dct

## Query fragments

In [15]:
SAMPLES

array(['Input1_20x', 'Input2_20x', 'Input3_20x', 'Input4_20x',
       'Input5_20x', 'TFX2_DMSO', 'TFX3_DMSO', 'TFX4_DMSO', 'TFX5_DMSO',
       'TFX2_Dex', 'TFX3_Dex', 'TFX4_Dex', 'TFX5_Dex'], dtype='<U30')

In [29]:
%%time
N = 100
lst_frg = list()
samples = ["TFX3_DMSO", "TFX3_Dex"]

for sam in samples:
    ### select a few fragments
    gen = get_frag(sam)
    lst = it.islice(gen, N)
    lst = list(lst)
    
    ### arrange and collect
    lst = np.sort(lst, axis=0)
    lst_frg.append(lst)

CPU times: user 2.51 ms, sys: 787 µs, total: 3.3 ms
Wall time: 2.07 ms


In [30]:
dat = pd.DataFrame(np.concatenate(lst_frg), columns=["Fragment", "Count", "Sample"])
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.split('_', expand=True)

dat = dat.astype({"Start": int, "End": int, "Count": int})
dat = dat.assign(Mid = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.assign(Len = lambda x: x.End - x.Start)
dat = dat.astype({"Mid": int})

dat = dat.set_index("Fragment")
dat_frg = dat
dat_frg

Unnamed: 0_level_0,Count,Sample,Chrom,Start,End,Mid,Len
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chr17_159962_160793,1,TFX3_DMSO,chr17,159962,160793,160378,831
chr17_185415_186343,1,TFX3_DMSO,chr17,185415,186343,185879,928
chr17_201202_202073,1,TFX3_DMSO,chr17,201202,202073,201638,871
chr17_201202_202074,1,TFX3_DMSO,chr17,201202,202074,201638,872
chr17_201205_202074,1,TFX3_DMSO,chr17,201205,202074,201640,869
...,...,...,...,...,...,...,...
chr17_404380_405339,2,TFX3_Dex,chr17,404380,405339,404860,959
chr17_404380_405341,2,TFX3_Dex,chr17,404380,405341,404861,961
chr17_404391_405415,2,TFX3_Dex,chr17,404391,405415,404903,1024
chr17_404391_405417,2,TFX3_Dex,chr17,404391,405417,404904,1026


In [31]:
frags = dat.index.to_numpy()
frags = np.unique(frags)
frags.shape

(195,)

In [25]:
t = 2.38 * 800_000 / 100
t / 1000 # sec

19.04

## Get GC content

In [36]:
%%time
gen = get_pct(frags)
dat_pct = pd.DataFrame(gen, columns = ["Fragment", "Pct_GC"])
dat_pct = dat_pct.set_index("Fragment")
dat_pct

CPU times: user 4.42 ms, sys: 2.82 ms, total: 7.24 ms
Wall time: 5.69 ms


Unnamed: 0_level_0,Pct_GC
Fragment,Unnamed: 1_level_1
chr17_120738_123102,0.359983
chr17_159962_160793,0.553550
chr17_169749_171139,0.660432
chr17_182083_183038,0.573822
chr17_185415_186343,0.494612
...,...
chr17_404429_405442,0.473840
chr17_405553_406431,0.449886
chr17_405554_406431,0.450399
chr17_405555_406430,0.450286


## Get coverage

In [42]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_depth(frg)
    
dat_cov_full = pd.DataFrame.from_dict(dct, orient="index")
dat_cov_full.index.name = 'Fragment'
dat_cov_full

CPU times: user 907 ms, sys: 63.8 ms, total: 971 ms
Wall time: 983 ms


Unnamed: 0_level_0,Input1,Input2,Input3,Input4,Input5,Input1_20x,Input2_20x,Input3_20x,Input4_20x,Input5_20x,...,TFX3_Dex,TFX4_Dex,TFX5_Dex,Input,Input_20x,TFX_DMSO,TFX_Dex,TFX_DMSO_vs_Inp20x,TFX_Dex_vs_Inp20x,TFX_Dex_vs_DMSO
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr17_120738_123102,-16.742248,-16.819604,-16.963148,-16.760157,-16.544997,-19.733648,-19.666666,-19.673427,-19.840188,-18.954570,...,-16.395753,-17.359078,-17.560540,-16.766031,-19.573700,-17.281669,-17.236673,2.292031,2.337027,0.044996
chr17_159962_160793,-16.742248,-16.819604,-16.963148,-16.760157,-16.544997,-19.040500,-19.666666,-18.980280,-19.147041,-18.954570,...,-17.088900,-16.665930,-17.560540,-16.766031,-19.157811,-16.935095,-17.236673,2.222716,1.921139,-0.301578
chr17_169749_171139,-16.742248,-16.819604,-16.963148,-16.760157,-16.544997,-19.733648,-18.973519,-19.673427,-19.840188,-19.647717,...,-16.395753,-17.359078,-17.560540,-16.766031,-19.573700,-17.281669,-17.236673,2.292031,2.337027,0.044996
chr17_182083_183038,-16.742248,-16.819604,-16.963148,-16.067010,-16.544997,-19.733648,-18.973519,-18.574814,-18.230750,-18.549105,...,-16.395753,-17.359078,-17.560540,-16.627401,-18.812367,-17.281669,-17.236673,1.530698,1.575694,0.044996
chr17_185415_186343,-16.742248,-16.819604,-16.963148,-16.760157,-16.544997,-19.733648,-19.666666,-18.980280,-19.840188,-19.647717,...,-17.088900,-17.359078,-17.560540,-16.766031,-19.573700,-17.108382,-17.409960,2.465318,2.163740,-0.301578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr17_404429_405442,-15.132810,-15.027845,-15.017237,-15.373862,-15.158703,-15.376939,-15.390000,-15.355939,-15.296894,-15.343652,...,-14.093168,-14.063241,-14.193244,-15.142091,-15.352684,-15.535490,-14.318231,-0.182805,1.034454,1.217259
chr17_405553_406431,-15.355954,-15.433310,-14.660563,-14.680715,-15.446385,-15.339198,-15.555792,-15.546292,-15.318400,-15.371051,...,-15.297141,-14.720020,-15.481098,-15.115385,-15.426147,-15.624659,-14.915961,-0.198512,0.510186,0.708697
chr17_405554_406431,-15.355954,-15.433310,-14.660563,-14.680715,-15.446385,-15.339198,-15.555792,-15.546292,-15.318400,-15.371051,...,-15.297141,-14.720020,-15.481098,-15.115385,-15.426147,-15.624659,-14.915961,-0.198512,0.510186,0.708697
chr17_405555_406430,-15.355954,-15.433310,-14.660563,-14.680715,-15.446385,-15.339198,-15.555792,-15.546292,-15.318400,-15.371051,...,-15.297141,-14.720020,-15.481098,-15.115385,-15.426147,-15.624659,-14.915961,-0.198512,0.510186,0.708697


In [38]:
dat_cov = dat_cov_full.loc[:,["Input_20x", "TFX_DMSO_vs_Inp20x", "TFX_Dex_vs_Inp20x"]]

In [49]:
t = 5 * 800_000 / 200
t / 60 / 60 # min

5.555555555555555

## Get annotations

In [46]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_annot(frg)
    
dat_ant = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat_ant = dat_ant.add_prefix("Mtf_")
dat_ant.index.name = 'Fragment'
dat_ant

CPU times: user 240 ms, sys: 156 ms, total: 396 ms
Wall time: 13.6 s


Unnamed: 0_level_0,Mtf_HD/25,Mtf_TBX/2,Mtf_KLF/SP/2,Mtf_KLF/SP/1,Mtf_ZIC/2,Mtf_EGR,Mtf_TBX/3,Mtf_MEF2,Mtf_CCAAT/CEBP,Mtf_DDIT3+CEBPA,...,Mtf_ZNF586,Mtf_HD/24,Mtf_ZNF410,Mtf_NR2E3,Mtf_GMEB2/1,Mtf_BATF,Mtf_ZNF232,Mtf_EVI1/MECOM,Mtf_HD/3,Mtf_HSFY2
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr17_120738_123102,7.5426,6.6612,24.166200,18.5122,8.71780,8.4291,16.7869,6.9162,7.8821,17.2874,...,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.000000
chr17_185415_186343,7.8272,10.5770,28.916850,47.5126,0.00000,8.6068,8.5285,0.0000,7.4018,0.0000,...,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.000000
chr17_405553_406431,7.4013,23.7199,26.300375,5.5367,0.00000,0.0000,7.9212,34.9609,0.0000,0.0000,...,0.0,0.0,5.3271,0.0,0.0,9.3144,0.0,13.8167,0.0,0.000000
chr17_405554_406431,7.4013,23.7199,26.300375,5.5367,0.00000,0.0000,7.9212,34.9609,0.0000,0.0000,...,0.0,0.0,5.3271,0.0,0.0,9.3144,0.0,13.8167,0.0,0.000000
chr17_405555_406430,7.4013,23.7199,26.300375,5.5367,0.00000,0.0000,7.9212,34.9609,0.0000,0.0000,...,0.0,0.0,5.3271,0.0,0.0,9.3144,0.0,13.8167,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr17_249612_250572,0.0000,0.0000,0.000000,0.0000,7.99585,0.0000,8.2346,0.0000,0.0000,0.0000,...,0.0,0.0,1.9919,0.0,0.0,0.0000,0.0,0.0000,0.0,0.000000
chr17_249613_250572,0.0000,0.0000,0.000000,0.0000,7.99585,0.0000,8.2346,0.0000,0.0000,0.0000,...,0.0,0.0,1.9919,0.0,0.0,0.0000,0.0,0.0000,0.0,0.000000
chr17_402824_404444,0.0000,0.0000,0.000000,0.0000,0.00000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.000000
chr17_402946_404554,0.0000,0.0000,0.000000,0.0000,0.00000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,17.155067


In [50]:
t = 10 * 800_000 / 200
t / 60 / 60 # min

11.11111111111111