**Set environemnt**

In [1]:
import sys
sys.path.append('../')
from config_sing import *

You are on Duke Server: Singularity: Proj CombEffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect


In [2]:
import sqlite3
import itertools as it
from collections import defaultdict
from mpl_toolkits.axes_grid1 import make_axes_locatable
plt.style.use("seaborn-whitegrid")

import math
import random
from sklearn.manifold import TSNE
from functools import reduce

### Set Samples
fun = np.core.defchararray.add
idx = np.arange(1,6).astype("str")

INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])

### file path of database
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment.db"
FPATH_DB = os.path.join(fdiry, fname)

In [3]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    
    ### init
    results = []
    iterator = iter(iterable)
    
    ### Fill in the first samplesize elements:
    try:
        for _ in range(samplesize):
            results.append(next(iterator))
    except StopIteration:
        ### return everything if samplesize > len(iterable)
        random.shuffle(results)
        return results
    
    ### continue iterating through the elements and update the list
    random.shuffle(results)  # Randomize their positions
    for i, v in enumerate(iterator, samplesize):
        r = random.randint(0, i)
        if r < samplesize:
            results[r] = v  # at a decreasing rate, replace random items
    return results

In [4]:
def get_frag(sample):
    ### set query
    query_out = f"""
        SELECT Cnt.fragment, Cnt.count, Cnt.sample
        FROM   Count Cnt
        WHERE  Cnt.sample = '{sample}'
        """
    
    ### file path of database
    fdiry = os.path.join(FD_RES, 'database')
    fname = "fragment.db"
    fpath_db = os.path.join(fdiry, fname)

    ### query out fragments from the sample
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        for row in rows:
            yield row

def get_pct(frags):
    """"""
    ###
    txt = ', '.join('?' for _ in frags)
    query_out = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        ORDER BY Frg.fragment
        """
    fpath_db = FPATH_DB
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query, frags)
    return rows

def get_depth(frag):
    """..."""
    ###
    chrom, start, end = frag.split("_")
    mid = math.ceil((int(start) + int(end)) / 2)
    query_out = f"""
        SELECT Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM   Coverage Cov
        JOIN   Sample Sam ON Cov.sample = Sam.sample
        WHERE  Cov.location = {mid};
        """
    ###
    fpath_db = FPATH_DB
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        cursor.execute(query)
        rows   = cursor.fetchall()
    
    ###
    cnames = ["Sample", "Depth", "Trt", "Size"]
    dat = pd.DataFrame(rows, columns=cnames)
    dat = dat.assign(LogNorm_Depth=lambda x: np.log1p(x.Depth) - np.log(x.Size))
    res1 = pd.Series(dat["LogNorm_Depth"].values, index=dat["Sample"])
    
    ###
    dat = dat.groupby("Trt")["LogNorm_Depth"].mean()
    dat["TFX_DMSO_vs_Inp20x"] = dat["TFX_DMSO"] - dat["Input_20x"]
    dat["TFX_Dex_vs_Inp20x"]  = dat["TFX_Dex"]  - dat["Input_20x"]
    res2 = dat
    
    return pd.concat([res1, res2], axis=0)

def get_annot(frag, verbose=False):
    ###
    chrom, start, end = frag.split("_")
    
    ### set query
    #query_out = f"""
    #    SELECT a.fragment, m.motif, m.score
    #    FROM Annotation a
    #    JOIN Motif      m ON a.binding = m.binding
    #    WHERE a.fragment = '{frag}'
    #    ;"""
    query_out = (f"""
        SELECT Mtf.motif, Mtf.score
        FROM Motif Mtf
        WHERE Mtf.start >= {start} AND Mtf.end <= {end}
        ;""")
        #WHERE m.chrom = '{chrom}' AND m.start >= {start} AND m.end <= {end}
    ### file path of database
    fdiry = os.path.join(FD_RES, 'database')
    fname = "fragment.db"
    fpath_db = os.path.join(fdiry, fname)

    ### query out motif bindings on the fragment
    dct = defaultdict(lambda: 0)
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        rows   = list(rows)
        #for idx, row in enumerate(rows):
        #    ### parse info
        #    motif, score = row
        #    
        #    ### sum the scores with same motifs
        #    dct[motif] += score
    
    #if verbose:
    #    print(f"Fragment: {frag}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
    print(f"Fragment: {frag}")
    return rows


def get_annot3(frg, fpath_db=FPATH_DB, verbose=True):
    ### parse fragment info
    chrom, start, end = frg.split("_")
    
    with sqlite3.connect(fpath_db) as conn:
        ### set query: query all motifs binding to a given fragment
        query_out = f"""
            SELECT Ant.binding
            FROM Annotation Ant
            WHERE Ant.fragment = '{frg}'
            ;"""    
        
        ### query out motif bindings on the fragment
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
    
        ### set query: query the corresponding motif info of each annotation
        fun  = lambda x, y: x + y
        mtfs = list(rows)
        mtfs = reduce(fun, mtfs)
        txt  = ','.join('?' * len(mtfs))

        query_out = f"""
            SELECT Mtf.motif, Mtf.score
            FROM Motif Mtf
            WHERE Mtf.binding IN ({txt})
            ;"""
    
        ### query out motif info
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query, mtfs)
    
    ### summarize the motif scores
    dct = defaultdict(lambda: 0)
    for idx, row in enumerate(rows):
        ### parse info
        motif, score = row

        ### sum the scores with same motifs
        dct[motif] += score
        
    if verbose:
        print(f"Fragment: {frg}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
    
    return dct

In [5]:
N   = 10
random.seed(123)
sam = "Input1_20x"
gen = get_frag(sam)
lst = iter_sample_fast(gen, N)
lst = np.sort(lst, axis=0)
print(len(lst))
lst

10


array([['chr17_8148522_8149462', '1', 'Input1_20x'],
       ['chr17_8149255_8150236', '1', 'Input1_20x'],
       ['chr17_8149779_8150654', '1', 'Input1_20x'],
       ['chr17_8149919_8150849', '1', 'Input1_20x'],
       ['chr17_8150088_8151099', '1', 'Input1_20x'],
       ['chr17_8150346_8151352', '1', 'Input1_20x'],
       ['chr17_8151372_8152277', '1', 'Input1_20x'],
       ['chr17_8151907_8152829', '1', 'Input1_20x'],
       ['chr17_8152675_8153701', '2', 'Input1_20x'],
       ['chr17_8156648_8157597', '3', 'Input1_20x']], dtype='<U21')

In [6]:
query_out = """
    SELECT name 
    FROM sqlite_master 
    WHERE type = 'index';
    """
###
fpath_db = FPATH_DB
with sqlite3.connect(fpath_db) as conn:
    query  = query_out
    cursor = conn.cursor()
    cursor.execute(query)
    rows   = cursor.fetchall()
rows

[('sqlite_autoindex_Sample_1',),
 ('sqlite_autoindex_Fragment_1',),
 ('sqlite_autoindex_Motif_1',),
 ('idx_location',),
 ('idx_motif_loc',),
 ('sqlite_autoindex_Annotation_1',)]

In [7]:
%%time
random.seed(123)
N = 10

lst_frg = list()
for sam in SAMPLES:
    print(sam)
    
    ### random select fragments
    gen = get_frag(sam)
    lst = iter_sample_fast(gen, N)
    lst = np.sort(lst, axis=0)
    lst_frg.append(lst)

Input1_20x
Input2_20x
Input3_20x
Input4_20x
Input5_20x
TFX2_DMSO
TFX3_DMSO
TFX4_DMSO
TFX5_DMSO
TFX2_Dex
TFX3_Dex
TFX4_Dex
TFX5_Dex
CPU times: user 103 ms, sys: 17 ms, total: 120 ms
Wall time: 118 ms


In [8]:
dat = pd.DataFrame(np.concatenate(lst_frg), columns=["Fragment", "Count", "Sample"])
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.split('_', expand=True)

dat = dat.astype({"Start": int, "End": int, "Count": int})
dat = dat.assign(Mid = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Mid": int})

dat = dat.set_index("Fragment")
dat_frg = dat
dat_frg

Unnamed: 0_level_0,Count,Sample,Chrom,Start,End,Mid
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr17_8148522_8149462,1,Input1_20x,chr17,8148522,8149462,8148992
chr17_8149255_8150236,1,Input1_20x,chr17,8149255,8150236,8149746
chr17_8149779_8150654,1,Input1_20x,chr17,8149779,8150654,8150217
chr17_8149919_8150849,1,Input1_20x,chr17,8149919,8150849,8150384
chr17_8150088_8151099,1,Input1_20x,chr17,8150088,8151099,8150594
...,...,...,...,...,...,...
chr17_8151684_8152721,1,TFX5_Dex,chr17,8151684,8152721,8152203
chr17_8153613_8154670,1,TFX5_Dex,chr17,8153613,8154670,8154142
chr17_8153620_8154586,2,TFX5_Dex,chr17,8153620,8154586,8154103
chr17_8153998_8155033,2,TFX5_Dex,chr17,8153998,8155033,8154516


In [9]:
frags = dat.index.to_numpy()
frags = np.unique(frags)
frags.shape

(129,)

In [10]:
%%time
gen = get_pct(frags)
dat_pct = pd.DataFrame(gen, columns = ["Fragment", "Pct_GC"])
dat_pct = dat_pct.set_index("Fragment")
dat_pct

CPU times: user 3.46 ms, sys: 4 ms, total: 7.45 ms
Wall time: 6.2 ms


Unnamed: 0_level_0,Pct_GC
Fragment,Unnamed: 1_level_1
chr17_8148192_8149098,0.569536
chr17_8148308_8149342,0.558994
chr17_8148371_8149369,0.553106
chr17_8148522_8149462,0.569149
chr17_8148537_8149571,0.580271
...,...
chr17_8156648_8157597,0.538462
chr17_8157616_8158582,0.540373
chr17_8158846_8159873,0.524830
chr17_8159408_8160355,0.517423


In [11]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_depth(frg)
    
dat_cov = pd.DataFrame.from_dict(dct, orient="index")
dat_cov.index.name = 'Fragment'
dat_cov

CPU times: user 973 ms, sys: 67.5 ms, total: 1.04 s
Wall time: 2.83 s


Unnamed: 0_level_0,Input1,Input2,Input3,Input4,Input5,Input1_20x,Input2_20x,Input3_20x,Input4_20x,Input5_20x,...,TFX2_Dex,TFX3_Dex,TFX4_Dex,TFX5_Dex,Input,Input_20x,TFX_DMSO,TFX_Dex,TFX_DMSO_vs_Inp20x,TFX_Dex_vs_Inp20x
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr17_8148192_8149098,-14.177299,-14.334697,-14.255097,-14.121099,-14.242412,-13.895917,-14.042648,-13.917685,-14.014188,-13.967544,...,-14.165585,-13.592392,-14.525864,-14.264703,-14.226121,-13.967596,-13.494354,-14.137136,0.473243,-0.169540
chr17_8148308_8149342,-14.439663,-14.047015,-14.255097,-14.562932,-14.465556,-13.990644,-13.989912,-13.862286,-14.081287,-13.970963,...,-14.020403,-13.592392,-14.874171,-14.341664,-14.354053,-13.979018,-13.577040,-14.207158,0.401978,-0.228139
chr17_8148371_8149369,-14.439663,-13.986391,-14.190559,-14.680715,-14.465556,-13.984255,-13.976306,-13.838616,-14.038070,-13.933984,...,-14.020403,-13.592392,-14.874171,-14.094804,-14.352577,-13.954246,-13.553238,-14.145443,0.401008,-0.191196
chr17_8148522_8149462,-14.344353,-13.775082,-14.255097,-14.562932,-14.242412,-14.006800,-13.952933,-13.889602,-14.047175,-13.953985,...,-13.942441,-13.533552,-14.314555,-13.977021,-14.235975,-13.970099,-13.619920,-13.941892,0.350179,0.028206
chr17_8148537_8149571,-14.344353,-13.728562,-14.129934,-14.362262,-14.347773,-13.943687,-13.901475,-13.868292,-14.023077,-13.924132,...,-13.917749,-13.451314,-14.314555,-13.871660,-14.182577,-13.932133,-13.680365,-13.888820,0.251768,0.043313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr17_8156648_8157597,-13.909035,-14.111554,-13.436787,-13.987568,-13.836947,-13.762386,-13.753163,-13.770793,-13.811910,-13.930689,...,-13.016200,-13.197080,-12.763958,-12.583806,-13.856378,-13.805788,-12.508154,-12.890261,1.297634,0.915527
chr17_8157616_8158582,-14.439663,-15.433310,-15.576853,-14.814247,-14.347773,-14.777820,-14.806853,-14.902742,-14.609080,-14.706075,...,-14.858732,-17.088900,-15.161853,-17.560540,-14.922369,-14.760514,-15.362203,-16.167506,-0.601689,-1.406992
chr17_8158846_8159873,-14.545023,-14.622379,-14.398198,-14.562932,-14.347773,-14.208195,-14.282171,-14.205367,-14.351251,-14.231617,...,-16.021883,-17.088900,-17.359078,-16.174246,-14.495261,-14.255720,-16.335621,-16.661027,-2.079902,-2.405307
chr17_8159408_8160355,-14.344353,-14.622379,-14.255097,-14.680715,-15.158703,-14.278326,-14.451730,-14.390223,-14.541871,-14.553967,...,-16.021883,-15.297141,-15.413168,-15.768780,-14.612250,-14.443223,-15.967295,-15.625243,-1.524072,-1.182019


In [13]:
2.5 / 130 * 6500

125.0

In [23]:
%%time
frg = "chr17_8148192_8149098"
dct = get_annot3(frg, verbose=True)
lst = np.array(lst)
tmp = np.unique(lst[:,1])
print(lst.shape, tmp.shape)

Fragment: chr17_8148192_8149098 828
(828, 2) (207,)
CPU times: user 480 ms, sys: 351 ms, total: 831 ms
Wall time: 833 ms


In [20]:
tmp

array(['chr17_8148192_8149098', 'chr17_8148196_8148204_HD/22',
       'chr17_8148197_8148217_Ebox/CAGATGG',
       'chr17_8148197_8148217_GC-tract', 'chr17_8148202_8148210_MYB/5',
       'chr17_8148202_8148212_Ebox/CAGCTG',
       'chr17_8148203_8148226_KLF/SP/2',
       'chr17_8148204_8148211_Ebox/CACCTG',
       'chr17_8148207_8148220_KLF/SP/1', 'chr17_8148236_8148250_GLIS',
       'chr17_8148238_8148250_INSM1', 'chr17_8148243_8148253_NFKB/1',
       'chr17_8148243_8148257_PLAG1', 'chr17_8148244_8148262_REST/NRSF',
       'chr17_8148247_8148265_TBX/3', 'chr17_8148247_8148266_ZNF143',
       'chr17_8148251_8148273_ZNF41', 'chr17_8148252_8148271_ZNF320',
       'chr17_8148255_8148275_GC-tract', 'chr17_8148257_8148268_AP1/2',
       'chr17_8148259_8148277_ZNF554', 'chr17_8148263_8148280_ZNF768',
       'chr17_8148272_8148282_GLI', 'chr17_8148276_8148291_NR/15',
       'chr17_8148298_8148343_GC-tract', 'chr17_8148299_8148313_HIC/1',
       'chr17_8148310_8148319_SREBF1', 'chr17_8148316_8

In [12]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_annot3(frg, verbose=True)

Fragment: chr17_8148192_8149098; # Bindings: 207; # Motifs: 104
Fragment: chr17_8148308_8149342; # Bindings: 238; # Motifs: 119
Fragment: chr17_8148371_8149369; # Bindings: 229; # Motifs: 118
Fragment: chr17_8148522_8149462; # Bindings: 244; # Motifs: 121
Fragment: chr17_8148537_8149571; # Bindings: 278; # Motifs: 126
Fragment: chr17_8148942_8150026; # Bindings: 270; # Motifs: 126
Fragment: chr17_8148968_8149983; # Bindings: 246; # Motifs: 121
Fragment: chr17_8149102_8150207; # Bindings: 264; # Motifs: 125
Fragment: chr17_8149202_8150152; # Bindings: 228; # Motifs: 119
Fragment: chr17_8149255_8150236; # Bindings: 228; # Motifs: 116
Fragment: chr17_8149497_8150482; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149499_8150480; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149748_8150805; # Bindings: 241; # Motifs: 114
Fragment: chr17_8149775_8150701; # Bindings: 203; # Motifs: 104
Fragment: chr17_8149779_8150654; # Bindings: 188; # Motifs: 100
Fragment: chr17_8149786_8150753; # Bindi

In [14]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_annot(frg, verbose=True)

Fragment: chr17_8148192_8149098
Fragment: chr17_8148308_8149342
Fragment: chr17_8148371_8149369
Fragment: chr17_8148522_8149462
Fragment: chr17_8148537_8149571
Fragment: chr17_8148942_8150026
Fragment: chr17_8148968_8149983
Fragment: chr17_8149102_8150207
Fragment: chr17_8149202_8150152
Fragment: chr17_8149255_8150236
Fragment: chr17_8149497_8150482
Fragment: chr17_8149499_8150480
Fragment: chr17_8149748_8150805
Fragment: chr17_8149775_8150701
Fragment: chr17_8149779_8150654
Fragment: chr17_8149786_8150753
Fragment: chr17_8149787_8150748
Fragment: chr17_8149804_8150896
Fragment: chr17_8149810_8150902
Fragment: chr17_8149875_8150843
Fragment: chr17_8149919_8150849
Fragment: chr17_8150082_8151049
Fragment: chr17_8150088_8151099
Fragment: chr17_8150346_8151352
Fragment: chr17_8150419_8151281
Fragment: chr17_8150421_8151368
Fragment: chr17_8150423_8151409
Fragment: chr17_8150426_8151276
Fragment: chr17_8150473_8151514
Fragment: chr17_8150505_8151321
Fragment: chr17_8150638_8151508
Fragment

In [19]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_annot(frg, verbose=True)
    
dat_ant = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat_ant

Fragment: chr17_8148192_8149098; # Bindings: 207; # Motifs: 104
Fragment: chr17_8148308_8149342; # Bindings: 238; # Motifs: 119
Fragment: chr17_8148371_8149369; # Bindings: 229; # Motifs: 118
Fragment: chr17_8148522_8149462; # Bindings: 244; # Motifs: 121
Fragment: chr17_8148537_8149571; # Bindings: 278; # Motifs: 126
Fragment: chr17_8148942_8150026; # Bindings: 270; # Motifs: 126
Fragment: chr17_8148968_8149983; # Bindings: 246; # Motifs: 121
Fragment: chr17_8149102_8150207; # Bindings: 264; # Motifs: 125
Fragment: chr17_8149202_8150152; # Bindings: 228; # Motifs: 119
Fragment: chr17_8149255_8150236; # Bindings: 228; # Motifs: 116
Fragment: chr17_8149497_8150482; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149499_8150480; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149748_8150805; # Bindings: 241; # Motifs: 114
Fragment: chr17_8149775_8150701; # Bindings: 203; # Motifs: 104
Fragment: chr17_8149779_8150654; # Bindings: 188; # Motifs: 100
Fragment: chr17_8149786_8150753; # Bindi

Unnamed: 0,HD/22,Ebox/CAGATGG,GC-tract,MYB/5,Ebox/CAGCTG,KLF/SP/2,Ebox/CACCTG,KLF/SP/1,GLIS,INSM1,...,NR/14,DDIT3+CEBPA,IRF/4,OVOL1,HLTF,HD/25,AIRE,SOX/5,HD/7,CREB/ATF/3
chr17_8148192_8149098,14.6036,35.13880,75.147987,9.1443,19.95310,98.509625,11.3327,63.95465,7.9934,19.9736,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148308_8149342,7.3287,24.84800,79.606299,0.0000,9.66970,125.719225,0.0000,54.42955,3.9846,7.5934,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148371_8149369,7.3287,25.03620,81.191616,0.0000,9.66970,119.292975,0.0000,54.42955,3.9846,14.4522,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148522_8149462,7.3287,25.03620,70.541116,0.0000,9.66970,120.509942,8.3829,54.42955,10.9963,21.3256,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148537_8149571,7.3287,25.03620,79.562016,0.0000,18.92440,127.703642,28.5823,54.42955,15.7707,21.3256,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr17_8156648_8157597,30.6499,7.29210,42.570350,0.0000,25.69250,54.087850,0.0000,13.59790,0.0000,0.0000,...,7.6234,7.8289,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8157616_8158582,18.6213,0.00000,109.272767,0.0000,17.17370,87.904137,18.1517,25.86000,6.1203,0.0000,...,7.4834,8.6869,6.0865,8.0798,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8158846_8159873,7.2749,26.64630,64.984319,0.0000,24.34465,108.724865,9.7583,77.16285,27.9354,21.3141,...,0.0000,0.0000,0.0000,0.0000,5.9501,7.3717,7.5632,5.7624,0.0000,0.0000
chr17_8159408_8160355,7.8631,19.18235,77.019894,8.8571,34.98770,104.709915,19.0668,59.79600,19.7511,5.3954,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,7.3736,5.7624,7.4541,6.7701


In [84]:
%%time
dct = dict()
for frg in frags:
    dct[frg] = get_annot(frg, verbose=True)
    
dat_ant = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat_ant

Fragment: chr17_8148192_8149098; # Bindings: 207; # Motifs: 104
Fragment: chr17_8148308_8149342; # Bindings: 238; # Motifs: 119
Fragment: chr17_8148371_8149369; # Bindings: 229; # Motifs: 118
Fragment: chr17_8148522_8149462; # Bindings: 244; # Motifs: 121
Fragment: chr17_8148537_8149571; # Bindings: 278; # Motifs: 126
Fragment: chr17_8148942_8150026; # Bindings: 270; # Motifs: 126
Fragment: chr17_8148968_8149983; # Bindings: 246; # Motifs: 121
Fragment: chr17_8149102_8150207; # Bindings: 264; # Motifs: 125
Fragment: chr17_8149202_8150152; # Bindings: 228; # Motifs: 119
Fragment: chr17_8149255_8150236; # Bindings: 228; # Motifs: 116
Fragment: chr17_8149497_8150482; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149499_8150480; # Bindings: 199; # Motifs: 106
Fragment: chr17_8149748_8150805; # Bindings: 241; # Motifs: 114
Fragment: chr17_8149775_8150701; # Bindings: 203; # Motifs: 104
Fragment: chr17_8149779_8150654; # Bindings: 188; # Motifs: 100
Fragment: chr17_8149786_8150753; # Bindi

Unnamed: 0,HD/22,MYB/5,Ebox/CACCTG,Ebox/CAGCTG,Ebox/CAGATGG,GC-tract,KLF/SP/1,KLF/SP/2,GLIS,INSM1,...,NR/14,DDIT3+CEBPA,IRF/4,OVOL1,HLTF,HD/25,AIRE,SOX/5,HD/7,CREB/ATF/3
chr17_8148192_8149098,14.6036,9.1443,11.3327,19.95310,35.13880,75.147987,63.95465,98.509625,7.9934,19.9736,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148308_8149342,7.3287,0.0000,0.0000,9.66970,24.84800,79.606299,54.42955,125.719225,3.9846,7.5934,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148371_8149369,7.3287,0.0000,0.0000,9.66970,25.03620,81.191616,54.42955,119.292975,3.9846,14.4522,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148522_8149462,7.3287,0.0000,8.3829,9.66970,25.03620,70.541116,54.42955,120.509942,10.9963,21.3256,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8148537_8149571,7.3287,0.0000,28.5823,18.92440,25.03620,79.562016,54.42955,127.703642,15.7707,21.3256,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr17_8156648_8157597,30.6499,0.0000,0.0000,25.69250,7.29210,42.570350,13.59790,54.087850,0.0000,0.0000,...,7.6234,7.8289,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8157616_8158582,18.6213,0.0000,18.1517,17.17370,0.00000,109.272767,25.86000,87.904137,6.1203,0.0000,...,7.4834,8.6869,6.0865,8.0798,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
chr17_8158846_8159873,7.2749,0.0000,9.7583,24.34465,26.64630,64.984319,77.16285,108.724865,27.9354,21.3141,...,0.0000,0.0000,0.0000,0.0000,5.9501,7.3717,7.5632,5.7624,0.0000,0.0000
chr17_8159408_8160355,7.8631,8.8571,19.0668,34.98770,19.18235,77.019894,59.79600,104.709915,19.7511,5.3954,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,7.3736,5.7624,7.4541,6.7701


In [85]:
1.5 / 130 * 6500

75.0

In [83]:
dat = dat_frg.join(dat_pct).join(dat_cov)
dat = dat.reset_index()

dat["Sample"] = pd.Categorical(dat["Sample"], SAMPLES)
dat = dat.sort_values(["Sample", "Fragment"])
dat

Unnamed: 0,Fragment,Count,Sample,Chrom,Start,End,Mid,Pct_GC,Input1,Input2,...,TFX2_Dex,TFX3_Dex,TFX4_Dex,TFX5_Dex,Input,Input_20x,TFX_DMSO,TFX_Dex,TFX_DMSO_vs_Inp20x,TFX_Dex_vs_Inp20x
3,chr17_8148522_8149462,1,Input1_20x,chr17,8148522,8149462,8148992,0.569149,-14.344353,-13.775082,...,-13.942441,-13.533552,-14.314555,-13.977021,-14.235975,-13.970099,-13.619920,-13.941892,0.350179,0.028206
9,chr17_8149255_8150236,1,Input1_20x,chr17,8149255,8150236,8149746,0.612640,-14.257341,-13.775082,...,-13.212480,-13.997858,-13.316026,-13.166091,-13.774732,-13.506116,-13.566212,-13.423114,-0.060096,0.083002
14,chr17_8149779_8150654,1,Input1_20x,chr17,8149779,8150654,8150217,0.608000,-13.564194,-13.684110,...,-13.131511,-13.375328,-13.351744,-12.764749,-13.491223,-13.199594,-13.355003,-13.155833,-0.155409,0.043761
20,chr17_8149919_8150849,1,Input1_20x,chr17,8149919,8150849,8150384,0.613979,-13.564194,-13.775082,...,-12.958492,-13.238752,-13.154385,-12.985829,-13.506364,-13.219094,-13.225711,-13.084365,-0.006616,0.134730
22,chr17_8150088_8151099,1,Input1_20x,chr17,8150088,8151099,8150594,0.606330,-13.523372,-13.684110,...,-13.066973,-13.045849,-12.805201,-12.833152,-13.528651,-13.177106,-12.844342,-12.937794,0.332764,0.239313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,chr17_8151684_8152721,1,TFX5_Dex,chr17,8151684,8152721,8152203,0.637416,-13.484151,-13.264256,...,-11.332372,-11.172698,-11.099496,-11.347934,-13.477111,-13.300605,-12.839762,-11.238125,0.460843,2.062480
72,chr17_8153613_8154670,1,TFX5_Dex,chr17,8153613,8154670,8154142,0.575213,-14.034198,-14.111554,...,-11.140597,-11.189003,-11.061968,-10.923282,-14.380759,-14.044646,-12.897609,-11.078713,1.147036,2.965933
73,chr17_8153620_8154586,2,TFX5_Dex,chr17,8153620,8154586,8154103,0.566253,-14.103191,-14.047015,...,-11.249505,-11.314348,-11.279144,-11.050282,-14.431022,-14.141515,-13.023038,-11.223320,1.118477,2.918195
76,chr17_8153998_8155033,2,TFX5_Dex,chr17,8153998,8155033,8154516,0.583575,-13.969659,-14.254655,...,-10.746834,-10.539249,-10.609146,-10.395046,-14.222971,-13.820009,-12.618269,-10.572569,1.201741,3.247440


In [23]:
dat = pd.DataFrame(np.concatenate(lst_frg), columns=["Fragment", "Count", "Sample"])
dat["Sample"] = pd.Categorical(dat["Sample"], SAMPLES)
dat.sort_values("Sample")
dat

Unnamed: 0,Fragment,Count,Sample
0,chr17_8148522_8149462,1,Input1_20x
1,chr17_8149255_8150236,1,Input1_20x
2,chr17_8149779_8150654,1,Input1_20x
3,chr17_8149919_8150849,1,Input1_20x
4,chr17_8150088_8151099,1,Input1_20x
...,...,...,...
125,chr17_8151684_8152721,1,TFX5_Dex
126,chr17_8153613_8154670,1,TFX5_Dex
127,chr17_8153620_8154586,2,TFX5_Dex
128,chr17_8153998_8155033,2,TFX5_Dex


In [28]:
SAMPLES

array(['Input1_20x', 'Input2_20x', 'Input3_20x', 'Input4_20x',
       'Input5_20x', 'TFX2_DMSO', 'TFX3_DMSO', 'TFX4_DMSO', 'TFX5_DMSO',
       'TFX2_Dex', 'TFX3_Dex', 'TFX4_Dex', 'TFX5_Dex'], dtype='<U30')

In [30]:
dat[SAMPLES]

Input1_20x   -20.212248
Input2_20x   -20.134596
Input3_20x   -20.042905
Input4_20x   -20.270194
Input5_20x   -20.136211
TFX2_DMSO    -18.785983
TFX3_DMSO    -19.547500
TFX4_DMSO    -19.976617
TFX5_DMSO    -20.460597
TFX2_Dex     -20.151217
TFX3_Dex     -19.566608
TFX4_Dex     -20.721927
TFX5_Dex     -20.205221
dtype: float64

In [38]:
SAMPLES[:5]

array(['Input1_20x', 'Input2_20x', 'Input3_20x', 'Input4_20x',
       'Input5_20x'], dtype='<U30')

In [44]:
frag="chr17_8148522_8149462"
dat = get_depth(frag)
dat.drop(labels=INPUT).drop(labels="Input")
dat

Input1               -20.832030
Input2               -19.943631
Input3               -20.665294
Input4               -21.179795
Input5               -20.699460
Input1_20x           -20.212248
Input2_20x           -20.134596
Input3_20x           -20.042905
Input4_20x           -20.270194
Input5_20x           -20.136211
TFX2_DMSO            -18.785983
TFX3_DMSO            -19.547500
TFX4_DMSO            -19.976617
TFX5_DMSO            -20.460597
TFX2_Dex             -20.151217
TFX3_Dex             -19.566608
TFX4_Dex             -20.721927
TFX5_Dex             -20.205221
Input                -20.664042
Input_20x            -20.159231
TFX_DMSO             -19.692674
TFX_Dex              -20.161243
TFX_DMSO_vs_Inp20x     0.466557
TFX_Dex_vs_Inp20x     -0.002013
dtype: float64

In [None]:
%%time
random.seed(123)
N = 500

lst_ant = list()
for sam in SAMPLES:
    print(sam)
    
    ### random select fragments
    gen = gen_frag(sam)
    lst = iter_sample_fast(gen, N)
    
    ### for those fragments, get motif annotations
    for row in lst:
        frag, length, pct_gc = row
        dct = get_annot(frag)
        dct["Sample"] = sam
        dct["Length"] = length
        lst_ant.append(dct)