# Sampling output fragments from  ChrX:48,780,000-48,826,000

**Set environment**

In [1]:
### basic
import sys
sys.path.append('../')
from config_sing import *

### specific tools
from collections import defaultdict
from functools import reduce
import itertools as it
import math
import random
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

### file path of fragment database
fdiry = os.path.join(FD_RES, "KS91_K562_ASTARRseq", 'database')
fname = "fragment_astarr_gata1.db"
FPATH_DB_ASTARR = os.path.join(fdiry, fname)

fdiry = os.path.join(FD_RES, "Tewhey_K562_TileMPRA", 'database')
fname = "fragment_tilempra_gata1.db"
FPATH_DB_TMPRA = os.path.join(fdiry, fname)

FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### samples of ATAC-STARR-seq
SAMPLES = ["Output_rep1", "Output_rep2", "Output_rep3", "Output_rep4"]

You are in: Singularity: singularity_proj_combeffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc



## Helper function

In [2]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    
    ### init
    results = []
    iterator = iter(iterable)
    
    ### Fill in the first samplesize elements:
    try:
        for _ in range(samplesize):
            results.append(next(iterator))
    except StopIteration:
        ### return everything if samplesize > len(iterable)
        random.shuffle(results)
        return results
    
    ### continue iterating through the elements and update the list
    random.shuffle(results)  # Randomize their positions
    for i, v in enumerate(iterator, samplesize):
        r = random.randint(0, i)
        if r < samplesize:
            results[r] = v  # at a decreasing rate, replace random items
    return results

In [3]:
def get_frag_astarr(sample, start=None, end=None, fpath_db = FPATH_DB_ASTARR):
    """sample fragments from ATAC-STARR-seq with probability proportion to count"""
    ### set query
    if (start is None) and (end is None):
        query  = f"""
            SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
            FROM   Count    Cnt
            JOIN   Fragment Frg
            ON     Cnt.fragment = Frg.fragment
            WHERE  Cnt.sample = '{sample}'
            """
    else:
        query  = f"""
            SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
            FROM   Count    Cnt
            JOIN   Fragment Frg
            ON     Cnt.fragment = Frg.fragment
            WHERE  Cnt.sample = '{sample}' AND 
                   Frg.start >= '{start}'  AND 
                   Frg.end   <= '{end}'
            """
    
    with sqlite3.connect(fpath_db) as conn:
        ### query the fragment annotations
        cursor = conn.cursor()
        cursor = cursor.execute(query)
        rows   = cursor
        
        ### generate fragments
        for row in rows:
            ### parse info
            frg, sample, pct_gc, count = row
            
            ### repeat the each fragment based on its count
            for _ in range(count):
                yield row
                
                
def get_frag_tmpra(start=None, end=None, fpath_db = FPATH_DB_TMPRA):
    """sample fragments from TileMPRA with probability proportion to fold change"""
    ### set query
    if (start is None) and (end is None):
        query  = """
            SELECT Cnt.fragment, Frg.pct_gc, Cnt.count_input, Cnt.count_output, Cnt.log2fc 
            FROM   Count Cnt
            JOIN   Fragment Frg
            ON     Cnt.fragment = Frg.fragment
            """
    else:
        query  = f"""
            SELECT Cnt.fragment, Frg.pct_gc, Cnt.count_input, Cnt.count_output, Cnt.log2fc 
            FROM   Count    Cnt
            JOIN   Fragment Frg
            ON     Cnt.fragment = Frg.fragment
            WHERE  Frg.start >= '{start}' AND 
                   Frg.end   <= '{end}'
            """
        
    with sqlite3.connect(fpath_db) as conn:
        ### query the fragment annotations
        cursor = conn.cursor()
        cursor = cursor.execute(query)
        rows   = cursor

        ### generate fragments
        for row in rows:
            ### parse info
            frg, pct_gc, inp, out, log2fc = row
            
            ### repeat the each fragment based on its ratio
            ratio = np.exp2(log2fc)
            count = np.ceil(ratio).astype(np.int)
            for _ in range(count):
                yield row

In [4]:
def get_depth(locs, fpath_db = FPATH_DB_ASTARR):
    """get"""
    ### set query
    txt   = ', '.join('?' for _ in locs)
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, locs)
    
    ### generate each row
    for row in cursor:
        yield row
        
def get_annot(frgs, fpath_db):
    """get annotation from the given fragments"""
    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """
    
    ## query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()    
        cursor = cursor.execute(query, frgs)
        
    ### summarize the motif annotation scores
    dct_ann = defaultdict(lambda: defaultdict(lambda: 0.0))
    for row in cursor:
        frg, mtf, val      = row
        dct_ann[frg][mtf] += val
            
    return dct_ann

**Number of fragments**

In [5]:
%%time
fpath_db  = FPATH_DB_ASTARR
count_tot = 0

for sample in SAMPLES:
    ### set query
    query = f"""
        SELECT Cnt.fragment, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  Cnt.sample = '{sample}'
        """
    
    ### query the fragment counts
    with sqlite3.connect(fpath_db) as conn:    
        cursor = conn.cursor()
        cursor = cursor.execute(query)
        rows   = cursor

    ### count fragments
    count = 0
    for row in rows:
        frg, cnt = row
        count += cnt
        
    count_tot += count
    print(sample, count)
    
print("Total      ", count_tot)

Output_rep1 505724
Output_rep2 686033
Output_rep3 441621
Output_rep4 739000
Total       2372378
CPU times: user 987 ms, sys: 696 ms, total: 1.68 s
Wall time: 1min 18s


In [6]:
%%time
fpath_db  = FPATH_DB_ASTARR
count_tot = 0
start = 48780000
end   = 48826000

for sample in SAMPLES:
    ### set query
    query = f"""
        SELECT Cnt.fragment, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  Cnt.sample = '{sample}' AND 
               Frg.start >= '{start}'  AND 
               Frg.end   <= '{end}'
        """
    
    ### query the fragment counts
    with sqlite3.connect(fpath_db) as conn:    
        cursor = conn.cursor()
        cursor = cursor.execute(query)
        rows   = cursor

    ### count fragments
    count = 0
    for row in rows:
        frg, cnt = row
        count += cnt
        
    count_tot += count
    print(sample, count)
    
print("Total      ", count_tot)

Output_rep1 51085
Output_rep2 70830
Output_rep3 43535
Output_rep4 80358
Total       245808
CPU times: user 602 ms, sys: 1.1 s, total: 1.7 s
Wall time: 40.8 s


In [10]:
print(51085/ 505724 * 100)
print(70830/ 686033 * 100)
print(43535/ 441621 * 100)
print(80358/ 739000 * 100)
print(245808/2372378 * 100)

10.10135963489967
10.32457622300968
9.858000412118082
10.873883626522328
10.361249345593325


In [7]:
%%time
fpath_db  = FPATH_DB_TMPRA
count_tot = 0

### set query
query = f"""
    SELECT Cnt.fragment, Cnt.log2fc
    FROM   Count Cnt
    """

### query the fragment counts
with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    cursor = cursor.execute(query)
    rows   = cursor

### count fragments
for row in rows:
    frg, log2fc = row
    ratio = np.exp2(log2fc)
    count = np.ceil(ratio).astype(np.int)
    count_tot += count
    
print("Total", count_tot)

Total 245191
CPU times: user 202 ms, sys: 3.28 ms, total: 205 ms
Wall time: 1.04 s


In [8]:
%%time
fpath_db  = FPATH_DB_TMPRA
count_tot = 0

### set query
query = f"""
    SELECT Cnt.fragment, Cnt.log2fc
    FROM   Count    Cnt
    JOIN   Fragment Frg
    ON     Cnt.fragment = Frg.fragment
    WHERE  Frg.start >= '{start}' AND 
           Frg.end   <= '{end}'
    """

### query the fragment counts
with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    cursor = cursor.execute(query)
    rows   = cursor

### count fragments
for row in rows:
    frg, log2fc = row
    ratio = np.exp2(log2fc)
    count = np.ceil(ratio).astype(np.int)
    count_tot += count
    
print("Total", count_tot)

Total 18811
CPU times: user 11.5 ms, sys: 2.64 ms, total: 14.2 ms
Wall time: 208 ms


In [12]:
18811/245191*100

7.671978172118879

In [18]:
print(120000/245808 * 100)
print(10000 /18811 * 100)

48.81859011911736
53.16038488118654


## Sampling fragments from ATAC-STARR-seq

**Get Fragments**

In [19]:
%%time

### init
N = 30000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from ATAC-STARR-seq
lst_frg = []
for sample in SAMPLES:
    gen = get_frag_astarr(sample, start=start, end=end)
    lst = iter_sample_fast(gen, N)
    lst_frg += lst

CPU times: user 1.07 s, sys: 1.31 s, total: 2.38 s
Wall time: 1min 34s


In [20]:
dat = pd.DataFrame(lst_frg, columns=["Fragment", "Sample", "Pct_GC", "Count"])
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.split('_', expand=True)
dat = dat.astype({"Start": int, "End": int})
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(120000, 8)


Unnamed: 0,Fragment,Sample,Pct_GC,Count,Chrom,Start,End,Loc
0,chrX_48801177_48801376,Output_rep1,0.502513,13,chrX,48801177,48801376,48801277
1,chrX_48801367_48801673,Output_rep1,0.666667,151,chrX,48801367,48801673,48801520
2,chrX_48823097_48823336,Output_rep1,0.65272,58,chrX,48823097,48823336,48823217
3,chrX_48824236_48824429,Output_rep1,0.590674,47,chrX,48824236,48824429,48824333
4,chrX_48802614_48802885,Output_rep1,0.605166,4,chrX,48802614,48802885,48802750


In [21]:
frgs = dat_frg.Fragment
locs = dat_frg.Loc

**Get Coverage**

In [22]:
%%time
gen = get_depth(locs)
dat = pd.DataFrame(gen, columns = ["Loc", "Sample", "Depth", "Trt", "Size"])
dat = dat.assign(Depth_LogNorm = lambda x: np.log2(x.Depth + 1) - np.log2(x.Size))
dat = dat.groupby(["Loc", "Trt"])["Depth_LogNorm"].mean().unstack(level=1)
dat = dat.assign(Log2fc = lambda x: x.Output - x.Input)
dat = dat.reset_index()

dat_cov = dat
print(dat_cov.shape)
dat_cov.head()

(7922, 4)
CPU times: user 785 ms, sys: 12.7 ms, total: 797 ms
Wall time: 1.88 s


Trt,Loc,Input,Output,Log2fc
0,48780115,-12.825223,-13.422444,-0.59722
1,48780139,-12.877393,-13.474395,-0.597001
2,48780140,-12.886198,-13.474395,-0.588196
3,48780158,-12.998695,-13.345469,-0.346773
4,48780168,-13.000823,-13.324554,-0.323731


**Get annotations**

In [23]:
%%time
### query the annotation
fpath_db = FPATH_DB_ASTARR
dct = get_annot(frgs, fpath_db)

### convert dict of dict into a table
dat = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat = dat.add_prefix("Mtf_")
dat.index.name = 'Fragment'
dat = dat.reset_index()

### show the table
dat_ant = dat
print(dat_ant.shape)
dat_ant.head()

(19612, 271)
CPU times: user 10 s, sys: 1.14 s, total: 11.1 s
Wall time: 3min


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780002_48780314,8.4641,7.2633,7.3633,13.6886,22.1408,21.16,6.9956,7.1419,8.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48781044_48781411,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48781090_48781411,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781106_48781544,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781223_48781370,7.3364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [24]:
dat = dat_frg
dat = dat.merge(dat_cov, how="left", on="Loc")

cnames = ["Sample", "Count", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]
dat = dat.loc[:, cnames]

dat = dat.merge(dat_ant, how="left", on="Fragment") 
dat["Sample"] = pd.Categorical(dat["Sample"], SAMPLES)
dat = dat.sort_values(["Sample", "Loc"])

dat_astarr = dat
print(dat_astarr.shape)
dat_astarr.head()

(120000, 281)


Unnamed: 0,Sample,Count,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
6436,Output_rep1,1,chrX_48780151_48780425,chrX,48780151,48780425,48780288,0.405109,-13.575177,-14.434689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
684,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1324,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2045,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2242,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [25]:
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_ASTARR_zoom1.csv"
fpath = os.path.join(fdiry, fname)

dat_astarr.to_csv(fpath, index=False)

## Sampling fragments from TileMPRA

**Get fragments**

In [26]:
%%time

### init
N = 10000
random.seed(123)

### random sample fragments from TileMPRA
gen = get_frag_tmpra()
lst = iter_sample_fast(gen, N)

CPU times: user 633 ms, sys: 12.5 ms, total: 646 ms
Wall time: 1.61 s


In [27]:
lst[0]

('chrX_48447201_48447400',
 0.442211,
 438.006351442587,
 326.345555586604,
 -0.423046418549338)

In [28]:
dat = pd.DataFrame(lst, columns=["Fragment", "Pct_GC", "Input", "Output", "Log2fc"])
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.split('_', expand=True)
dat = dat.astype({"Start": int, "End": int})
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(10000, 9)


Unnamed: 0,Fragment,Pct_GC,Input,Output,Log2fc,Chrom,Start,End,Loc
0,chrX_48447201_48447400,0.442211,438.006351,326.345556,-0.423046,chrX,48447201,48447400,48447301
1,chrX_49176511_49176710,0.567839,78.892058,417.636707,2.401436,chrX,49176511,49176710,49176611
2,chrX_49043441_49043640,0.658291,149.449977,26201.326867,7.453614,chrX,49043441,49043640,49043541
3,chrX_48539391_48539590,0.592965,198.102787,7327.104227,5.208483,chrX,48539391,48539590,48539491
4,chrX_48801371_48801570,0.658291,191.716495,32234.599546,7.393833,chrX,48801371,48801570,48801471


In [29]:
frgs = dat_frg.Fragment

**Get annotations**

In [30]:
%%time
### query the annotation
fpath_db = FPATH_DB_TMPRA
dct = get_annot(frgs, fpath_db)

### convert dict of dict into a table
dat = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat = dat.add_prefix("Mtf_")
dat.index.name = 'Fragment'
dat = dat.reset_index()

### show the table
dat_ant = dat
print(dat_ant.shape)
dat_ant.head()

(6179, 273)
CPU times: user 2.29 s, sys: 317 ms, total: 2.61 s
Wall time: 50.1 s


Unnamed: 0,Fragment,Mtf_GRHL,Mtf_Ebox/CAGCTG,Mtf_ZNF24,Mtf_HD/2,Mtf_HD/9,Mtf_SMARCA5,Mtf_NFAC/2,Mtf_ZNF354,Mtf_ETS/2,...,Mtf_CREB/ATF/3,Mtf_ZNF435,Mtf_HD/19,Mtf_MYB/1,Mtf_ZBED1,Mtf_ARI5B,Mtf_HINFP1/2,Mtf_MYB/4,Mtf_GMEB2/1,Mtf_HINFP1/3
0,chrX_47786601_47786800,7.9452,8.7102,6.3645,8.1112,7.7946,7.5413,9.4055,15.0817,15.8259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_47793751_47793950,7.6912,16.6246,24.80198,0.0,0.0,1.3843,0.0,7.57725,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_47800051_47800250,9.8912,9.149,0.0,0.0,0.0,0.0,7.5848,0.0,8.3931,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_47806151_47806350,7.2463,8.05735,0.0,0.0,0.0,0.0,0.0,0.0,26.3377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_47806251_47806450,13.831,8.05735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [31]:
dat = dat_frg

cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]
dat = dat.loc[:, cnames]

dat = dat.merge(dat_ant, how="left", on="Fragment") 
dat = dat.sort_values(["Loc"])

dat_tmpra = dat
print(dat_tmpra.shape)
dat_tmpra.head()

(10000, 281)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_GRHL,...,Mtf_CREB/ATF/3,Mtf_ZNF435,Mtf_HD/19,Mtf_MYB/1,Mtf_ZBED1,Mtf_ARI5B,Mtf_HINFP1/2,Mtf_MYB/4,Mtf_GMEB2/1,Mtf_HINFP1/3
3591,chrX_47786601_47786800,chrX,47786601,47786800,47786701,0.346734,2102.916335,2202.384549,0.0667,7.9452,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5655,chrX_47787901_47788100,chrX,47787901,47788100,47788001,0.427136,861.235634,1065.590162,0.305847,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8747,chrX_47788551_47788750,chrX,47788551,47788750,47788651,0.422111,64.687094,78.172043,0.279845,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8522,chrX_47788951_47789150,chrX,47788951,47789150,47789051,0.577889,664.32771,2001.708558,1.590553,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1090,chrX_47789101_47789300,chrX,47789101,47789300,47789201,0.60804,749.682266,38769.994562,5.692439,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [32]:
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_TileMPRA_zoom1.csv"
fpath = os.path.join(fdiry, fname)

dat_tmpra.to_csv(fpath, index=False)