**Set environment**

In [1]:
### basic
import sys
sys.path.append('../')
from config_sing import *

### specific tools
from collections import defaultdict
from functools import reduce
import itertools as it
import math
import random
import sqlite3
### bind numpy integers as plain Python ints when writing to / querying SQLite
### https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### fragment database of ATAC-STARR-seq
FPATH_DB_ASTARR = os.path.join(
    FD_RES, "KS91_K562_ASTARRseq", 'database', "fragment_astarr_gata1.db"
)

### fragment database of TileMPRA
fdiry = os.path.join(FD_RES, "Tewhey_K562_TileMPRA", 'database')
fname = "fragment_tilempra_gata1.db"
FPATH_DB_TMPRA = os.path.join(fdiry, fname)

### both databases, ATAC-STARR-seq first
FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### output replicates of ATAC-STARR-seq that fragments are sampled from
SAMPLES = [
    "Output_rep1",
    "Output_rep2",
    "Output_rep3",
    "Output_rep4",
]

You are on Duke Server: Singularity: Proj ENCODE FCC
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc



In [2]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    """Draw up to `samplesize` items from `iterable` in one pass (reservoir sampling).

    The iterable is consumed exactly once and only `samplesize` items are kept
    in memory, so it may be a generator of unknown length. Randomness comes
    from the global `random` module state.

    Args:
        iterable:   any iterable.
        samplesize: number of items to draw.

    Returns:
        A list of `samplesize` items, or all items in shuffled order when the
        iterable holds fewer than `samplesize` items.
    """
    source    = iter(iterable)
    reservoir = []
    exhausted = object()  # sentinel marking the end of `source`

    ### seed the reservoir with the first `samplesize` items
    for _ in range(samplesize):
        item = next(source, exhausted)
        if item is exhausted:
            ### fewer items than requested: return all of them, shuffled
            random.shuffle(reservoir)
            return reservoir
        reservoir.append(item)

    ### randomize the positions of the initial items
    random.shuffle(reservoir)

    ### each later item replaces a random slot with decreasing probability
    seen = samplesize
    for item in source:
        slot = random.randint(0, seen)
        if slot < samplesize:
            reservoir[slot] = item
        seen += 1
    return reservoir

In [3]:
def get_frag_astarr(sample, fpath_db = FPATH_DB_ASTARR):
    """Yield ATAC-STARR-seq fragments of one sample, each repeated `count` times.

    Sampling uniformly from this stream samples fragments with probability
    proportional to their count.

    Args:
        sample:   value matched against column `sample` of table `Count`.
        fpath_db: path of the SQLite fragment database.

    Yields:
        (fragment, sample) tuples, emitted `count` times for every matching row.
    """
    ### `with sqlite3.connect(...)` only wraps a transaction and never closes
    ### the connection, so close it explicitly once the generator is finished
    conn = sqlite3.connect(fpath_db)
    try:
        ### bind the sample name as a parameter instead of formatting it into the SQL text
        query = """
            SELECT *
            FROM   Count Cnt
            WHERE  Cnt.sample = ?
            """
        rows = conn.execute(query, (sample,))

        ### generate fragments
        ### NOTE(review): `SELECT *` is unpacked as (fragment, sample, count);
        ### this assumes table Count has exactly these three columns in this order
        for frg, row_sample, count in rows:
            ### repeat each fragment based on its count
            for _ in range(count):
                yield frg, row_sample
    finally:
        conn.close()
                
                
def get_frag_tmpra(fpath_db = FPATH_DB_TMPRA):
    """Yield TileMPRA fragments, each repeated ceil(2 ** log2fc) times.

    Sampling uniformly from this stream samples fragments with probability
    roughly proportional to their fold change. Because the ratio is rounded
    up, every fragment with a finite log2fc is emitted at least once.

    Args:
        fpath_db: path of the SQLite fragment database.

    Yields:
        (fragment, count_input, count_output, log2fc) tuples.
    """
    ### `with sqlite3.connect(...)` only wraps a transaction and never closes
    ### the connection, so close it explicitly once the generator is finished
    conn = sqlite3.connect(fpath_db)
    try:
        ### query the fragment annotations
        query  = """
            SELECT Cnt.fragment, Cnt.count_input, Cnt.count_output, Cnt.log2fc 
            FROM   Count Cnt"""
        rows   = conn.execute(query)

        ### generate fragments
        for frg, inp, out, log2fc in rows:
            ### repeat each fragment based on its ratio
            ratio = np.exp2(log2fc)

            ### a NaN / infinite ratio cannot be turned into a repeat count; skip the row
            if not np.isfinite(ratio):
                continue

            ### builtin int() replaces the `np.int` alias, which was removed in NumPy 1.24
            count = int(np.ceil(ratio))
            for _ in range(count):
                yield frg, inp, out, log2fc
    finally:
        conn.close()

In [4]:
def get_depth(locs, fpath_db = FPATH_DB_ASTARR):
    """Yield coverage rows, joined with sample metadata, at the given locations.

    Args:
        locs:     iterable of locations (list, pandas Series, generator, ...);
                  duplicates are queried only once.
        fpath_db: path of the SQLite database holding tables `Coverage` and `Sample`.

    Yields:
        (location, sample, depth, treatment, size) tuples ordered by location.
    """
    ### materialize the keys once as a plain, duplicate-free list:
    ###  - sqlite3 reads a non-list sequence through obj[i], which is label-based
    ###    for a pandas Series and can bind the wrong values
    ###  - every key costs one bound parameter and SQLite caps their number
    ###    (SQLITE_MAX_VARIABLE_NUMBER); duplicates do not change the result
    keys = list(dict.fromkeys(locs))
    if not keys:
        return

    ### set query
    txt   = ', '.join(['?'] * len(keys))
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """

    ### `with sqlite3.connect(...)` does not close the connection; keep it open
    ### while rows are streamed and close it explicitly afterwards
    conn = sqlite3.connect(fpath_db)
    try:
        ### generate each row
        yield from conn.execute(query, keys)
    finally:
        conn.close()
        
def get_pct(frgs, fpath_db):
    """Yield the GC content of the given fragments.

    Args:
        frgs:     iterable of fragment identifiers (list, pandas Series,
                  generator, ...); duplicates are queried only once.
        fpath_db: path of the SQLite database holding table `Fragment`.

    Yields:
        (fragment, pct_gc) tuples ordered by fragment, one per row of
        `Fragment` whose fragment is in `frgs`.
    """
    ### materialize the keys once as a plain, duplicate-free list:
    ###  - sqlite3 reads a non-list sequence through obj[i], which is label-based
    ###    for a pandas Series and can bind the wrong values
    ###  - every key costs one bound parameter and SQLite caps their number
    ###    (SQLITE_MAX_VARIABLE_NUMBER); duplicates do not change the result
    keys = list(dict.fromkeys(frgs))
    if not keys:
        return

    ### set query
    txt   = ', '.join(['?'] * len(keys))
    query = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        ORDER BY Frg.fragment
        """

    ### `with sqlite3.connect(...)` does not close the connection; keep it open
    ### while rows are streamed and close it explicitly afterwards
    conn = sqlite3.connect(fpath_db)
    try:
        ### generate each row
        yield from conn.execute(query, keys)
    finally:
        conn.close()
        
def get_annot(frgs, fpath_db):
    """Sum motif scores per fragment for the given fragments.

    Args:
        frgs:     iterable of fragment identifiers (list, pandas Series,
                  generator, ...); duplicates are queried only once.
        fpath_db: path of the SQLite database holding tables `Annotation` and `Motif`.

    Returns:
        Nested defaultdict `{fragment: {motif: summed score}}`. Scores of all
        joined rows that share the same fragment and motif are added up;
        a missing motif reads as 0.0.
    """
    ### summary of the motif annotation scores
    dct_ann = defaultdict(lambda: defaultdict(float))

    ### materialize the keys once as a plain, duplicate-free list:
    ###  - sqlite3 reads a non-list sequence through obj[i], which is label-based
    ###    for a pandas Series and can bind the wrong values
    ###  - every key costs one bound parameter and SQLite caps their number
    ###    (SQLITE_MAX_VARIABLE_NUMBER); duplicates do not change the result
    keys = list(dict.fromkeys(frgs))
    if not keys:
        return dct_ann

    ### set query
    txt   = ', '.join(['?'] * len(keys))
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """

    ### `with sqlite3.connect(...)` does not close the connection; read all
    ### rows while it is open and close it explicitly afterwards
    conn = sqlite3.connect(fpath_db)
    try:
        for frg, mtf, val in conn.execute(query, keys):
            dct_ann[frg][mtf] += val
    finally:
        conn.close()

    return dct_ann

## Sampling fragments from ATAC-STARR-seq

**Get fragments**

In [5]:
%%time

### init
N = 25000
random.seed(123)
lst_frg = []

### random sample fragments from ATAC-STARR-seq
for sample in SAMPLES:
    gen = get_frag_astarr(sample)
    lst = iter_sample_fast(gen, N)
    lst_frg += lst

CPU times: user 6.29 s, sys: 125 ms, total: 6.41 s
Wall time: 19.7 s


In [6]:
### table of sampled fragments: one row per sampled (fragment, sample) pair
dat = pd.DataFrame(lst_frg, columns=["Fragment", "Sample"])

### a fragment id is "<chrom>_<start>_<end>"; split from the right so that
### chromosome names containing underscores are kept intact
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})

### Loc = midpoint of the fragment, rounded up to an integer position
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(100000, 6)


Unnamed: 0,Fragment,Sample,Chrom,Start,End,Loc
0,chrX_48028143_48028381,Output_rep1,chrX,48028143,48028381,48028262
1,chrX_48737381_48737711,Output_rep1,chrX,48737381,48737711,48737546
2,chrX_48632340_48632551,Output_rep1,chrX,48632340,48632551,48632446
3,chrX_49043431_49043522,Output_rep1,chrX,49043431,49043522,49043477
4,chrX_49288483_49288680,Output_rep1,chrX,49288483,49288680,49288582


In [7]:
### keys for the database lookups, as unique values in plain Python lists:
### each key becomes one bound SQL parameter, so duplicates only add cost, and
### a list avoids sqlite3 indexing a pandas Series by label
frgs = dat_frg.Fragment.unique().tolist()
locs = dat_frg.Loc.unique().tolist()

**Get GC content**

In [8]:
%%time
fpath_db = FPATH_DB_ASTARR
gen = get_pct(frgs, fpath_db)
dat = pd.DataFrame(gen, columns = ["Fragment", "Pct_GC"])

dat_pct = dat
print(dat_pct.shape)
dat_pct.head()

(62836, 2)
CPU times: user 1.15 s, sys: 622 ms, total: 1.77 s
Wall time: 1min 38s


Unnamed: 0,Fragment,Pct_GC
0,chrX_47787189_47787363,0.442529
1,chrX_47787224_47787406,0.401099
2,chrX_47787569_47787766,0.375635
3,chrX_47787570_47787766,0.372449
4,chrX_47788659_47788978,0.482759


**Get coverage**

In [9]:
%%time
gen = get_depth(locs)
dat = pd.DataFrame(gen, columns = ["Loc", "Sample", "Depth", "Trt", "Size"])
dat = dat.assign(Depth_LogNorm = lambda x: np.log2(x.Depth + 1) - np.log2(x.Size))
dat = dat.groupby(["Loc", "Trt"])["Depth_LogNorm"].mean().unstack(level=1)
dat = dat.assign(Log2fc = lambda x: x.Output - x.Input)
dat = dat.reset_index()

dat_cov = dat
print(dat_cov.shape)
dat_cov.head()

(48468, 4)
CPU times: user 2.35 s, sys: 628 ms, total: 2.97 s
Wall time: 41 s


Trt,Loc,Input,Output,Log2fc
0,47787276,-17.644715,-17.113573,0.531142
1,47787315,-17.527722,-17.113573,0.41415
2,47787668,-16.03748,-17.884795,-1.847315
3,47788819,-15.14187,-14.504588,0.637282
4,47788837,-15.048508,-14.489238,0.55927


**Get annotations**

In [10]:
%%time
### query the annotation
fpath_db = FPATH_DB_ASTARR
dct = get_annot(frgs, fpath_db)

### convert dict of dict into a table
dat = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat = dat.add_prefix("Mtf_")
dat.index.name = 'Fragment'
dat = dat.reset_index()

### show the table
dat_ant = dat
print(dat_ant.shape)
dat_ant.head()

(62836, 273)
CPU times: user 31.6 s, sys: 6.6 s, total: 38.2 s
Wall time: 38min 7s


Unnamed: 0,Fragment,Mtf_ZNF146,Mtf_NR/15,Mtf_NR/20,Mtf_FOX/4,Mtf_TBX/1,Mtf_ZIC,Mtf_TBX/4,Mtf_PRDM4,Mtf_FOX/8,...,Mtf_HD/1,Mtf_HIF,Mtf_ZNF306,Mtf_ZNF547,Mtf_E2F/4,Mtf_ZNF713,Mtf_MYB/1,Mtf_HINFP1/2,Mtf_ZBED1,Mtf_GMEB2/1
0,chrX_47787189_47787363,1.8019,8.0507,8.1423,8.8403,11.3184,6.6872,7.42815,1.6998,4.7093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_47793673_47794164,21.0629,0.0,0.0,16.423,0.0,0.0,15.273,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_47798498_47798887,4.2139,7.363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_47799445_47799583,3.3133,0.0,0.0,7.3989,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_47837965_47838529,1.6675,0.0,0.0,0.0,0.0,6.5887,9.7818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [11]:
### columns kept (in this order) before the motif scores are appended
cnames = ["Sample", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

### attach GC content, coverage and motif scores to every sampled fragment,
### then order the rows by replicate (in the order of SAMPLES) and position
dat_astarr = (
    dat_frg
    .merge(dat_pct, how="left", on="Fragment")
    .merge(dat_cov, how="left", on="Loc")
    .loc[:, cnames]
    .merge(dat_ant, how="left", on="Fragment")
    .assign(Sample = lambda x: pd.Categorical(x["Sample"], SAMPLES))
    .sort_values(["Sample", "Loc"])
)

dat = dat_astarr
print(dat_astarr.shape)
dat_astarr

(100000, 282)


Unnamed: 0,Sample,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,...,Mtf_HD/1,Mtf_HIF,Mtf_ZNF306,Mtf_ZNF547,Mtf_E2F/4,Mtf_ZNF713,Mtf_MYB/1,Mtf_HINFP1/2,Mtf_ZBED1,Mtf_GMEB2/1
4498,Output_rep1,chrX_47787189_47787363,chrX,47787189,47787363,47787276,0.442529,-17.644715,-17.113573,0.531142,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
3752,Output_rep1,chrX_47788659_47788978,chrX,47788659,47788978,47788819,0.482759,-15.141870,-14.504588,0.637282,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
14858,Output_rep1,chrX_47788659_47788979,chrX,47788659,47788979,47788819,0.484375,-15.141870,-14.504588,0.637282,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
19111,Output_rep1,chrX_47789145_47789363,chrX,47789145,47789363,47789254,0.614679,-13.270079,-13.019050,0.251028,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
13446,Output_rep1,chrX_47789186_47789370,chrX,47789186,47789370,47789278,0.597826,-13.187144,-13.173681,0.013462,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88631,Output_rep4,chrX_49784006_49784320,chrX,49784006,49784320,49784163,0.417197,-14.558624,-17.009170,-2.450546,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
86141,Output_rep4,chrX_49784522_49784669,chrX,49784522,49784669,49784596,0.476190,-15.204872,-16.705921,-1.501049,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
94311,Output_rep4,chrX_49785005_49785219,chrX,49785005,49785219,49785112,0.504673,-13.510785,-15.663579,-2.152793,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0
94348,Output_rep4,chrX_49785112_49785242,chrX,49785112,49785242,49785177,0.523077,-13.371460,-14.711512,-1.340052,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [12]:
### output location of the sampled ATAC-STARR-seq table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_ASTARR_1e5.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories; make sure the folder exists
os.makedirs(fdiry, exist_ok=True)
dat_astarr.to_csv(fpath, index=False)

## Sampling fragments from TileMPRA

**Get fragments**

In [13]:
%%time

### init
N = 100000
random.seed(123)

### random sample fragments from TileMPRA
gen = get_frag_tmpra()
lst = iter_sample_fast(gen, N)

CPU times: user 721 ms, sys: 9.61 ms, total: 730 ms
Wall time: 4.79 s


In [14]:
### table of sampled TileMPRA fragments: one row per draw
dat = pd.DataFrame(lst, columns=["Fragment", "Input", "Output", "Log2fc"])

### a fragment id is "<chrom>_<start>_<end>"; split from the right so that
### chromosome names containing underscores are kept intact
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})

### Loc = midpoint of the fragment, rounded up to an integer position
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(100000, 8)


Unnamed: 0,Fragment,Input,Output,Log2fc,Chrom,Start,End,Loc
0,chrX_48205751_48205950,112.049538,138.580402,0.309308,chrX,48205751,48205950,48205851
1,chrX_49176451_49176650,420.272885,11995.228253,4.834874,chrX,49176451,49176650,49176551
2,chrX_49043431_49043630,102.79283,11627.502218,6.824022,chrX,49043431,49043630,49043531
3,chrX_48579001_48579200,1787.590823,2029.826901,0.183023,chrX,48579001,48579200,48579101
4,chrX_48801351_48801550,323.699401,46565.147678,7.168568,chrX,48801351,48801550,48801451


In [15]:
### keys for the database lookups, as unique values in a plain Python list:
### each key becomes one bound SQL parameter, so duplicates only add cost, and
### a list avoids sqlite3 indexing a pandas Series by label
frgs = dat_frg.Fragment.unique().tolist()

**Get GC content**

In [16]:
%%time
fpath_db = FPATH_DB_TMPRA
gen = get_pct(frgs, fpath_db)
dat = pd.DataFrame(gen, columns = ["Fragment", "Pct_GC"])

dat_pct = dat
print(dat_pct.shape)
dat_pct.head()

(33068, 2)
CPU times: user 444 ms, sys: 5.63 ms, total: 449 ms
Wall time: 2.91 s


Unnamed: 0,Fragment,Pct_GC
0,chrX_47786401_47786600,0.331658
1,chrX_47786451_47786650,0.341709
2,chrX_47786501_47786700,0.326633
3,chrX_47786601_47786800,0.346734
4,chrX_47786701_47786900,0.386935


**Get annotations**

In [17]:
%%time
### query the annotation
fpath_db = FPATH_DB_TMPRA
dct = get_annot(frgs, fpath_db)

### convert dict of dict into a table
dat = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat = dat.add_prefix("Mtf_")
dat.index.name = 'Fragment'
dat = dat.reset_index()

### show the table
dat_ant = dat
print(dat_ant.shape)
dat_ant.head()

(33068, 273)
CPU times: user 6.22 s, sys: 348 ms, total: 6.57 s
Wall time: 41 s


Unnamed: 0,Fragment,Mtf_KLF/SP/2,Mtf_ZNF28,Mtf_FOX/4,Mtf_ZNF382,Mtf_ZNF136,Mtf_ZNF41,Mtf_IRF/1,Mtf_RUNX/2,Mtf_ZNF320,...,Mtf_NFAT/3,Mtf_MYB/4,Mtf_HD/1,Mtf_HIF,Mtf_ZNF306,Mtf_ZNF547,Mtf_ZNF713,Mtf_GMEB2/1,Mtf_HINFP1/2,Mtf_ZBED1
0,chrX_47786401_47786600,6.7001,15.9435,8.4743,24.6662,2.6078,7.1286,5.3024,8.9312,5.6607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_47787101_47787300,11.829133,0.0,8.8403,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_47787851_47788050,7.6293,7.5885,0.0,0.0,12.2768,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_47788101_47788300,6.1933,7.6976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_47788151_47788350,6.1933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [18]:
### columns kept (in this order) before the motif scores are appended
cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

### attach GC content and motif scores to every sampled fragment and order
### the rows by position; a fragment drawn several times appears several times
dat_tmpra = (
    dat_frg
    .merge(dat_pct, how="left", on="Fragment")
    .loc[:, cnames]
    .merge(dat_ant, how="left", on="Fragment")
    .sort_values(["Loc"])
)

dat = dat_tmpra
print(dat_tmpra.shape)
dat_tmpra.head()

(100000, 281)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_KLF/SP/2,...,Mtf_NFAT/3,Mtf_MYB/4,Mtf_HD/1,Mtf_HIF,Mtf_ZNF306,Mtf_ZNF547,Mtf_ZNF713,Mtf_GMEB2/1,Mtf_HINFP1/2,Mtf_ZBED1
66004,chrX_47786401_47786600,chrX,47786401,47786600,47786501,0.331658,2117.786841,10474.868703,2.305939,6.7001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70401,chrX_47786401_47786600,chrX,47786401,47786600,47786501,0.331658,2117.786841,10474.868703,2.305939,6.7001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81498,chrX_47786401_47786600,chrX,47786401,47786600,47786501,0.331658,2117.786841,10474.868703,2.305939,6.7001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65102,chrX_47786401_47786600,chrX,47786401,47786600,47786501,0.331658,2117.786841,10474.868703,2.305939,6.7001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33617,chrX_47786451_47786650,chrX,47786451,47786650,47786551,0.341709,1386.007392,2387.118966,0.783056,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [19]:
### output location of the sampled TileMPRA table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_TileMPRA_1e5.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories; make sure the folder exists
os.makedirs(fdiry, exist_ok=True)
dat_tmpra.to_csv(fpath, index=False)