**Set environment**

In [1]:
### basic
import sys
sys.path.append('../')
from config_sing import *

### specific tools
from collections import defaultdict
from functools import reduce
import itertools as it
import math
import random
import sqlite3
# numpy integers would otherwise be stored as BLOBs by sqlite3; see
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### SQLite fragment databases, one per assay
FPATH_DB_ASTARR = os.path.join(
    FD_RES, "KS91_K562_ASTARRseq", "database", "fragment_astarr_gata1.db"
)
FPATH_DB_TMPRA = os.path.join(
    FD_RES, "Tewhey_K562_TileMPRA", "database", "fragment_tilempra_gata1.db"
)
FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### sample names of the ATAC-STARR-seq output replicates
SAMPLES = [f"Output_rep{idx}" for idx in range(1, 5)]

You are in: Singularity: singularity_proj_combeffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc



In [2]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    """Draw `samplesize` items uniformly at random from an iterable of unknown length.

    The iterable is consumed once and only `samplesize` items are kept in
    memory (reservoir sampling). If the iterable holds fewer than `samplesize`
    items, all of them are returned in shuffled order.

    Args:
        iterable:   any iterable (generators included).
        samplesize: number of items to keep.

    Returns:
        list: the sampled items.
    """
    stream    = iter(iterable)
    reservoir = []
    exhausted = object()  # sentinel: the stream ran dry while filling the reservoir

    ### seed the reservoir with the first `samplesize` items
    for _ in range(samplesize):
        item = next(stream, exhausted)
        if item is exhausted:
            ### fewer items than requested: hand back everything, shuffled
            random.shuffle(reservoir)
            return reservoir
        reservoir.append(item)

    ### randomize slot positions before the replacement phase
    random.shuffle(reservoir)

    ### the item seen at index `seen` takes a random slot with probability samplesize / (seen + 1)
    for seen, item in enumerate(stream, samplesize):
        slot = random.randint(0, seen)
        if slot < samplesize:
            reservoir[slot] = item
    return reservoir

In [3]:
def get_frag_astarr(sample, start=None, end=None, fpath_db = FPATH_DB_ASTARR):
    """Yield ATAC-STARR-seq fragment rows of one sample, each repeated `count` times.

    Feeding this generator to a uniform sampler draws fragments with
    probability proportional to their count.

    Args:
        sample:   value matched against Count.sample.
        start:    optional lower bound; keeps fragments with Frg.start >= start.
        end:      optional upper bound; keeps fragments with Frg.end <= end.
        fpath_db: path of the SQLite database file (opened read-only).

    Yields:
        tuple: (fragment, sample, pct_gc, count), emitted `count` times per row.
    """
    ### set query; values are bound as parameters rather than pasted into the SQL text,
    ### and each bound is applied on its own so that giving only one of them works
    conditions = ["Cnt.sample = ?"]
    params     = [sample]
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(start)
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(end)
    where = " AND ".join(conditions)

    query = f"""
        SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  {where}
        """

    ### `with sqlite3.connect(...)` only manages transactions, so close explicitly
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        for row in conn.execute(query, params):
            ### repeat each fragment based on its count (last column of the row)
            count = row[3]
            for _ in range(count):
                yield row
    finally:
        conn.close()
                
                
def get_frag_tmpra(start=None, end=None, fpath_db = FPATH_DB_TMPRA):
    """Yield TileMPRA fragment rows, each repeated ceil(2 ** log2fc) times.

    Feeding this generator to a uniform sampler draws fragments with
    probability roughly proportional to their fold change (the ratio is
    rounded up to a whole number of repeats).

    Args:
        start:    optional lower bound; keeps fragments with Frg.start >= start.
        end:      optional upper bound; keeps fragments with Frg.end <= end.
        fpath_db: path of the SQLite database file (opened read-only).

    Yields:
        tuple: (fragment, pct_gc, count_input, count_output, log2fc).
    """
    ### set query; values are bound as parameters rather than pasted into the SQL text,
    ### and each bound is applied on its own so that giving only one of them works
    conditions = []
    params     = []
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(start)
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(end)
    where = ("WHERE  " + " AND ".join(conditions)) if conditions else ""

    query = f"""
        SELECT Cnt.fragment, Frg.pct_gc, Cnt.count_input, Cnt.count_output, Cnt.log2fc
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        {where}
        """

    ### `with sqlite3.connect(...)` only manages transactions, so close explicitly
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        for row in conn.execute(query, params):
            ### repeat each fragment based on its ratio (log2fc is the last column)
            log2fc = row[4]
            ratio  = np.exp2(log2fc)
            ### builtin int replaces the `np.int` alias, which NumPy 1.24 removed
            count  = np.ceil(ratio).astype(int)
            for _ in range(count):
                yield row
    finally:
        conn.close()

In [4]:
def get_depth(locs, fpath_db = FPATH_DB_ASTARR):
    """Yield coverage rows, joined with sample metadata, for the given locations.

    Args:
        locs:     iterable of values matched against Coverage.location;
                  duplicates are ignored.
        fpath_db: path of the SQLite database file (opened read-only).

    Yields:
        tuple: (location, sample, depth, treatment, size), ordered by location.
    """
    ### one placeholder per distinct location: duplicates do not change what
    ### `IN (...)` matches, and SQLite caps the number of bound parameters per statement
    locs = list(dict.fromkeys(locs))
    if not locs:
        return

    ### set query
    txt   = ', '.join('?' * len(locs))
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """

    ### query the database; rows are streamed while the connection is open, and the
    ### connection is closed explicitly because `with sqlite3.connect(...)` does not close it
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        for row in conn.execute(query, locs):
            yield row
    finally:
        conn.close()
        
def get_annot(frgs, fpath_db):
    """Summarize motif annotations for the given fragments.

    Args:
        frgs:     iterable of values matched against Annotation.Fragment;
                  duplicates are ignored.
        fpath_db: path of the SQLite database file (opened read-only).

    Returns:
        dict: {"count": {fragment: {motif: number of annotation rows}},
               "score": {fragment: {motif: sum of Motif.score}}}.
        Both values are nested defaultdicts; fragments without any annotation
        row are absent.
    """
    ### summaries of the motif annotation per fragment
    dct_ann_count = defaultdict(lambda: defaultdict(int))
    dct_ann_score = defaultdict(lambda: defaultdict(float))
    dct_ann = {"count": dct_ann_count, "score": dct_ann_score}

    ### one placeholder per distinct fragment: duplicates do not change what
    ### `IN (...)` matches, and SQLite caps the number of bound parameters per statement
    frgs = list(dict.fromkeys(frgs))
    if not frgs:
        return dct_ann

    ### set query
    txt   = ', '.join('?' * len(frgs))
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """

    ### query the database; the connection is closed explicitly because
    ### `with sqlite3.connect(...)` only manages transactions
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        for frg, mtf, val in conn.execute(query, frgs):
            ### count and sum the annotation scores
            dct_ann_count[frg][mtf] += 1
            dct_ann_score[frg][mtf] += val
    finally:
        conn.close()

    return dct_ann

## Sampling fragments from ATAC-STARR-seq

**Get fragments**

In [5]:
%%time

### init
N = 25000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from ATAC-STARR-seq
lst_frg = []
for sample in SAMPLES:
    gen = get_frag_astarr(sample, start=start, end=end)
    lst = iter_sample_fast(gen, N)
    lst_frg += lst

CPU times: user 1.28 s, sys: 1.88 s, total: 3.17 s
Wall time: 1min 20s


In [6]:
### tabulate the sampled rows (fragment, sample, pct_gc, count)
dat = pd.DataFrame(lst_frg, columns=["Fragment", "Sample", "Pct_GC", "Count"])

### fragment ids look like "<chrom>_<start>_<end>"; split from the right so that
### chromosome names containing underscores stay in one piece
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})

### midpoint of each fragment, rounded up
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(100000, 8)


Unnamed: 0,Fragment,Sample,Pct_GC,Count,Chrom,Start,End,Loc
0,chrX_48801089_48801262,Output_rep1,0.421965,9,chrX,48801089,48801262,48801176
1,chrX_48801367_48801671,Output_rep1,0.664474,62,chrX,48801367,48801671,48801519
2,chrX_48822713_48822856,Output_rep1,0.608392,26,chrX,48822713,48822856,48822785
3,chrX_48823097_48823336,Output_rep1,0.65272,58,chrX,48823097,48823336,48823217
4,chrX_48802522_48802810,Output_rep1,0.614583,40,chrX,48802522,48802810,48802666


In [7]:
### keys used to query annotations (fragments) and coverage (midpoints)
frgs, locs = dat_frg["Fragment"], dat_frg["Loc"]

**Get coverage**

In [8]:
%%time
gen = get_depth(locs)
dat = pd.DataFrame(gen, columns = ["Loc", "Sample", "Depth", "Trt", "Size"])
dat = dat.assign(Depth_LogNorm = lambda x: np.log2(x.Depth + 1) - np.log2(x.Size))
dat = dat.groupby(["Loc", "Trt"])["Depth_LogNorm"].mean().unstack(level=1)
dat = dat.assign(Log2fc = lambda x: x.Output - x.Input)
dat = dat.reset_index()

dat_cov = dat
print(dat_cov.shape)
dat_cov.head()

(7665, 4)
CPU times: user 561 ms, sys: 26.3 ms, total: 588 ms
Wall time: 762 ms


Trt,Loc,Input,Output,Log2fc
0,48780114,-12.807756,-13.391061,-0.583305
1,48780115,-12.825223,-13.422444,-0.59722
2,48780119,-12.818503,-13.500733,-0.68223
3,48780140,-12.886198,-13.474395,-0.588196
4,48780158,-12.998695,-13.345469,-0.346773


**Get annotations**

In [9]:
%%time
### query the annotation
fpath_db = FPATH_DB_ASTARR
dct = get_annot(frgs, fpath_db)

CPU times: user 5.88 s, sys: 789 ms, total: 6.66 s
Wall time: 1min 32s


In [10]:
### pivot {fragment: {motif: count}} into a fragment x motif table;
### motifs missing for a fragment become 0
dat_ant_count = (
    pd.DataFrame.from_dict(dct["count"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
print(dat_ant_count.shape)
dat_ant_count.head()

(18405, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780002_48780314,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48781035_48781370,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48781044_48781411,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781090_48781411,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781091_48781411,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
### pivot {fragment: {motif: summed score}} into a fragment x motif table;
### motifs missing for a fragment become 0
dat_ant_score = (
    pd.DataFrame.from_dict(dct["score"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
print(dat_ant_score.shape)
dat_ant_score.head()

(18405, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780002_48780314,8.4641,7.2633,7.3633,13.6886,22.1408,21.16,6.9956,7.1419,8.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48781035_48781370,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48781044_48781411,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781090_48781411,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781091_48781411,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [12]:
### fragment-level columns kept before the motif columns are attached
cnames = ["Sample", "Count", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

### fragments + coverage (by midpoint) + motif counts (by fragment)
### NOTE(review): fragments absent from dat_ant_count get NaN motif columns
### after the left merge; confirm whether 0 is intended
dat_astarr_count = (
    dat_frg
    .merge(dat_cov, how="left", on="Loc")
    .loc[:, cnames]
    .merge(dat_ant_count, how="left", on="Fragment")
    .assign(Sample=lambda d: pd.Categorical(d["Sample"], SAMPLES))
    .sort_values(["Sample", "Loc"])
)

print(dat_astarr_count.shape)
dat_astarr_count.head()

(100000, 281)


Unnamed: 0,Sample,Count,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
4572,Output_rep1,1,chrX_48780151_48780425,chrX,48780151,48780425,48780288,0.405109,-13.575177,-14.434689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6939,Output_rep1,1,chrX_48780151_48780424,chrX,48780151,48780424,48780288,0.406593,-13.575177,-14.434689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1445,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
### fragment-level columns kept before the motif columns are attached
cnames = ["Sample", "Count", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

### fragments + coverage (by midpoint) + motif scores (by fragment)
### NOTE(review): fragments absent from dat_ant_score get NaN motif columns
### after the left merge; confirm whether 0 is intended
dat_astarr_score = (
    dat_frg
    .merge(dat_cov, how="left", on="Loc")
    .loc[:, cnames]
    .merge(dat_ant_score, how="left", on="Fragment")
    .assign(Sample=lambda d: pd.Categorical(d["Sample"], SAMPLES))
    .sort_values(["Sample", "Loc"])
)

print(dat_astarr_score.shape)
dat_astarr_score.head()

(100000, 281)


Unnamed: 0,Sample,Count,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
4572,Output_rep1,1,chrX_48780151_48780425,chrX,48780151,48780425,48780288,0.405109,-13.575177,-14.434689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6939,Output_rep1,1,chrX_48780151_48780424,chrX,48780151,48780424,48780288,0.406593,-13.575177,-14.434689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1445,Output_rep1,62,chrX_48780394_48780686,chrX,48780394,48780686,48780540,0.537671,-13.752054,-13.423511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [14]:
### write the ATAC-STARR-seq table with motif counts
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_ASTARR_zoom1_count_1e5.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories
os.makedirs(fdiry, exist_ok=True)
dat_astarr_count.to_csv(fpath, index=False)

In [15]:
### write the ATAC-STARR-seq table with motif scores
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_ASTARR_zoom1_score_1e5.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories
os.makedirs(fdiry, exist_ok=True)
dat_astarr_score.to_csv(fpath, index=False)

In [16]:
## check: smallest start and largest end among the stored fragments
dat = dat_astarr_score
print(dat["Start"].min(), dat["End"].max())

48780002 48825969


## Sampling fragments from TileMPRA

**Get fragments**

In [43]:
%%time
### init
N = 10000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from TileMPRA
gen = get_frag_tmpra(start=start, end=end)
lst = iter_sample_fast(gen, N)

CPU times: user 59.7 ms, sys: 10.2 ms, total: 69.9 ms
Wall time: 317 ms


In [44]:
### peek at the first sampled row
lst[0]

('chrX_48791101_48791300',
 0.643216,
 304.213631597311,
 442.755246464732,
 0.545011020620296)

In [45]:
### tabulate the sampled rows (fragment, pct_gc, count_input, count_output, log2fc)
dat = pd.DataFrame(lst, columns=["Fragment", "Pct_GC", "Input", "Output", "Log2fc"])

### fragment ids look like "<chrom>_<start>_<end>"; split from the right so that
### chromosome names containing underscores stay in one piece
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})

### midpoint of each fragment, rounded up
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(10000, 9)


Unnamed: 0,Fragment,Pct_GC,Input,Output,Log2fc,Chrom,Start,End,Loc
0,chrX_48791101_48791300,0.643216,304.213632,442.755246,0.545011,chrX,48791101,48791300,48791201
1,chrX_48823181_48823380,0.643216,193.613181,2560.063477,3.723938,chrX,48823181,48823380,48823281
2,chrX_48801221_48801420,0.547739,282.525231,51815.092072,7.51914,chrX,48801221,48801420,48801321
3,chrX_48801271_48801470,0.58794,286.738684,66549.329438,7.858287,chrX,48801271,48801470,48801371
4,chrX_48801551_48801750,0.693467,280.952191,80613.32542,8.164295,chrX,48801551,48801750,48801651


In [46]:
### fragment ids used to query the annotations
frgs = dat_frg["Fragment"]

**Get annotations**

In [47]:
%%time
### query the annotation
fpath_db = FPATH_DB_TMPRA
dct = get_annot(frgs, fpath_db)

CPU times: user 422 ms, sys: 22 ms, total: 444 ms
Wall time: 2.94 s


In [48]:
### pivot {fragment: {motif: count}} into a fragment x motif table;
### motifs missing for a fragment become 0
dat_ant_count = (
    pd.DataFrame.from_dict(dct["count"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
print(dat_ant_count.shape)
dat_ant_count.head()

(1353, 271)


Unnamed: 0,Fragment,Mtf_HD/2,Mtf_ZNF332,Mtf_POU/1,Mtf_MYB/5,Mtf_OSR2,Mtf_RFX/1,Mtf_GC-tract,Mtf_ZNF320,Mtf_KLF/SP/2,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780051_48780250,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48780351_48780550,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48780401_48780600,1.0,2.0,0.0,0.0,1.0,0.0,3.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48780451_48780650,1.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48780501_48780700,1.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
### pivot {fragment: {motif: summed score}} into a fragment x motif table;
### motifs missing for a fragment become 0
dat_ant_score = (
    pd.DataFrame.from_dict(dct["score"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
print(dat_ant_score.shape)
dat_ant_score.head()

(1353, 271)


Unnamed: 0,Fragment,Mtf_HD/2,Mtf_ZNF332,Mtf_POU/1,Mtf_MYB/5,Mtf_OSR2,Mtf_RFX/1,Mtf_GC-tract,Mtf_ZNF320,Mtf_KLF/SP/2,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780051_48780250,6.0185,7.7343,7.5087,8.3283,9.5878,7.2977,10.73138,11.361,10.372733,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48780351_48780550,6.0185,6.7109,0.0,0.0,0.0,0.0,18.8693,10.8839,15.6542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48780401_48780600,6.0185,14.6987,0.0,0.0,8.8865,0.0,26.9748,10.8839,15.6542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48780451_48780650,6.0185,7.9878,0.0,0.0,8.8865,0.0,32.8674,0.0,13.6294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48780501_48780700,6.0185,7.9878,0.0,0.0,8.8865,0.0,28.230825,0.0,6.7991,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [50]:
### fragment-level columns kept before the motif columns are attached
cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

### fragments + motif counts (by fragment), ordered by midpoint
### NOTE(review): fragments absent from dat_ant_count get NaN motif columns
### after the left merge; confirm whether 0 is intended
dat_tmpra_count = (
    dat_frg
    .loc[:, cnames]
    .merge(dat_ant_count, how="left", on="Fragment")
    .sort_values(["Loc"])
)

print(dat_tmpra_count.shape)
dat_tmpra_count.head()

(10000, 279)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_HD/2,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
8604,chrX_48780051_48780250,chrX,48780051,48780250,48780151,0.512563,490.329907,685.302462,0.480485,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5442,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4992,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7662,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
### fragment-level columns kept before the motif columns are attached
dat = dat_frg

cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]
dat = dat.loc[:, cnames]

### attach the motif scores (by fragment) and order by midpoint
dat = dat.merge(dat_ant_score, how="left", on="Fragment") 
dat = dat.sort_values(["Loc"])

dat_tmpra_score = dat
### report the shape of the score table built here (the count table was printed before)
print(dat_tmpra_score.shape)
dat_tmpra_score.head()

(10000, 279)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_HD/2,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
8604,chrX_48780051_48780250,chrX,48780051,48780250,48780151,0.512563,490.329907,685.302462,0.480485,6.0185,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5442,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4992,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7662,chrX_48780251_48780450,chrX,48780251,48780450,48780351,0.356784,31.183412,218.676508,2.813741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [52]:
### write the TileMPRA table with motif counts
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_TileMPRA_zoom1_count_1e4.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories
os.makedirs(fdiry, exist_ok=True)
dat_tmpra_count.to_csv(fpath, index=False)

In [53]:
### write the TileMPRA table with motif scores
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_output_TileMPRA_zoom1_score_1e4.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing directories
os.makedirs(fdiry, exist_ok=True)
dat_tmpra_score.to_csv(fpath, index=False)

In [54]:
## check: smallest start and largest end among the stored fragments
dat = dat_tmpra_score
print(dat["Start"].min(), dat["End"].max())

48780051 48826000
