**Set environment**

In [1]:
### basic
import sys
# Put the parent directory on the module search path (presumably so that
# config_sing, imported next, can be found -- confirm its location).
sys.path.append('../')
# NOTE(review): later cells use np, pd, os and FD_RES without importing them in
# this notebook, so they presumably come from this star-import -- confirm in
# config_sing and consider importing them explicitly.
from config_sing import *
# Print the environment summary (container, directories, imported libraries).
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [2]:
### specific tools
from collections import defaultdict
from functools import reduce
import itertools as it
import math
import random
import sqlite3
### store NumPy integers as plain Python ints when they are bound to a query, see
### https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### file path of fragment database (one SQLite file per assay)
FPATH_DB_ASTARR = os.path.join(
    FD_RES, "KS91_K562_ASTARRseq", 'database', "fragment_astarr_gata1.db"
)
FPATH_DB_TMPRA = os.path.join(
    FD_RES, "Tewhey_K562_TileMPRA", 'database', "fragment_tilempra_gata1.db"
)

FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### samples of ATAC-STARR-seq
### np.char.add is the public name of the element-wise string concatenation;
### the np.core namespace used before is private and deprecated since NumPy 2.0
fun = np.char.add

### six input replicates: Input_rep1 ... Input_rep6
idx = np.arange(1,6+1).astype("str")
SAMPLES_INP = fun("Input_rep",   idx)

### four output replicates: Output_rep1 ... Output_rep4
idx = np.arange(1,4+1).astype("str")
SAMPLES_OUT = fun("Output_rep",  idx)

### all sample labels and, aligned with them, the group of each sample
SAMPLES = np.concatenate([SAMPLES_INP, SAMPLES_OUT])
GROUPS  = np.r_[
    np.repeat("Input",  len(SAMPLES_INP)),
    np.repeat("Output", len(SAMPLES_OUT))
]

print(SAMPLES)
print(GROUPS)

['Input_rep1' 'Input_rep2' 'Input_rep3' 'Input_rep4' 'Input_rep5'
 'Input_rep6' 'Output_rep1' 'Output_rep2' 'Output_rep3' 'Output_rep4']
['Input' 'Input' 'Input' 'Input' 'Input' 'Input' 'Output' 'Output'
 'Output' 'Output']


In [3]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    """Draw up to `samplesize` items from `iterable` in a single pass.

    The iterable is consumed once and never held in memory as a whole
    (reservoir sampling), so it may be a generator of unknown length.

    Parameters
    ----------
    iterable : iterable
        Source of items.
    samplesize : int
        Number of items to keep.

    Returns
    -------
    list
        `samplesize` items, or every item in shuffled order when the iterable
        holds fewer than `samplesize` items. Uses the global `random` state.
    """
    stream = iter(iterable)

    ### seed the reservoir with the leading items; zip asks the range first,
    ### so nothing extra is pulled from the stream once the reservoir is full
    reservoir = [item for _, item in zip(range(samplesize), stream)]

    ### one shuffle serves both cases: a short stream (everything is returned)
    ### and a full reservoir (positions randomized before any replacement)
    random.shuffle(reservoir)

    ### the item at 0-based position `seen` takes over a random slot with
    ### probability samplesize / (seen + 1)
    for seen, item in enumerate(stream, samplesize):
        slot = random.randint(0, seen)
        if slot < samplesize:
            reservoir[slot] = item
    return reservoir

In [4]:
def get_frag_astarr(sample, start=None, end=None, fpath_db = FPATH_DB_ASTARR):
    """Yield the ATAC-STARR-seq fragment rows of one sample from the database.

    Every matching row is yielded exactly once; rows are NOT repeated by their
    count, so any count-weighted sampling has to be done by the caller.

    Parameters
    ----------
    sample : str
        Value matched against Count.sample.
    start, end : int, optional
        Region filter: keeps rows with Fragment.start >= start and
        Fragment.end <= end. Either bound may be given on its own.
    fpath_db : str
        Path of the SQLite database; it is opened read-only.

    Yields
    ------
    tuple
        (fragment, sample, pct_gc, count)
    """
    ### set query; values are bound as parameters instead of being pasted into
    ### the SQL text, and a bound that is None is simply left out
    ### NOTE(review): the bounds are bound as integers -- assumes
    ### Fragment.start / Fragment.end are numeric columns; confirm the schema
    conditions = ["Cnt.sample = ?"]
    params     = [str(sample)]
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(int(start))
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(int(end))
    where = " AND ".join(conditions)

    query = f"""
        SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  {where}
        """

    ### stream the rows; the connection is closed once the generator is
    ### exhausted or discarded (a `with` on the connection would not close it)
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        yield from conn.execute(query, params)
    finally:
        conn.close()
                
def get_frag_tmpra(start=None, end=None, fpath_db = FPATH_DB_TMPRA):
    """Yield the TileMPRA fragment rows from the database.

    Every matching row is yielded exactly once; rows are NOT repeated by their
    fold change, so any weighted sampling has to be done by the caller.

    Parameters
    ----------
    start, end : int, optional
        Region filter: keeps rows with Fragment.start >= start and
        Fragment.end <= end. Either bound may be given on its own; with
        neither, all rows are returned.
    fpath_db : str
        Path of the SQLite database; it is opened read-only.

    Yields
    ------
    tuple
        (fragment, pct_gc, count_input, count_output, log2fc)
    """
    ### set query; values are bound as parameters instead of being pasted into
    ### the SQL text, and a bound that is None is simply left out
    ### NOTE(review): the bounds are bound as integers -- assumes
    ### Fragment.start / Fragment.end are numeric columns; confirm the schema
    conditions = []
    params     = []
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(int(start))
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(int(end))
    where = ("WHERE  " + " AND ".join(conditions)) if conditions else ""

    query = f"""
        SELECT Cnt.fragment, Frg.pct_gc, Cnt.count_input, Cnt.count_output, Cnt.log2fc
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        {where}
        """

    ### stream the rows; the connection is closed once the generator is
    ### exhausted or discarded (a `with` on the connection would not close it)
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        yield from conn.execute(query, params)
    finally:
        conn.close()

In [5]:
def get_depth(locs, fpath_db = FPATH_DB_ASTARR):
    """Yield the coverage depth of every sample at the given locations.

    Parameters
    ----------
    locs : iterable
        Locations matched against Coverage.location. Duplicates are ignored;
        any iterable works (list, pandas Series, generator).
    fpath_db : str
        Path of the SQLite database; it is opened read-only.

    Yields
    ------
    tuple
        (location, sample, depth, treatment, size), ordered by location.
    """
    ### materialize once and drop duplicates (first occurrence kept): repeated
    ### values do not change an IN (...) match, they only add bound parameters,
    ### and SQLite limits how many a statement may have
    params = list(dict.fromkeys(locs))

    ### set query
    txt   = ', '.join(['?'] * len(params))
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """

    ### query out from database and generate each row; the connection is closed
    ### once the generator is exhausted or discarded
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        yield from conn.execute(query, params)
    finally:
        conn.close()
        
def get_annot(frgs, fpath_db):
    """Summarize the motif annotations of the given fragments.

    Parameters
    ----------
    frgs : iterable
        Fragment identifiers matched against Annotation.Fragment. Duplicates
        are ignored; any iterable works (list, pandas Series, generator).
    fpath_db : str
        Path of the SQLite database; it is opened read-only.

    Returns
    -------
    dict
        {"count": {fragment: {motif: number of annotation rows}},
         "score": {fragment: {motif: summed Motif.score}}}.
        Fragments without any annotation row do not appear.
    """
    ### materialize once and drop duplicates (first occurrence kept): repeated
    ### values do not change an IN (...) match, they only add bound parameters,
    ### and SQLite limits how many a statement may have
    params = list(dict.fromkeys(frgs))

    ### set query
    txt   = ', '.join(['?'] * len(params))
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """

    ### summarize the motif annotation scores
    dct_ann_count = defaultdict(lambda: defaultdict(int))
    dct_ann_score = defaultdict(lambda: defaultdict(float))

    ### query out from database; the connection is closed explicitly because a
    ### `with` on the connection only scopes a transaction
    conn = sqlite3.connect("file:" + fpath_db + "?mode=ro", uri=True)
    try:
        for frg, mtf, val in conn.execute(query, params):
            ### count and sum the annotation scores
            dct_ann_count[frg][mtf] += 1
            dct_ann_score[frg][mtf] += val
    finally:
        conn.close()

    ### arrange and return
    return {"count": dct_ann_count, "score": dct_ann_score}

## Sampling fragments from ATAC-STARR-seq

**Get fragments**

In [8]:
%%time

### init
#N = 25000
N = 10000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from ATAC-STARR-seq
lst_frg = []
for sample in SAMPLES:
    gen = get_frag_astarr(sample, start=start, end=end)
    lst = iter_sample_fast(gen, N)
    lst_frg += lst

CPU times: user 4.04 s, sys: 1.77 s, total: 5.82 s
Wall time: 55.4 s


In [9]:
### one row per sampled fragment
dat = pd.DataFrame(lst_frg, columns=["Fragment", "Sample", "Pct_GC", "Count"])
### the fragment id is <chrom>_<start>_<end>; split from the right so that a
### chromosome name containing "_" stays in one piece
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})
### Loc = fragment midpoint, rounded up to an integer position
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(78259, 8)


Unnamed: 0,Fragment,Sample,Pct_GC,Count,Chrom,Start,End,Loc
0,chrX_48785113_48785286,Input_rep1,0.485549,1,chrX,48785113,48785286,48785200
1,chrX_48801343_48801669,Input_rep1,0.662577,1,chrX,48801343,48801669,48801506
2,chrX_48788613_48788827,Input_rep1,0.551402,1,chrX,48788613,48788827,48788720
3,chrX_48802012_48802308,Input_rep1,0.628378,1,chrX,48802012,48802308,48802160
4,chrX_48801829_48802113,Input_rep1,0.644366,1,chrX,48801829,48802113,48801971


In [13]:
from collections import Counter

In [14]:
### number of sampled fragments per sample
### NOTE(review): the recorded output has no 'Input_rep6' entry although SAMPLES
### lists it, i.e. no fragment of that sample reached dat_frg -- confirm this
### is expected
Counter(dat_frg["Sample"])

Counter({'Input_rep1': 10000,
         'Input_rep2': 10000,
         'Input_rep3': 10000,
         'Input_rep4': 10000,
         'Input_rep5': 10000,
         'Output_rep1': 4163,
         'Output_rep2': 6970,
         'Output_rep3': 7126,
         'Output_rep4': 10000})

In [15]:
### keys used to look up annotations (fragment ids) and coverage (midpoints)
frgs, locs = dat_frg["Fragment"], dat_frg["Loc"]

**Get coverage**

In [16]:
%%time
gen = get_depth(locs)
dat = pd.DataFrame(gen, columns = ["Loc", "Sample", "Depth", "Trt", "Size"])
dat = dat.assign(Depth_LogNorm = lambda x: np.log2(x.Depth + 1) - np.log2(x.Size))
dat = dat.groupby(["Loc", "Trt"])["Depth_LogNorm"].mean().unstack(level=1)
dat = dat.assign(Log2fc = lambda x: x.Output - x.Input)
dat = dat.reset_index()

dat_cov = dat
print(dat_cov.shape)
dat_cov.head()

(17415, 4)
CPU times: user 1.02 s, sys: 54.8 ms, total: 1.07 s
Wall time: 1.31 s


Trt,Loc,Input,Output,Log2fc
0,48780102,-12.767418,-13.386689,-0.619271
1,48780108,-12.769107,-13.386689,-0.617582
2,48780112,-12.797729,-13.386689,-0.58896
3,48780114,-12.807756,-13.391061,-0.583305
4,48780115,-12.825223,-13.422444,-0.59722


**Get annotations**

In [17]:
%%time
### query the annotation
fpath_db = FPATH_DB_ASTARR
dct = get_annot(frgs, fpath_db)

CPU times: user 13.6 s, sys: 824 ms, total: 14.5 s
Wall time: 29.8 s


In [18]:
### convert dict of dict into a table: one row per fragment, one column per motif
dat_ant_count = (
    pd.DataFrame.from_dict(dct["count"], orient="index")
    .fillna(0)                    # motif absent from a fragment -> 0
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)
dat = dat_ant_count

### show the table
print(dat_ant_count.shape)
dat_ant_count.head()

(56232, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780002_48780202,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48780002_48780314,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48780936_48781524,1.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781012_48781424,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781035_48781370,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
### convert dict of dict into a table: one row per fragment, one column per motif
dat_ant_score = (
    pd.DataFrame.from_dict(dct["score"], orient="index")
    .fillna(0)                    # motif absent from a fragment -> 0
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)
dat = dat_ant_score

### show the table
print(dat_ant_score.shape)
dat_ant_score.head()

(56232, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780002_48780202,8.4641,7.2633,7.3633,13.6886,12.0497,10.9269,6.9956,7.1419,8.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48780002_48780314,8.4641,7.2633,7.3633,13.6886,22.1408,21.16,6.9956,7.1419,8.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48780936_48781524,7.3364,0.0,0.0,9.441,19.8311,26.119,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781012_48781424,7.3364,0.0,0.0,9.441,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781035_48781370,7.3364,0.0,0.0,0.0,19.8311,15.7609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [20]:
### fragment columns first, then coverage, then the motif counts
cnames = ["Sample", "Count", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

dat_astarr_count = (
    dat_frg
    .merge(dat_cov, how="left", on="Loc")[cnames]
    .merge(dat_ant_count, how="left", on="Fragment")
    ### categorical so that sorting follows the order of SAMPLES
    .assign(Sample = lambda d: pd.Categorical(d["Sample"], SAMPLES))
    .sort_values(["Sample", "Loc"])
)
dat = dat_astarr_count

print(dat_astarr_count.shape)
dat_astarr_count.head()

(78259, 281)


Unnamed: 0,Sample,Count,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
8604,Input_rep1,1,chrX_48780017_48780199,chrX,48780017,48780199,48780108,0.565934,-12.769107,-13.386689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4992,Input_rep1,1,chrX_48780026_48780218,chrX,48780026,48780218,48780122,0.557292,-12.830641,-13.474395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,Input_rep1,1,chrX_48780036_48780225,chrX,48780036,48780225,48780131,0.555556,-12.870405,-13.474395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6678,Input_rep1,1,chrX_48780030_48780280,chrX,48780030,48780280,48780155,0.492,-12.988914,-13.345469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7662,Input_rep1,1,chrX_48780025_48780333,chrX,48780025,48780333,48780179,0.457792,-13.120371,-13.436881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
### fragment columns first, then coverage, then the motif scores
cnames = ["Sample", "Count", "Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

dat_astarr_score = (
    dat_frg
    .merge(dat_cov, how="left", on="Loc")[cnames]
    .merge(dat_ant_score, how="left", on="Fragment")
    ### categorical so that sorting follows the order of SAMPLES
    .assign(Sample = lambda d: pd.Categorical(d["Sample"], SAMPLES))
    .sort_values(["Sample", "Loc"])
)
dat = dat_astarr_score

print(dat_astarr_score.shape)
dat_astarr_score.head()

(78259, 281)


Unnamed: 0,Sample,Count,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
8604,Input_rep1,1,chrX_48780017_48780199,chrX,48780017,48780199,48780108,0.565934,-12.769107,-13.386689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4992,Input_rep1,1,chrX_48780026_48780218,chrX,48780026,48780218,48780122,0.557292,-12.830641,-13.474395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,Input_rep1,1,chrX_48780036_48780225,chrX,48780036,48780225,48780131,0.555556,-12.870405,-13.474395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6678,Input_rep1,1,chrX_48780030_48780280,chrX,48780030,48780280,48780155,0.492,-12.988914,-13.345469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7662,Input_rep1,1,chrX_48780025_48780333,chrX,48780025,48780333,48780179,0.457792,-13.120371,-13.436881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [22]:
### output file of the ATAC-STARR-seq motif-count table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_test_ASTARR_zoom1_count.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing folders, so make sure the target exists
os.makedirs(fdiry, exist_ok=True)
dat_astarr_count.to_csv(fpath, index=False)

In [23]:
### output file of the ATAC-STARR-seq motif-score table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_test_ASTARR_zoom1_score.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing folders, so make sure the target exists
os.makedirs(fdiry, exist_ok=True)
dat_astarr_score.to_csv(fpath, index=False)

In [24]:
## check: smallest fragment start and largest fragment end in the stored table
dat = dat_astarr_score
print(dat["Start"].min(), dat["End"].max())

48780002 48825997


## Sampling fragments from TileMPRA

**Get fragments**

In [38]:
%%time
### init
N = 10000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from TileMPRA
gen = get_frag_tmpra(start=start, end=end)
lst = list(gen)
print(len(lst))

1746
CPU times: user 11.2 ms, sys: 2.95 ms, total: 14.2 ms
Wall time: 12.7 ms


In [25]:
%%time
### init
N = 10000
random.seed(123)
start = 48780000
end   = 48826000

### random sample fragments from TileMPRA
gen = get_frag_tmpra(start=start, end=end)
lst = iter_sample_fast(gen, N)

CPU times: user 12 ms, sys: 1.05 ms, total: 13.1 ms
Wall time: 126 ms


In [26]:
# Peek at the first sampled row; the fields follow the SELECT of get_frag_tmpra:
# (fragment, pct_gc, count_input, count_output, log2fc).
lst[0]

('chrX_48787021_48787220',
 0.507538,
 53.2654017707863,
 78.7816894754911,
 0.55615512111572)

In [28]:
### one row per sampled fragment
dat = pd.DataFrame(lst, columns=["Fragment", "Pct_GC", "Input", "Output", "Log2fc"])
### the fragment id is <chrom>_<start>_<end>; split from the right so that a
### chromosome name containing "_" stays in one piece
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})
### Loc = fragment midpoint, rounded up to an integer position
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(1746, 9)


Unnamed: 0,Fragment,Pct_GC,Input,Output,Log2fc,Chrom,Start,End,Loc
0,chrX_48787021_48787220,0.507538,53.265402,78.781689,0.556155,chrX,48787021,48787220,48787121
1,chrX_48817901_48818100,0.552764,363.208436,687.531955,0.917804,chrX,48817901,48818100,48818001
2,chrX_48813601_48813800,0.502513,843.048346,1340.199386,0.66866,chrX,48813601,48813800,48813701
3,chrX_48788801_48789000,0.472362,307.606545,621.335965,1.015533,chrX,48788801,48789000,48788901
4,chrX_48819361_48819560,0.432161,115.73687,246.681037,1.089973,chrX,48819361,48819560,48819461


In [29]:
### fragment ids used to look up the annotations
frgs = dat_frg["Fragment"]

**Get annotations**

In [30]:
%%time
### query the annotation
fpath_db = FPATH_DB_TMPRA
dct = get_annot(frgs, fpath_db)

CPU times: user 458 ms, sys: 78.8 ms, total: 537 ms
Wall time: 826 ms


In [31]:
### convert dict of dict into a table: one row per fragment, one column per motif
dat_ant_count = (
    pd.DataFrame.from_dict(dct["count"], orient="index")
    .fillna(0)                    # motif absent from a fragment -> 0
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)
dat = dat_ant_count

### show the table
print(dat_ant_count.shape)
dat_ant_count.head()

(1746, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780001_48780200,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48781201_48781400,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48781251_48781450,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781301_48781500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781601_48781800,1.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
### convert dict of dict into a table: one row per fragment, one column per motif
dat_ant_score = (
    pd.DataFrame.from_dict(dct["score"], orient="index")
    .fillna(0)                    # motif absent from a fragment -> 0
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)
dat = dat_ant_score

### show the table
print(dat_ant_score.shape)
dat_ant_score.head()

(1746, 271)


Unnamed: 0,Fragment,Mtf_FOX/1,Mtf_HD/20,Mtf_ZNF449,Mtf_RFX/1,Mtf_ZFN121,Mtf_PAX/2,Mtf_YY1,Mtf_NFKB/2,Mtf_SPZ1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
0,chrX_48780001_48780200,8.4641,7.2633,7.3633,13.6886,12.0497,10.9269,6.9956,7.1419,8.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48781201_48781400,7.3364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48781251_48781450,7.3364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48781301_48781500,7.3364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48781601_48781800,7.9644,23.7002,7.2955,5.45645,16.4063,20.4358,0.0,0.0,7.2983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge**

In [33]:
### fragment columns first, then the motif counts
cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]

dat_tmpra_count = (
    dat_frg[cnames]
    .merge(dat_ant_count, how="left", on="Fragment")
    .sort_values(["Loc"])
)
dat = dat_tmpra_count

print(dat_tmpra_count.shape)
dat_tmpra_count.head()

(1746, 279)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_FOX/1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
1600,chrX_48780001_48780200,chrX,48780001,48780200,48780101,0.552764,775.19676,724.09127,-0.096545,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1201,chrX_48780051_48780250,chrX,48780051,48780250,48780151,0.512563,490.329907,685.302462,0.480485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328,chrX_48780101_48780300,chrX,48780101,48780300,48780201,0.477387,287.330074,308.05981,0.103002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1722,chrX_48780151_48780350,chrX,48780151,48780350,48780251,0.417085,38.837467,23.540292,-0.719225,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1041,chrX_48780201_48780400,chrX,48780201,48780400,48780301,0.321608,69.14795,52.45016,-0.405771,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
### fragment columns first, then the motif scores
dat = dat_frg

cnames = ["Fragment", "Chrom", "Start", "End", "Loc", "Pct_GC", "Input", "Output", "Log2fc"]
dat = dat.loc[:, cnames]

dat = dat.merge(dat_ant_score, how="left", on="Fragment") 
dat = dat.sort_values(["Loc"])

dat_tmpra_score = dat
### report the shape of the table built in this cell (the count table was
### printed here before, a copy-paste slip)
print(dat_tmpra_score.shape)
dat_tmpra_score.head()

(1746, 279)


Unnamed: 0,Fragment,Chrom,Start,End,Loc,Pct_GC,Input,Output,Log2fc,Mtf_FOX/1,...,Mtf_MYB/1,Mtf_HINFP1/3,Mtf_ZNF435,Mtf_POU/2,Mtf_HD/17,Mtf_HD/9,Mtf_BCL6/1,Mtf_SOX/7,Mtf_HOMEZ,Mtf_HINFP1/2
1600,chrX_48780001_48780200,chrX,48780001,48780200,48780101,0.552764,775.19676,724.09127,-0.096545,8.4641,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1201,chrX_48780051_48780250,chrX,48780051,48780250,48780151,0.512563,490.329907,685.302462,0.480485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328,chrX_48780101_48780300,chrX,48780101,48780300,48780201,0.477387,287.330074,308.05981,0.103002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1722,chrX_48780151_48780350,chrX,48780151,48780350,48780251,0.417085,38.837467,23.540292,-0.719225,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1041,chrX_48780201_48780400,chrX,48780201,48780400,48780301,0.321608,69.14795,52.45016,-0.405771,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Store**

In [35]:
### output file of the TileMPRA motif-count table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_test_TileMPRA_zoom1_count.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing folders, so make sure the target exists
os.makedirs(fdiry, exist_ok=True)
dat_tmpra_count.to_csv(fpath, index=False)

In [36]:
### output file of the TileMPRA motif-score table
fdiry = os.path.join(FD_RES, "regulatory_landscape")
fname = "dat_test_TileMPRA_zoom1_score.csv"
fpath = os.path.join(fdiry, fname)

### to_csv does not create missing folders, so make sure the target exists
os.makedirs(fdiry, exist_ok=True)
dat_tmpra_score.to_csv(fpath, index=False)

In [37]:
## check: smallest fragment start and largest fragment end in the stored table
dat = dat_tmpra_score
print(dat["Start"].min(), dat["End"].max())

48780001 48826000
