**Set environment**

In [1]:
import sys
sys.path.append('../')
from config_sing import *

You are on Duke Server: Singularity: Proj CombEffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect



In [2]:
### import tools
import sqlite3
import itertools as it
from functools import reduce
from collections import defaultdict
from mpl_toolkits.axes_grid1 import make_axes_locatable
### Set plotting style
### ("seaborn-whitegrid" was renamed to "seaborn-v0_8-whitegrid" in matplotlib 3.6
###  and the old name was removed in 3.8; use whichever name this install provides)
STYLE_NAME = "seaborn-v0_8-whitegrid"
if STYLE_NAME not in plt.style.available:
    STYLE_NAME = "seaborn-whitegrid"
plt.style.use(STYLE_NAME)

### Set Samples
### Sample names are built by element-wise string concatenation:
###   prefix + replicate number + suffix
### np.char.add is the public alias of np.core.defchararray.add; the np.core
### namespace is private and deprecated since NumPy 2.0.
fun = np.char.add
idx = np.arange(1,6).astype("str")   # replicate numbers "1" .. "5"

### Input uses replicates 1-5; TFX conditions use replicates 2-5 (idx[1:])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])

### Location of the SQLite database: <FD_RES>/database/fragment_chr17.db
fname    = "fragment_chr17.db"
fdiry    = os.path.join(FD_RES, "database")
FPATH_DB = os.path.join(fdiry, fname)

In [3]:
### set query: names of all tables registered in the database schema
query_out = """
    SELECT name
    FROM sqlite_master
    WHERE type = 'table';
    """

### get table names
### NOTE: `with sqlite3.connect(...)` only scopes a transaction and does not
### close the connection, so the connection is closed explicitly here.
fpath_db = FPATH_DB
conn = sqlite3.connect(fpath_db)
try:
    query  = query_out
    cursor = conn.execute(query)
    rows   = cursor.fetchall()
    rows   = np.sort([row[0] for row in rows])   # each row is a 1-tuple (name,)
finally:
    conn.close()

for row in rows:
    print(row)

Annotation
Count
Coverage
Fragment
Motif
Sample


In [4]:
### set query: names of all indexes registered in the database schema
query_out = """
    SELECT name
    FROM sqlite_master
    WHERE type = 'index';
    """

### get index names
### NOTE: `with sqlite3.connect(...)` only scopes a transaction and does not
### close the connection, so the connection is closed explicitly here.
fpath_db = FPATH_DB
conn = sqlite3.connect(fpath_db)
try:
    query  = query_out
    cursor = conn.execute(query)
    rows   = cursor.fetchall()
    rows   = np.sort([row[0] for row in rows])   # each row is a 1-tuple (name,)
finally:
    conn.close()

for row in rows:
    print(row)

idx_annot_frag
idx_count_sample
idx_cov_loc
idx_frag_loc
idx_motif_loc
sqlite_autoindex_Annotation_1
sqlite_autoindex_Fragment_1
sqlite_autoindex_Motif_1
sqlite_autoindex_Sample_1


## Helper functions

In [5]:
def get_frag(sample, fpath_db = FPATH_DB):
    """Lazily yield the rows of the Count table that belong to one sample.

    Parameters
    ----------
    sample : str
        Sample name matched against ``Count.sample``.
    fpath_db : str
        Path of the SQLite database file.

    Yields
    ------
    tuple
        ``(fragment, count, sample)`` for each matching row.

    Notes
    -----
    The connection is closed when the generator is exhausted, closed, or
    garbage-collected, so taking only the first few rows (e.g. with
    ``itertools.islice``) does not leave a connection open.
    """
    ### set query (the sample name is bound as a parameter, not pasted into the SQL)
    query = """
        SELECT Cnt.fragment, Cnt.count, Cnt.sample
        FROM   Count Cnt
        WHERE  Cnt.sample = ?
        """

    ### query out fragments from the sample
    conn = sqlite3.connect(fpath_db)
    try:
        # str() so that numpy string scalars bind as plain text
        yield from conn.execute(query, (str(sample),))
    finally:
        conn.close()

In [6]:
def get_annot1(frag, fpath_db=FPATH_DB, verbose=False):
    """Sum motif scores of a fragment using a single JOIN query.

    The Annotation table is joined to the Motif table on ``binding`` and
    restricted to rows whose ``Annotation.fragment`` equals ``frag``.

    Parameters
    ----------
    frag : str
        Fragment identifier matched against ``Annotation.fragment``.
    fpath_db : str
        Path of the SQLite database file.
    verbose : bool
        If True, print the number of bindings and distinct motifs found.

    Returns
    -------
    collections.defaultdict
        Maps motif name -> summed score; missing motifs default to 0.
        Empty when the fragment has no annotated bindings.
    """
    ### set query (the fragment is bound as a parameter, not pasted into the SQL)
    query = """
        SELECT Mtf.motif, Mtf.score
        FROM Annotation Ant
        JOIN Motif      Mtf ON Ant.binding = Mtf.binding
        WHERE Ant.fragment = ?
        ;"""

    ### query out motif bindings on the fragment
    ### (rows are fetched before the connection is closed; `with conn:` would
    ###  only scope a transaction and leave the connection open)
    conn = sqlite3.connect(fpath_db)
    try:
        rows = conn.execute(query, (str(frag),)).fetchall()
    finally:
        conn.close()

    ### summarize the motif scores: sum the scores with same motifs
    dct = defaultdict(int)
    for motif, score in rows:
        dct[motif] += score

    if verbose:
        # report the function's own argument and a count that is valid for zero rows
        print(f"Fragment: {frag}; # Bindings: {len(rows)}; # Motifs: {len(dct)}")

    return dct

In [7]:
def get_annot2(frag, fpath_db=FPATH_DB, verbose=False):
    """Sum motif scores of a fragment using a coordinate-range query on Motif.

    The fragment name is parsed as ``<chrom>_<start>_<end>`` and every Motif
    row with ``start >= <start>`` and ``end <= <end>`` is selected.

    Parameters
    ----------
    frag : str
        Fragment identifier of the form ``<chrom>_<start>_<end>``.
    fpath_db : str
        Path of the SQLite database file.
    verbose : bool
        If True, print the number of bindings and distinct motifs found.

    Returns
    -------
    collections.defaultdict
        Maps motif name -> summed score; missing motifs default to 0.

    Raises
    ------
    ValueError
        If ``frag`` does not end in two integer coordinates.
    """
    ### parse fragment info; split from the right so that chromosome names
    ### containing underscores are still parsed correctly
    chrom, start, end = str(frag).rsplit("_", 2)

    ### set query (coordinates are bound as integer parameters)
    ### NOTE(review): the query does not filter on chromosome; presumably the
    ### database only holds one chromosome -- confirm before reusing elsewhere
    query = """
        SELECT Mtf.motif, Mtf.score
        FROM Motif Mtf
        WHERE Mtf.start >= ? AND Mtf.end <= ?
        ;"""

    ### query out motif bindings on the fragment
    ### (rows are fetched before the connection is closed; `with conn:` would
    ###  only scope a transaction and leave the connection open)
    conn = sqlite3.connect(fpath_db)
    try:
        rows = conn.execute(query, (int(start), int(end))).fetchall()
    finally:
        conn.close()

    ### summarize the motif scores: sum the scores with same motifs
    dct = defaultdict(int)
    for motif, score in rows:
        dct[motif] += score

    if verbose:
        # report the function's own argument and a count that is valid for zero rows
        print(f"Fragment: {frag}; # Bindings: {len(rows)}; # Motifs: {len(dct)}")
    return dct

In [8]:
def get_annot3(frg, fpath_db=FPATH_DB, verbose=False):
    """Sum motif scores of a fragment using two separate queries.

    Step 1 selects every ``binding`` annotated to the fragment from the
    Annotation table; step 2 looks those bindings up in the Motif table with
    an ``IN (...)`` list.

    Parameters
    ----------
    frg : str
        Fragment identifier matched against ``Annotation.fragment``.
    fpath_db : str
        Path of the SQLite database file.
    verbose : bool
        If True, print the number of bindings and distinct motifs found.

    Returns
    -------
    collections.defaultdict
        Maps motif name -> summed score; missing motifs default to 0.
        Empty when the fragment has no annotated bindings.
    """
    conn = sqlite3.connect(fpath_db)
    try:
        ### set query: query all motifs binding to a given fragment
        query = """
            SELECT Ant.binding
            FROM Annotation Ant
            WHERE Ant.fragment = ?
            ;"""

        ### query out motif bindings on the fragment (each row is a 1-tuple)
        mtfs = [binding for (binding,) in conn.execute(query, (str(frg),))]

        ### set query: query the corresponding motif info of each annotation
        ### (skipped when the fragment has no bindings, leaving `rows` empty)
        rows = []
        if mtfs:
            # one "?" placeholder per binding
            # NOTE(review): SQLite limits the number of bound parameters per
            # statement; very large binding lists may need to be chunked
            txt   = ','.join('?' * len(mtfs))
            query = f"""
                SELECT Mtf.motif, Mtf.score
                FROM Motif Mtf
                WHERE Mtf.binding IN ({txt})
                ;"""

            ### query out motif info
            rows = conn.execute(query, mtfs).fetchall()
    finally:
        # `with conn:` would only scope a transaction; close explicitly
        conn.close()

    ### summarize the motif scores: sum the scores with same motifs
    dct = defaultdict(int)
    for motif, score in rows:
        dct[motif] += score

    if verbose:
        # the count is valid for zero rows
        print(f"Fragment: {frg}; # Bindings: {len(rows)}; # Motifs: {len(dct)}")

    return dct

## Query fragments

**Select a few fragments from each sample**

In [9]:
N = 1               # number of fragments to take from each sample
lst_frg = list()    # one (n_rows, 3) string array per sample

for sam in SAMPLES:
    print(sam)
    
    ### select a few fragments
    gen = get_frag(sam)
    lst = it.islice(gen, N)
    lst = list(lst)
    
    ### arrange and collect
    ### Sort whole rows by fragment name. np.sort(lst, axis=0) would sort each
    ### column independently and break the (fragment, count, sample) pairing
    ### as soon as N > 1.
    lst = sorted(lst, key=lambda row: row[0])
    ### reshape keeps a (0, 3) shape for samples without fragments so that the
    ### arrays can still be concatenated later
    lst = np.array(lst, dtype=str).reshape(-1, 3)
    lst_frg.append(lst)

Input1_20x
Input2_20x
Input3_20x
Input4_20x
Input5_20x
TFX2_DMSO
TFX3_DMSO
TFX4_DMSO
TFX5_DMSO
TFX2_Dex
TFX3_Dex
TFX4_Dex
TFX5_Dex


**Arrange fragments into a table**

In [10]:
### stack the per-sample arrays into one table (all values are strings here)
frg_table = pd.DataFrame(
    np.concatenate(lst_frg),
    columns=["Fragment", "Count", "Sample"],
)

### split "<chrom>_<start>_<end>" into its three parts
coords = frg_table["Fragment"].str.split("_", expand=True)
coords.columns = ["Chrom", "Start", "End"]

### cast the numeric columns, add midpoint / length, and index by fragment
dat = (
    frg_table
    .join(coords)
    .astype({"Start": int, "End": int, "Count": int})
    .assign(
        Mid=lambda x: np.ceil((x.Start + x.End) / 2).astype(int),
        Len=lambda x: x.End - x.Start,
    )
    .set_index("Fragment")
)

dat_frg = dat
dat_frg

Unnamed: 0_level_0,Count,Sample,Chrom,Start,End,Mid,Len
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chr17_107410_108464,1,Input1_20x,chr17,107410,108464,107937,1054
chr17_158043_159067,1,Input2_20x,chr17,158043,159067,158555,1024
chr17_158183_159063,1,Input3_20x,chr17,158183,159063,158623,880
chr17_152808_153715,1,Input4_20x,chr17,152808,153715,153262,907
chr17_120797_122228,1,Input5_20x,chr17,120797,122228,121513,1431
chr17_87067_87989,1,TFX2_DMSO,chr17,87067,87989,87528,922
chr17_159962_160793,1,TFX3_DMSO,chr17,159962,160793,160378,831
chr17_201155_201962,1,TFX4_DMSO,chr17,201155,201962,201559,807
chr17_101601_102522,1,TFX5_DMSO,chr17,101601,102522,102062,921
chr17_83638_84547,1,TFX2_Dex,chr17,83638,84547,84093,909


## Query annotations

In [11]:
### sorted, de-duplicated fragment names taken from the table index
frags = np.unique(dat.index.to_numpy())

In [12]:
### choose the query strategy used to annotate the fragments
get_annot = get_annot1

### fragment -> {motif: summed score}
dct = {}
for frg in frags:
    dct[frg] = get_annot(frg)

### fragments x motifs matrix; a motif absent from a fragment gets score 0
dat_ant = (
    pd.DataFrame.from_dict(dct, orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
)
dat_ant

Unnamed: 0_level_0,Mtf_SREBF1,Mtf_ZBTB48,Mtf_MEF2,Mtf_ZNF250,Mtf_Ebox/CAGATGG,Mtf_ZSCAN3,Mtf_KLF/SP/2,Mtf_GC-tract,Mtf_GCM,Mtf_OSR2,...,Mtf_GMEB2/1,Mtf_SOX/4,Mtf_SOX/6,Mtf_PROX1,Mtf_CUX/2,Mtf_HD/11,Mtf_MYB/1,Mtf_ZNF435,Mtf_OCT4+SOX2,Mtf_FOX/2
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr17_101601_102522,27.1003,8.0202,8.5217,4.4231,7.8695,18.8976,47.535633,58.819195,8.86135,26.6573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_107410_108464,7.5326,0.0,17.2987,1.0741,16.5796,8.741,48.25125,88.0813,7.32,17.758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_152808_153715,38.2407,0.0,8.777,0.0,65.002433,17.2574,57.9984,70.686,0.0,17.0657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_158043_159067,59.3961,8.1082,8.777,0.0,18.2035,16.197,94.276309,120.924275,0.0,29.3262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_158183_159063,50.9617,8.1082,8.777,0.0,7.6651,16.197,65.376922,86.528767,0.0,8.664,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_159962_160793,8.2689,0.0,10.7556,0.0,17.6839,0.0,112.679756,90.63012,0.0,27.1736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_160133_161194,25.93045,8.4197,10.7556,0.0,43.0189,0.0,126.516942,104.33421,0.0,18.2871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_201155_201962,58.4664,9.175,0.0,1.2716,52.0986,0.0,109.25656,69.6915,48.513,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_83638_84547,7.7626,0.0,8.9194,0.0,12.4871,0.0,43.71505,57.563519,0.0,8.1787,...,0.0,2.8792,1.6703,7.7494,0.0,0.0,0.0,0.0,0.0,0.0
chr17_120738_123102,0.0,0.0,6.9162,0.0,7.5921,7.7812,24.1662,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Test the queries with a single fragment**

In [13]:
### use the first fragment as the test case for comparing the query functions
frg = frags[0]
print(frg)

chr17_101601_102522


In [15]:
### run the three query strategies, in order, on the same fragment
dct1, dct2, dct3 = [
    query_fun(frg, verbose=True)
    for query_fun in (get_annot1, get_annot2, get_annot3)
]

Fragment: chr17_101601_102522; # Bindings: 157; # Motifs: 100
Fragment: chr17_101601_102522; # Bindings: 157; # Motifs: 100
Fragment: chr17_101601_102522; # Bindings: 157; # Motifs: 100


In [16]:
### check results: the second and third strategies must reproduce the first
print(dct1 == dct2, dct1 == dct3, sep="\n")

True
True


In [17]:
%%timeit
### time the single-JOIN query (Annotation joined to Motif on binding)
dct1 = get_annot1(frg)

1.29 ms ± 21.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit
### time the coordinate-range query on the Motif table alone
dct2 = get_annot2(frg)

2.87 s ± 343 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
### time the two-step query (Annotation lookup, then Motif IN-list lookup)
dct3 = get_annot3(frg)

1.78 ms ± 10.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
### rough wall-clock estimate for running one get_annot1 query per fragment
ms_per_query = 1.29       # mean time of one get_annot1 call in ms (from its %%timeit run)
n_queries    = 1281022    # presumably the total number of fragments to annotate -- TODO confirm
est_minutes  = ms_per_query * n_queries / 1000 / 60   # ms -> s -> min
est_minutes

27.541973000000002