# Demonstration of fragment database

**Set environment**

In [1]:
### import tools
import numpy  as np
import pandas as pd
import os, sys
import sqlite3
import itertools as it
from functools import reduce
from collections import defaultdict

### global variables: file path
FD_PREFIX = "/gpfs/fs1"
FD_WORK   = os.path.join(FD_PREFIX, "data", "reddylab", "Kuei")
FD_RES    = os.path.join(FD_WORK, "out/proj_combeffect")
FPATH_DB  = os.path.join(FD_RES, "database", "fragment.db")

### global variables: samples
fun = np.core.defchararray.add
idx = np.arange(1,6).astype("str")

INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])

Note: the file path of the database file in HARDAC

In [2]:
print(FPATH_DB)

/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/database/fragment.db


## Tables in fragment database

**There are six tables in total**
- **Sample**
```
sample    TEXT PRIMARY KEY, 
treatment TEXT,
size      INTEGER
```

- **Fragment**
```
fragment TEXT PRIMARY KEY, 
chrom    TEXT,
start    INTEGER,
end      INTEGER,
pct_at   REAL, (AT%)
pct_gc   REAL, (GC%; GC content)
num_A    INTEGER,
num_C    INTEGER,
num_G    INTEGER,
num_T    INTEGER,
num_N    INTEGER,
num_oth  INTEGER
```

- **Motif**
```
binding TEXT PRIMARY KEY, 
chrom   TEXT,
start   INTEGER,
end     INTEGER,
motif   TEXT,
score   REAL
```

- **Annotation**
```
fragment TEXT, 
binding  TEXT,
FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
FOREIGN KEY (binding)  REFERENCES Motif    (binding),
UNIQUE (fragment, binding) ON CONFLICT IGNORE
```

- **Count**
```
fragment TEXT, 
sample   TEXT,
count    INTEGER,
FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
FOREIGN KEY (sample)   REFERENCES Sample   (sample)
```

- **Coverage**
```
chrom    TEXT,
location INTEGER,
depth    INTEGER,
sample   TEXT,
FOREIGN KEY (sample) REFERENCES Sample (sample)
```

In [3]:
### set query
query_out = """
    SELECT name 
    FROM sqlite_master 
    WHERE type = 'table';
    """

### get table names
fpath_db = FPATH_DB
with sqlite3.connect(fpath_db) as conn:
    query  = query_out
    cursor = conn.cursor()
    cursor.execute(query)
    rows   = cursor.fetchall()
rows

[('Sample',),
 ('Fragment',),
 ('Motif',),
 ('Count',),
 ('Coverage',),
 ('Annotation',)]

**Index:** Besides the primary keys of each table, I have further created two indices.
```
CREATE INDEX idx_motif_loc  ON Motif (start, end)
CREATE INDEX idx_location   ON Coverage (location)
CREATE INDEX idx_annot_frag ON Annotation (fragment)
```

In [4]:
### set query
query_out = """
    SELECT name 
    FROM sqlite_master 
    WHERE type = 'index';
    """

### get index names
fpath_db = FPATH_DB
with sqlite3.connect(fpath_db) as conn:
    query  = query_out
    cursor = conn.cursor()
    cursor.execute(query)
    rows   = cursor.fetchall()
rows

[('sqlite_autoindex_Sample_1',),
 ('sqlite_autoindex_Fragment_1',),
 ('sqlite_autoindex_Motif_1',),
 ('idx_location',),
 ('idx_motif_loc',),
 ('sqlite_autoindex_Annotation_1',),
 ('idx_annot_frag',)]

## Helper function

In [5]:
def get_frag(sample, fpath_db = FPATH_DB):
    ### set query
    query_out = f"""
        SELECT Cnt.fragment, Cnt.count, Cnt.sample
        FROM   Count Cnt
        WHERE  Cnt.sample = '{sample}'
        """
    
    ### query out fragments from the sample
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        for row in rows:
            yield row

In [6]:
def get_annot(frg, fpath_db=FPATH_DB, verbose=False):
    ### parse fragment info
    chrom, start, end = frg.split("_")
    
    with sqlite3.connect(fpath_db) as conn:
        ### set query: query all motifs binding to a given fragment
        query_out = f"""
            SELECT Ant.binding
            FROM Annotation Ant
            WHERE Ant.fragment = '{frg}'
            ;"""    
        
        ### query out motif bindings on the fragment
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
    
        ### set query: query the corresponding motif info of each annotation
        fun  = lambda x, y: x + y
        mtfs = list(rows)
        mtfs = reduce(fun, mtfs)
        txt  = ','.join('?' * len(mtfs))

        query_out = f"""
            SELECT Mtf.motif, Mtf.score
            FROM Motif Mtf
            WHERE Mtf.binding IN ({txt})
            ;"""
    
        ### query out motif info
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query, mtfs)
    
    ### summarize the motif scores
    dct = defaultdict(lambda: 0)
    for idx, row in enumerate(rows):
        ### parse info
        motif, score = row

        ### sum the scores with same motifs
        dct[motif] += score
        
    if verbose:
        print(f"Fragment: {frg}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
    
    return dct

## Query fragments

**Select a few fragments from each sample**

In [7]:
N = 5
lst_frg = list()

for sam in SAMPLES:
    print(sam)
    
    ### select a few fragments
    gen = get_frag(sam)
    lst = it.islice(gen, N)
    lst = list(lst)
    
    ### arrange and collect
    lst = np.sort(lst, axis=0)
    lst_frg.append(lst)

Input1_20x
Input2_20x
Input3_20x
Input4_20x
Input5_20x
TFX2_DMSO
TFX3_DMSO
TFX4_DMSO
TFX5_DMSO
TFX2_Dex
TFX3_Dex
TFX4_Dex
TFX5_Dex


**Arrange fragments into a table**

In [8]:
dat = pd.DataFrame(np.concatenate(lst_frg), columns=["Fragment", "Count", "Sample"])
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.split('_', expand=True)

dat = dat.astype({"Start": int, "End": int, "Count": int})
dat = dat.assign(Mid = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.assign(Len = lambda x: x.End - x.Start)
dat = dat.astype({"Mid": int})

dat = dat.set_index("Fragment")
dat_frg = dat
dat_frg

Unnamed: 0_level_0,Count,Sample,Chrom,Start,End,Mid,Len
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chr17_8148003_8148983,1,Input1_20x,chr17,8148003,8148983,8148493,980
chr17_8148004_8148925,1,Input1_20x,chr17,8148004,8148925,8148465,921
chr17_8148004_8148962,1,Input1_20x,chr17,8148004,8148962,8148483,958
chr17_8148004_8148963,1,Input1_20x,chr17,8148004,8148963,8148484,959
chr17_8148005_8149014,3,Input1_20x,chr17,8148005,8149014,8148510,1009
...,...,...,...,...,...,...,...
chr17_8148018_8148882,1,TFX5_Dex,chr17,8148018,8148882,8148450,864
chr17_8148019_8148882,1,TFX5_Dex,chr17,8148019,8148882,8148451,863
chr17_8148056_8149063,1,TFX5_Dex,chr17,8148056,8149063,8148560,1007
chr17_8148195_8149069,1,TFX5_Dex,chr17,8148195,8149069,8148632,874


## Query annotations

In [9]:
frags = dat.index.to_numpy()
frags = np.unique(frags)

In [10]:
dct = dict()
for frg in frags:
    dct[frg] = get_annot(frg)
    
dat_ant = pd.DataFrame.from_dict(dct, orient="index").fillna(0)
dat_ant = dat_ant.add_prefix("Mtf_")
dat_ant.index.name = 'Fragment'
dat_ant

Unnamed: 0_level_0,Mtf_GLIS,Mtf_GC-tract,Mtf_ZNF680,Mtf_GCM,Mtf_NR/19,Mtf_ZNF143,Mtf_ZFN121,Mtf_PLAG1,Mtf_CTCF,Mtf_NR/16,...,Mtf_ZNF418,Mtf_ZNF85,Mtf_FOX/1,Mtf_PRDM1,Mtf_MBD2,Mtf_STAT/2,Mtf_STAT/1,Mtf_RUNX/1,Mtf_SCRT1,Mtf_ZBTB6
Fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr17_8148000_8148887,23.1883,91.402615,15.3419,7.5087,23.57,38.43685,4.971,11.3956,24.1703,44.586567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148000_8148888,23.1883,91.402615,15.3419,7.5087,23.57,38.43685,4.971,11.3956,24.1703,44.586567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148000_8148993,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,4.971,11.3956,33.3872,55.261133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148001_8148888,23.1883,91.402615,15.3419,7.5087,23.57,38.43685,4.971,11.3956,24.1703,44.586567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148002_8148836,23.1883,91.402615,15.3419,7.5087,15.8511,38.43685,4.971,11.3956,24.1703,36.011967,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148002_8149023,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,19.0648,11.3956,33.3872,55.261133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148002_8149046,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,19.0648,11.3956,33.3872,55.261133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148002_8149066,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,19.0648,11.3956,33.3872,55.261133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148002_8149072,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,19.0648,11.3956,33.3872,55.261133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr17_8148003_8148983,23.1883,114.212037,15.3419,7.5087,23.57,54.69065,4.971,11.3956,33.3872,44.586567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Time different query method

In [11]:
def get_annot_old1(frag, fpath_db=FPATH_DB, verbose=False):
    ### set query
    query_out = f"""
        SELECT Mtf.motif, Mtf.score
        FROM Annotation Ant
        JOIN Motif      Mtf ON Ant.binding = Mtf.binding
        WHERE Ant.fragment = '{frag}'
        ;"""
    
    ### query out motif bindings on the fragment
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
        
    ### summarize the motif scores
    dct = defaultdict(lambda: 0)
    for idx, row in enumerate(rows):
        ### parse info
        motif, score = row

        ### sum the scores with same motifs
        dct[motif] += score
        
    if verbose:
        print(f"Fragment: {frg}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
            
    return dct

In [12]:
def get_annot_old2(frag, fpath_db=FPATH_DB, verbose=False):
    ###
    chrom, start, end = frag.split("_")
    
    ### set query
    query_out = f"""
        SELECT Mtf.motif, Mtf.score
        FROM Motif Mtf
        WHERE Mtf.start >= {start} AND Mtf.end <= {end}
        ;"""
        
    ### query out motif bindings on the fragment
    with sqlite3.connect(fpath_db) as conn:
        query  = query_out
        cursor = conn.cursor()
        rows   = cursor.execute(query)
    
    ### summarize the motif scores
    dct = defaultdict(lambda: 0)
    for idx, row in enumerate(rows):
        ### parse info
        motif, score = row

        ### sum the scores with same motifs
        dct[motif] += score
        
    if verbose:
        print(f"Fragment: {frg}; # Bindings: {idx+1}; # Motifs: {len(dct)}")
    return dct

**Test the queries with a single fragment**

In [13]:
frag = frags[0]
print(frag)

chr17_8148000_8148887


In [14]:
dct1 = get_annot(frg, verbose=True)
dct2 = get_annot_old1(frg, verbose=True)
dct3 = get_annot_old2(frg, verbose=True)

Fragment: chr17_8148484_8149538; # Bindings: 273; # Motifs: 124
Fragment: chr17_8148484_8149538; # Bindings: 273; # Motifs: 124
Fragment: chr17_8148484_8149538; # Bindings: 273; # Motifs: 124


In [15]:
### check results
print(dct1 == dct2)
print(dct1 == dct3)

True
True


In [16]:
%%timeit
dct1 = get_annot(frg)

2.46 ms ± 96 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%%timeit
dct2 = get_annot_old1(frg)

1.53 ms ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit
dct3 = get_annot_old2(frg)

1.43 s ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
