**Set environment**

In [10]:
### basic
import sys
sys.path.append('../')
from config_sing import *
from config_func import *

### specific tools
from collections import defaultdict
from functools import reduce
import itertools as it
import sqlite3
import random
# Bind NumPy integers as plain Python ints when they are passed to SQLite,
# so they are not stored as BLOBs; background:
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### file path of fragment database
### NOTE(review): FD_RES and os are not defined in this cell; presumably they
### come from the star imports of config_sing / config_func -- confirm.
### fdiry / fname are scratch names that are overwritten for each database.
fdiry = os.path.join(FD_RES, "KS91_K562_ASTARRseq", 'database')
fname = "fragment_astarr_gata1.db"
FPATH_DB_ASTARR = os.path.join(fdiry, fname)

fdiry = os.path.join(FD_RES, "Tewhey_K562_TileMPRA", 'database')
fname = "fragment_tilempra_gata1.db"
FPATH_DB_TMPRA = os.path.join(fdiry, fname)

### both database paths, ATAC-STARR-seq first and TileMPRA second
FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### samples of ATAC-STARR-seq
### these names are passed to get_frag_astarr, which matches them against Count.sample
SAMPLES = ["Output_rep1", "Output_rep2", "Output_rep3", "Output_rep4"]

## Helper functions

In [3]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize):
    """Draw up to `samplesize` items from an iterable of unknown length in one pass.

    The first `samplesize` items seed a reservoir; every later item replaces a
    random reservoir slot with decreasing probability. If the iterable holds
    fewer than `samplesize` items, all of them are returned in shuffled order.
    Randomness comes from the module-level `random` generator.
    """
    source = iter(iterable)
    reservoir = []

    ### seed the reservoir with the leading items
    for _ in range(samplesize):
        try:
            reservoir.append(next(source))
        except StopIteration:
            ### the iterable ran dry first: hand back everything we saw
            random.shuffle(reservoir)
            return reservoir

    ### randomize slot positions before the replacement phase
    random.shuffle(reservoir)

    ### `position` is the 0-based index of the current item in the full stream
    position = samplesize
    for item in source:
        slot = random.randint(0, position)
        if slot < samplesize:
            reservoir[slot] = item
        position += 1
    return reservoir

In [4]:
def get_frag_astarr(sample, fpath_db = FPATH_DB_ASTARR):
    """Yield ATAC-STARR-seq fragments of one sample, each repeated `count` times.

    Repeating a fragment once per count means that uniform sampling from this
    generator picks fragments with probability proportional to their count.

    Parameters
    ----------
    sample : str
        Sample name matched against the `sample` column of the `Count` table.
    fpath_db : str
        Path to the SQLite fragment database.

    Yields
    ------
    tuple
        `(fragment, sample)`, taken from the first two columns of each row.

    Notes
    -----
    Each `Count` row is unpacked as `(fragment, sample, count)`, so the table
    is expected to have exactly these three columns in this order.
    """
    ### the sample name is bound as a parameter, never spliced into the SQL text
    query = """
        SELECT *
        FROM   Count Cnt
        WHERE  Cnt.sample = ?
        """

    ### `with sqlite3.connect(...)` only commits/rolls back and leaves the
    ### connection open, so it is closed explicitly here, including when the
    ### generator is abandoned before it is exhausted
    conn = sqlite3.connect(fpath_db)
    try:
        for frg, row_sample, count in conn.execute(query, (sample,)):
            ### repeat each fragment based on its count
            for _ in range(count):
                yield frg, row_sample
    finally:
        conn.close()
                
                
def get_frag_tmpra(fpath_db = FPATH_DB_TMPRA):
    """Yield TileMPRA fragments, each repeated ceil(2 ** log2fc) times.

    Repeating a fragment according to its fold change means that uniform
    sampling from this generator picks fragments with probability roughly
    proportional to fold change. Because of the ceiling, every fragment with a
    positive ratio is yielded at least once.

    Parameters
    ----------
    fpath_db : str
        Path to the SQLite fragment database.

    Yields
    ------
    tuple
        `(fragment, count_input, count_output, log2fc)` as read from the
        `Count` table.

    Notes
    -----
    Rows whose `log2fc` is NULL, or whose ratio is not finite, cannot be turned
    into a repeat count and are skipped.
    """
    query = """
        SELECT Cnt.fragment, Cnt.count_input, Cnt.count_output, Cnt.log2fc
        FROM   Count Cnt"""

    ### `with sqlite3.connect(...)` does not close the connection, so close it
    ### explicitly once the generator finishes or is discarded
    conn = sqlite3.connect(fpath_db)
    try:
        for frg, inp, out, log2fc in conn.execute(query):
            ### no usable weight for this row
            if log2fc is None:
                continue
            ratio = np.exp2(log2fc)
            if not np.isfinite(ratio):
                continue

            ### repeat each fragment based on its ratio
            ### (builtin int: the `np.int` alias was removed in NumPy 1.24)
            count = int(np.ceil(ratio))
            for _ in range(count):
                yield frg, inp, out, log2fc
    finally:
        conn.close()

In [6]:
def get_annot(frgs, fpath_db):
    """Summarize motif annotations for the given fragments.

    Parameters
    ----------
    frgs : iterable of str
        Fragment ids to look up. Any iterable works (list, pandas Series,
        generator); it is materialized into a list once.
    fpath_db : str
        Path to a SQLite database with tables `Annotation` (Fragment, binding)
        and `Motif` (binding, motif, score).

    Returns
    -------
    dict
        `{"count": ..., "score": ...}`. Each value is a nested defaultdict
        mapping fragment -> motif -> number of matching rows ("count") or
        summed motif score ("score"). Fragments without any annotation row do
        not appear.

    Notes
    -----
    Every fragment becomes one bound `?` parameter, so very long inputs can
    exceed SQLite's per-statement parameter limit.
    """
    ### materialize once: the ids are needed both for the placeholders and as
    ### bound parameters, and sqlite3 expects a real sequence
    frgs = list(frgs)

    ### nested accumulators; missing entries start at 0 / 0.0
    dct_ann_count = defaultdict(lambda: defaultdict(int))
    dct_ann_score = defaultdict(lambda: defaultdict(float))
    dct_ann = {"count": dct_ann_count, "score": dct_ann_score}

    ### nothing to look up: skip the query instead of emitting `IN ()`
    if len(frgs) == 0:
        return dct_ann

    ### set query
    txt   = ', '.join('?' * len(frgs))
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """

    ### query the database and consume the rows while the connection is open;
    ### `with sqlite3.connect(...)` would not close it, so close it explicitly
    conn = sqlite3.connect(fpath_db)
    try:
        for frg, mtf, val in conn.execute(query, frgs):
            ### count and sum the annotation scores
            dct_ann_count[frg][mtf] += 1
            dct_ann_score[frg][mtf] += val
    finally:
        conn.close()

    return dct_ann

**Test**

In [11]:
%%time

### init
### N is the number of fragments requested per sample; Python's random module
### is seeded before sampling so iter_sample_fast draws from a fixed state
N = 10
random.seed(123)
lst_frg = []

### random sample fragments from ATAC-STARR-seq
### each sample contributes up to N (fragment, sample) tuples to lst_frg
for sample in SAMPLES:
    gen = get_frag_astarr(sample)
    lst = iter_sample_fast(gen, N)
    lst_frg += lst

CPU times: user 5.79 s, sys: 94.4 ms, total: 5.89 s
Wall time: 24.3 s


In [12]:
### peek at the first sampled record: a (fragment, sample) tuple
lst_frg[0]

('chrX_48028863_48029070', 'Output_rep1')

In [13]:
### build the fragment table from the sampled (fragment, sample) tuples
dat = pd.DataFrame(lst_frg, columns=["Fragment", "Sample"])

### fragment ids look like "<chrom>_<start>_<end>"; split from the right so
### that chromosome/contig names containing '_' stay intact in 'Chrom'
dat[['Chrom', 'Start', 'End']] = dat['Fragment'].str.rsplit('_', n=2, expand=True)
dat = dat.astype({"Start": int, "End": int})

### Loc is the fragment midpoint, rounded up to an integer position
dat = dat.assign(Loc = lambda x: np.ceil((x.Start + x.End) / 2))
dat = dat.astype({"Loc": int})

dat_frg = dat
print(dat_frg.shape)
dat_frg.head()

(40, 6)


Unnamed: 0,Fragment,Sample,Chrom,Start,End,Loc
0,chrX_48028863_48029070,Output_rep1,chrX,48028863,48029070,48028967
1,chrX_48737386_48737695,Output_rep1,chrX,48737386,48737695,48737541
2,chrX_48633182_48633325,Output_rep1,chrX,48633182,48633325,48633254
3,chrX_49043433_49043609,Output_rep1,chrX,49043433,49043609,49043521
4,chrX_49290019_49290383,Output_rep1,chrX,49290019,49290383,49290201


In [15]:
### fragment ids (the "Fragment" column of dat_frg) to look up in the annotation database
frgs = dat_frg["Fragment"]

In [16]:
%%time
### query the annotation
### dct holds two nested mappings, "count" and "score", each fragment -> motif -> value
fpath_db = FPATH_DB_ASTARR
dct = get_annot(frgs, fpath_db)

CPU times: user 29.1 ms, sys: 16.6 ms, total: 45.6 ms
Wall time: 5.79 s


In [17]:
### top-level entries of the annotation summary returned by get_annot
dct.keys()

dict_keys(['count', 'score'])

In [19]:
### reshape the nested {fragment: {motif: count}} mapping into a wide table:
### one row per fragment, one "Mtf_"-prefixed column per motif, gaps filled with 0
dat = (
    pd.DataFrame.from_dict(dct["count"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
dat_ant_count = dat
print(dat_ant_count.shape)
dat_ant_count.head()

(40, 235)


Unnamed: 0,Fragment,Mtf_ZNF436,Mtf_ZBTB7A,Mtf_MECP2,Mtf_ZNF354,Mtf_HD/12,Mtf_GCM,Mtf_E2F/1,Mtf_CTCF,Mtf_CREB3/XBP1,...,Mtf_SCRT1,Mtf_EVI1/MECOM,Mtf_SOX/6,Mtf_FOX/6,Mtf_POU/3,Mtf_GMEB2/1,Mtf_ZNF435,Mtf_ZNF410,Mtf_HD/19,Mtf_OCT4+SOX2
0,chrX_48026846_48027198,1.0,2.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48433836_48434001,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48597821_48598014,1.0,0.0,3.0,2.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48737386_48737695,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48958104_48958517,1.0,0.0,2.0,0.0,2.0,1.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
### reshape the nested {fragment: {motif: summed score}} mapping into a wide table:
### one row per fragment, one "Mtf_"-prefixed column per motif, gaps filled with 0
dat = (
    pd.DataFrame.from_dict(dct["score"], orient="index")
    .fillna(0)
    .add_prefix("Mtf_")
    .rename_axis("Fragment")
    .reset_index()
)

### show the table
dat_ant_score = dat
print(dat_ant_score.shape)
dat_ant_score.head()

(40, 235)


Unnamed: 0,Fragment,Mtf_ZNF436,Mtf_ZBTB7A,Mtf_MECP2,Mtf_ZNF354,Mtf_HD/12,Mtf_GCM,Mtf_E2F/1,Mtf_CTCF,Mtf_CREB3/XBP1,...,Mtf_SCRT1,Mtf_EVI1/MECOM,Mtf_SOX/6,Mtf_FOX/6,Mtf_POU/3,Mtf_GMEB2/1,Mtf_ZNF435,Mtf_ZNF410,Mtf_HD/19,Mtf_OCT4+SOX2
0,chrX_48026846_48027198,1.4621,17.1287,18.1788,23.6339,22.2181,7.3411,7.4142,10.39655,8.3657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chrX_48433836_48434001,4.5269,9.3406,0.0,6.9272,0.0,7.7994,0.0,6.8894,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chrX_48597821_48598014,4.8584,0.0,26.4362,15.4675,0.0,0.0,0.0,26.2017,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chrX_48737386_48737695,3.1296,17.5871,9.0894,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chrX_48958104_48958517,2.3888,0.0,17.2287,0.0,17.6774,6.4469,0.0,35.8925,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## BoW and TF-IDF

Reference (introduction to Bag-of-Words and TF-IDF): https://www.analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/

In [21]:
### bag of words: keep only the motif count columns (names starting with 'Mtf'),
### which drops the Fragment id column
lst = list(filter(lambda name: name.startswith('Mtf'), dat_ant_count.columns))
dat = dat_ant_count.loc[:, lst]

###
dat_bow = dat
print(dat_bow.shape)
dat_bow.head()

(40, 234)


Unnamed: 0,Mtf_ZNF436,Mtf_ZBTB7A,Mtf_MECP2,Mtf_ZNF354,Mtf_HD/12,Mtf_GCM,Mtf_E2F/1,Mtf_CTCF,Mtf_CREB3/XBP1,Mtf_FOX/9,...,Mtf_SCRT1,Mtf_EVI1/MECOM,Mtf_SOX/6,Mtf_FOX/6,Mtf_POU/3,Mtf_GMEB2/1,Mtf_ZNF435,Mtf_ZNF410,Mtf_HD/19,Mtf_OCT4+SOX2
0,1.0,2.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,3.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,2.0,0.0,2.0,1.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
### term frequency: divide every row (fragment) by its total motif count so
### each row sums to 1; vectorized instead of a per-row Python apply
### (a row whose counts are all 0 becomes NaN, as 0 / 0)
dat = dat_bow
dat = dat.div(dat.sum(axis=1), axis=0)

###
dat_tf = dat
print(dat.shape)
dat.head()

(40, 234)


Unnamed: 0,Mtf_ZNF436,Mtf_ZBTB7A,Mtf_MECP2,Mtf_ZNF354,Mtf_HD/12,Mtf_GCM,Mtf_E2F/1,Mtf_CTCF,Mtf_CREB3/XBP1,Mtf_FOX/9,...,Mtf_SCRT1,Mtf_EVI1/MECOM,Mtf_SOX/6,Mtf_FOX/6,Mtf_POU/3,Mtf_GMEB2/1,Mtf_ZNF435,Mtf_ZNF410,Mtf_HD/19,Mtf_OCT4+SOX2
0,0.009615,0.019231,0.019231,0.028846,0.028846,0.009615,0.009615,0.009615,0.009615,0.009615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.019608,0.019608,0.0,0.019608,0.0,0.019608,0.0,0.019608,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013158,0.0,0.039474,0.026316,0.0,0.0,0.0,0.052632,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.01087,0.021739,0.01087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.007874,0.0,0.015748,0.0,0.015748,0.007874,0.0,0.031496,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
### inverse document frequency per motif:
###   log(number of fragments / number of fragments where the motif count is non-zero)
### computed column-wise in one vectorized step instead of a per-column apply
dat = dat_bow
dat = np.log(len(dat) / (dat != 0).sum(axis=0))

###
dat_idf = dat
print(dat.shape)
dat.head()

(234,)


Mtf_ZNF436    1.897120
Mtf_ZBTB7A    1.049822
Mtf_MECP2     1.491655
Mtf_ZNF354    1.123930
Mtf_HD/12     1.203973
dtype: float64

In [28]:
### TF-IDF: scale each motif column of the TF table by that motif's IDF weight
### (the IDF Series is aligned to the TF columns by motif name)
dat_tfidf = dat_tf.mul(dat_idf, axis="columns")
dat_tfidf.head()

Unnamed: 0,Mtf_ZNF436,Mtf_ZBTB7A,Mtf_MECP2,Mtf_ZNF354,Mtf_HD/12,Mtf_GCM,Mtf_E2F/1,Mtf_CTCF,Mtf_CREB3/XBP1,Mtf_FOX/9,...,Mtf_SCRT1,Mtf_EVI1/MECOM,Mtf_SOX/6,Mtf_FOX/6,Mtf_POU/3,Mtf_GMEB2/1,Mtf_ZNF435,Mtf_ZNF410,Mtf_HD/19,Mtf_OCT4+SOX2
0,0.018242,0.020189,0.028686,0.032421,0.03473,0.011577,0.019995,0.005321,0.02214,0.024906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.037198,0.020585,0.0,0.022038,0.0,0.023607,0.0,0.010851,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.024962,0.0,0.058881,0.029577,0.0,0.0,0.0,0.029126,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.020621,0.022822,0.016214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.014938,0.0,0.023491,0.0,0.01896,0.00948,0.0,0.017429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
