In [1]:
### basic
import sys
# append the parent directory to the import path (presumably where config_sing.py lives — confirm)
sys.path.append('../')
# NOTE(review): the star import supplies names that later cells use without importing them
# (e.g. FD_RES, np, os) — presumably all defined in config_sing; verify, and prefer explicit imports
from config_sing import *
# prints the environment summary shown in this cell's output
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [3]:
### directory of the ATAC-STARR-seq fragment database, located under FD_RES
fdiry = os.path.join(FD_RES, "KS91_K562_ASTARRseq", 'database')
# bare last expression so the resolved path is displayed as the cell output
fdiry

'/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/database'

In [4]:
!ls '/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/database'

 fragment_astarr_gata1.db  'fragment_astarr_gata1.db?mode=ro'


In [2]:
### specific tools
import itertools as it
import math
import random
import sqlite3
from collections import defaultdict
from contextlib import closing
from functools import reduce
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
# adapt NumPy integers to plain Python ints before sqlite3 binds them (see the link above)
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### file path of fragment database: ATAC-STARR-seq
fdiry = os.path.join(FD_RES, "KS91_K562_ASTARRseq", 'database')
fname = "fragment_astarr_gata1.db"
# FPATH_DB_ASTARR is the name used by FPATHS_DB and by get_frag_astarr's default argument;
# it was never assigned in this notebook, so a fresh kernel failed with NameError.
# NOTE(review): if config_sing also exports FPATH_DB_ASTARR this assignment overrides it —
# confirm that fragment_astarr_gata1.db is the intended database.
FPATH_DB_ASTARR = os.path.join(fdiry, fname)
# previous name of the same path, kept so that cells still referring to it keep working
FPATH_DB_ASTARR_GATA1 = FPATH_DB_ASTARR

### file path of fragment database: TileMPRA
fdiry = os.path.join(FD_RES, "Tewhey_K562_TileMPRA", 'database')
fname = "fragment_tilempra_gata1.db"
FPATH_DB_TMPRA = os.path.join(fdiry, fname)

FPATHS_DB = [FPATH_DB_ASTARR, FPATH_DB_TMPRA]

### samples of ATAC-STARR-seq
# element-wise string concatenation; np.char.add is the public name of np.core.defchararray.add
fun = np.char.add

# six input replicates: Input_rep1 ... Input_rep6
idx = np.arange(1, 6+1).astype("str")
SAMPLES_INP = fun("Input_rep", idx)

# four output replicates: Output_rep1 ... Output_rep4
idx = np.arange(1, 4+1).astype("str")
SAMPLES_OUT = fun("Output_rep", idx)

# SAMPLES and GROUPS are parallel arrays: GROUPS[i] is the group label of SAMPLES[i]
SAMPLES = np.concatenate([SAMPLES_INP, SAMPLES_OUT])
GROUPS  = np.r_[
    np.repeat("Input",  len(SAMPLES_INP)),
    np.repeat("Output", len(SAMPLES_OUT))
]

print(SAMPLES)
print(GROUPS)

['Input_rep1' 'Input_rep2' 'Input_rep3' 'Input_rep4' 'Input_rep5'
 'Input_rep6' 'Output_rep1' 'Output_rep2' 'Output_rep3' 'Output_rep4']
['Input' 'Input' 'Input' 'Input' 'Input' 'Input' 'Output' 'Output'
 'Output' 'Output']


In [None]:
### https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
def iter_sample_fast(iterable, samplesize, rng=None):
    """Draw up to `samplesize` items from an iterable in a single pass.

    Parameters
    ----------
    iterable : iterable
        Source of items; it is consumed once and its length need not be known.
    samplesize : int
        Number of items to keep; must be non-negative.
    rng : object, optional
        Random source providing ``shuffle`` and ``randint`` (e.g. a seeded
        ``random.Random``). Defaults to the global ``random`` module, so
        ``random.seed`` still controls the result when `rng` is omitted.

    Returns
    -------
    list
        `samplesize` items in random order, or every item (shuffled) when the
        iterable holds fewer than `samplesize` items.

    Raises
    ------
    ValueError
        If `samplesize` is negative.
    """
    if samplesize < 0:
        raise ValueError("samplesize must be non-negative")
    if rng is None:
        rng = random

    ### Fill in the first samplesize elements (fewer if the iterable is short)
    iterator = iter(iterable)
    results = list(it.islice(iterator, samplesize))
    rng.shuffle(results)  # Randomize their positions

    ### continue iterating through the elements and update the list;
    ### the loop body never runs when the iterable was already exhausted above
    for i, v in enumerate(iterator, samplesize):
        # i is the 0-based position of v, so v is kept with probability samplesize / (i + 1)
        r = rng.randint(0, i)
        if r < samplesize:
            results[r] = v  # at a decreasing rate, replace random items
    return results

In [None]:
def get_frag_astarr(sample, start=None, end=None, fpath_db = FPATH_DB_ASTARR):
    """Yield the ATAC-STARR-seq fragment counts of one sample from the fragment database.

    Parameters
    ----------
    sample : str
        Value matched against ``Count.sample``.
    start : int, optional
        When given, keep only fragments with ``Fragment.start >= start``.
    end : int, optional
        When given, keep only fragments with ``Fragment.end <= end``.
        `start` and `end` are independent: either one may be given alone.
    fpath_db : str
        Path of the SQLite database file; it is opened read-only.

    Yields
    ------
    tuple
        ``(fragment, sample, pct_gc, count)``, one tuple per matching row.
        Every row is yielded exactly once; rows are NOT repeated by ``count``,
        so a caller that wants count-proportional sampling has to apply the
        weighting itself.
    """
    ### set query: values are bound as parameters instead of being pasted into the SQL text
    conditions = ["Cnt.sample = ?"]
    params     = [sample]
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(start)
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(end)

    # only the fixed condition strings above are interpolated here, never caller-supplied values
    where = " AND ".join(conditions)
    query = f"""
        SELECT Cnt.fragment, Cnt.sample, Frg.pct_gc, Cnt.count
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        WHERE  {where}
        """

    ### open read-only; sqlite3's own context manager only ends the transaction,
    ### so closing() is what actually releases the connection when the generator finishes
    uri = "file:" + fpath_db + "?mode=ro"
    with closing(sqlite3.connect(uri, uri=True)) as conn:
        ### generate fragments: stream rows straight from the cursor
        yield from conn.execute(query, params)
                
def get_frag_tmpra(start=None, end=None, fpath_db = FPATH_DB_TMPRA):
    """Yield the TileMPRA fragment counts and fold changes from the fragment database.

    Parameters
    ----------
    start : int, optional
        When given, keep only fragments with ``Fragment.start >= start``.
    end : int, optional
        When given, keep only fragments with ``Fragment.end <= end``.
        `start` and `end` are independent: either one may be given alone.
    fpath_db : str
        Path of the SQLite database file; it is opened read-only.

    Yields
    ------
    tuple
        ``(fragment, pct_gc, count_input, count_output, log2fc)``, one tuple
        per matching row. Every row is yielded exactly once; rows are NOT
        repeated by fold change, so a caller that wants fold-change-proportional
        sampling has to apply the weighting itself.
    """
    ### set query: values are bound as parameters instead of being pasted into the SQL text
    conditions = []
    params     = []
    if start is not None:
        conditions.append("Frg.start >= ?")
        params.append(start)
    if end is not None:
        conditions.append("Frg.end <= ?")
        params.append(end)

    # no WHERE clause at all when neither bound is given;
    # only the fixed condition strings above are interpolated, never caller-supplied values
    where = ("WHERE  " + " AND ".join(conditions)) if conditions else ""
    query = f"""
        SELECT Cnt.fragment, Frg.pct_gc, Cnt.count_input, Cnt.count_output, Cnt.log2fc
        FROM   Count    Cnt
        JOIN   Fragment Frg
        ON     Cnt.fragment = Frg.fragment
        {where}
        """

    ### open read-only; sqlite3's own context manager only ends the transaction,
    ### so closing() is what actually releases the connection when the generator finishes
    uri = "file:" + fpath_db + "?mode=ro"
    with closing(sqlite3.connect(uri, uri=True)) as conn:
        ### generate fragments: stream rows straight from the cursor
        yield from conn.execute(query, params)