In [1]:
import sys
# Add the parent directory to the module search path
# (presumably where `config_sing` lives — confirm against the repo layout).
sys.path.append('../')
# NOTE(review): star import. Later cells use `np`, `plt`, `os`, `gzip` and `FD_RES`
# without importing them, so they presumably come from here; explicit imports
# would make those dependencies visible.
from config_sing import *
# Print the environment summary (paths and imported libraries) shown in the cell output.
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [2]:
import sqlite3
import itertools as it
from collections import defaultdict
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import make_axes_locatable
# NOTE(review): Matplotlib 3.6 renamed the bundled seaborn styles to
# "seaborn-v0_8-<name>" and later releases dropped the old names, so this
# call depends on the Matplotlib version in the container — confirm before upgrading.
plt.style.use("seaborn-whitegrid")

import math
import random
import json
from sklearn.manifold import TSNE
from functools import partial, reduce
# Rebind `print` for the whole notebook so every call flushes stdout immediately.
print = partial(print, flush=True)

### Set Samples
### Sample names are built by element-wise string concatenation, e.g.
### "TFX" + ["2", ..., "5"] + "_Dex" -> ["TFX2_Dex", ..., "TFX5_Dex"].
### `np.char.add` is the public name of this function; the former
### `np.core.defchararray.add` goes through `np.core`, which NumPy 2.0 deprecated.
fun = np.char.add
idx = np.arange(1,6).astype("str")

### Input samples use replicates 1-5; TFX samples use replicates 2-5 (idx[1:])
INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])

### SAMPLES: 20x inputs + both TFX conditions; SAMPLES_OUT: the TFX conditions only
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])
SAMPLES_OUT = np.concatenate([TFX_DMSO, TFX_DEX])
GROUPS   = ["Input", "Input_20x", "TFX_DMSO", "TFX_Dex"]

### location of the SQLite database queried by the annotation cells
fname = "fragment_chr17.db"
fdiry = os.path.join(FD_RES, 'database')
FPATH_DB = os.path.join(fdiry, fname)

## Test: streaming fragments

In [3]:
def prep_line(line):
    """Parse one raw record of the fragment/region overlap file.

    `line` is a bytes object holding ASCII, tab-separated fields in the
    column order listed in `columns` below. Only the fragment and region
    coordinates are used; the remaining fields are ignored.

    Returns a tuple `(fragment, region)` where each element is the
    identifier "<chrom>_<start>_<end>".
    """
    columns = (
        "Frag_Chrom", "Frag_Start", "Frag_End", "Frag_Count",
        "Region_Chrom", "Region_Start", "Region_End",
        "Region_Name", "Region_Score", "Region_Strand",
        "Overlap",
    )
    values = line.decode('ASCII').strip().split('\t')
    record = dict(zip(columns, values))

    ### build the "<chrom>_<start>_<end>" identifiers
    frag_keys   = ("Frag_Chrom",   "Frag_Start",   "Frag_End")
    region_keys = ("Region_Chrom", "Region_Start", "Region_End")
    fragment = "_".join([record[key] for key in frag_keys])
    region   = "_".join([record[key] for key in region_keys])
    return fragment, region

### helper: lazily split an iterable into consecutive fixed-size lists
def get_chunks(gen, rows=10):
    """Yield successive lists of up to `rows` items taken from `gen`.

    Items are pulled lazily, so `gen` may be a file object or any other
    iterable. The last list may be shorter than `rows`; nothing is
    yielded for an empty input.
    """
    source = iter(gen)

    def take():
        return list(it.islice(source, rows))

    ### call `take` repeatedly until it returns an empty list
    for batch in iter(take, []):
        yield batch

In [4]:
### fragment/region overlap file of a single sample (TFX2_DMSO)
fdiry = os.path.join(FD_RES, "count_fragment", "TFX2_DMSO")
fname = "region_dex_GR_P300_dnase_chr17.bed.gz"
fpath = os.path.join(fdiry, fname)
print(fpath)

### read only the first chunk (get_chunks default: 10 lines)
frgs = []
with gzip.open(fpath, 'rb') as file:
    chunk = next(get_chunks(file))

### show each raw line and keep its fragment id (the region id is not needed here)
for line in chunk:
    print(line)
    frgs.append(prep_line(line)[0])

frgs

/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
b'chr17\t6001570\t6002624\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001571\t6002624\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001762\t6002691\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t130\n'
b'chr17\t6001762\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001763\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001765\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001969\t6002895\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t334\n'
b'chr17\t6001969\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6001970\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6002054\t6002943\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t382\

['chr17_6001570_6002624',
 'chr17_6001571_6002624',
 'chr17_6001762_6002691',
 'chr17_6001762_6002692',
 'chr17_6001763_6002692',
 'chr17_6001765_6002692',
 'chr17_6001969_6002895',
 'chr17_6001969_6002896',
 'chr17_6001970_6002896',
 'chr17_6002054_6002943']

## Annotation: motif

In [6]:
def get_annot(frgs, fpath_db=FPATH_DB):
    """Look up motif annotations for a batch of fragments.

    Joins the `Annotation` and `Motif` tables on `binding` and, for every
    (fragment, motif) pair among the returned rows, counts the rows and
    sums their scores.

    Parameters
    ----------
    frgs : iterable of str
        Fragment identifiers matched against `Annotation.Fragment`.
        One `?` placeholder is generated per element; SQLite caps the
        number of placeholders per statement, so very large batches
        should be split by the caller.
    fpath_db : str
        Path of the SQLite database file.

    Returns
    -------
    dict
        `{"count": {fragment: {motif: n_rows}},
          "score": {fragment: {motif: summed_score}}}`
        Both values are nested `defaultdict`s (reading a missing key
        inserts a zero entry). Fragments with no matching row are absent.
    """
    ### materialize once: the input is needed for both the placeholders
    ### and the bound parameters (a generator would be exhausted otherwise)
    frgs = list(frgs)

    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """

    ### summarize the motif annotation scores
    dct_ann_count = defaultdict(lambda: defaultdict(int))
    dct_ann_score = defaultdict(lambda: defaultdict(float))

    ### query out from database
    ### `with sqlite3.connect(...)` only manages the transaction and leaves
    ### the connection open, so close it explicitly; the rows are consumed
    ### while the connection is still open.
    conn = sqlite3.connect(fpath_db)
    try:
        for frg, mtf, val in conn.execute(query, frgs):
            ### count and sum the annotation scores
            dct_ann_count[frg][mtf] += 1
            dct_ann_score[frg][mtf] += val
    finally:
        conn.close()

    ### arrange and return
    return {"count": dct_ann_count, "score": dct_ann_score}

In [10]:
### annotate the fragments currently held in `frgs` (database: get_annot's default, FPATH_DB)
dct_ann = get_annot(frgs)

In [16]:
### peek at the motif counts of the first annotated fragment only
dct_ann_count = dct_ann["count"]
first = next(iter(dct_ann_count.items()), None)
if first is not None:
    k, v = first
    print(k)
    ### convert to a plain dict so the defaultdict wrapper is not printed
    print(dict(v))

chr17_6001570_6002624
{'HD/23': 1, 'SOX/1': 2, 'EVI1/MECOM': 1, 'ZNF768': 1, 'NFY': 1, 'EBF1': 1, 'GC-tract': 8, 'ZNF320': 2, 'CTCF': 2, 'KLF/SP/2': 9, 'INSM1': 1, 'ETS/2': 4, 'ZBTB48': 1, 'Ebox/CAGATGG': 2, 'HEN1': 4, 'OSR2': 2, 'Ebox/CAGCTG': 3, 'NR/3': 5, 'NR/15': 2, 'KLF/SP/1': 3, 'GLI': 1, 'ZNF53': 2, 'TFAP2/1': 4, 'NR/16': 2, 'PRDM14': 1, 'ZIC': 1, 'Ebox/CACCTG': 1, 'IRF/2': 4, 'ZIM3': 1, 'ZNF354': 2, 'YY1': 3, 'HD/14': 2, 'RFX/1': 2, 'ZNF549': 2, 'FEZF1': 2, 'GCM': 1, 'SIX/1': 1, 'HIC/1': 3, 'P53-like/1': 2, 'ZNF449': 2, 'ZNF143': 3, 'NR/17': 5, 'TBX/4': 2, 'AP1/1': 2, 'SMAD': 4, 'PAX/2': 2, 'MAF': 1, 'HD/20': 3, 'HD/22': 4, 'TBX/3': 2, 'ZFX': 3, 'PLAG1': 1, 'ZNF324': 2, 'AP1/2': 2, 'ZNF554': 3, 'NR/18': 1, 'NFKB/2': 1, 'Ebox/CACGTG/1': 3, 'SREBF1': 2, 'GRHL': 1, 'ZFN121': 2, 'NR/19': 2, 'MEF2': 1, 'FOX/5': 1, 'HD/2': 3, 'EGR': 1, 'REST/NRSF': 1, 'ZNF335': 1, 'PAX/1': 1, 'POU/3': 1, 'ZNF257': 1, 'E2F/2': 3, 'MFZ1': 2, 'HINFP1/1': 1, 'GFI': 1, 'ZNF146': 1, 'E2F/1': 1, 'CCAAT/CEBP

In [31]:
%%time

### For every output sample: stream its fragment file in chunks, annotate
### each chunk with motifs and write one line per fragment to two files
### ("<fragment>\t<JSON dict of motif -> count>" and the same for scores).
### NOTE(review): a fragment that occurs on several input lines which fall
### into different chunks is written once per chunk — confirm that
### downstream readers tolerate repeated fragment keys.
for sample in SAMPLES_OUT:

    ### input file
    fdiry = os.path.join(FD_RES, "count_fragment", sample)
    fname = "region_dex_GR_P300_dnase_chr17.bed.gz"
    fpath_inp = os.path.join(fdiry, fname)

    ### output files (count and score share one directory)
    fdiry = os.path.join(FD_RES, "scratch", "region_dex_GR_P300_dnase_chr17")
    ### open(..., 'w') does not create missing directories
    os.makedirs(fdiry, exist_ok=True)

    fname = f"{sample}_motif_count.tsv"
    fpath_out_cnt = os.path.join(fdiry, fname)

    fname = f"{sample}_motif_score.tsv"
    fpath_out_scr = os.path.join(fdiry, fname)

    ### show file IO
    print(fpath_inp)
    print(fpath_out_cnt)
    print(fpath_out_scr)

    with gzip.open(fpath_inp, 'rb') as finp, \
         open(fpath_out_cnt, 'w') as fout_cnt, \
         open(fpath_out_scr, 'w') as fout_scr:

        ### loop through each chunk (get_chunks default: 10 lines)
        for chunk in get_chunks(finp):
            ### get fragments (the region id returned by prep_line is not used)
            frgs = [prep_line(raw)[0] for raw in chunk]

            ### get motif from database
            dct_ann = get_annot(frgs)
            dct_ann_count = dct_ann["count"]
            dct_ann_score = dct_ann["score"]

            ### output count
            for key, val in dct_ann_count.items():
                fout_cnt.write("\t".join([key, json.dumps(val)]) + "\n")

            ### output score, rounded to 5 decimals
            for key, val in dct_ann_score.items():
                val = {k: np.round(v, decimals=5) for k, v in val.items()}
                fout_scr.write("\t".join([key, json.dumps(val)]) + "\n")

        ### blank line between samples in the log
        print()

/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX2_DMSO_motif_count.tsv
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX2_DMSO_motif_score.tsv

/mount/work/out/proj_combeffect/count_fragment/TFX3_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX3_DMSO_motif_count.tsv
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX3_DMSO_motif_score.tsv

/mount/work/out/proj_combeffect/count_fragment/TFX4_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX4_DMSO_motif_count.tsv
/mount/work/out/proj_combeffect/scratch/region_dex_GR_P300_dnase_chr17/TFX4_DMSO_motif_score.tsv

/mount/work/out/proj_combeffect/count_fragment/TFX5_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
/mount/work/out/proj_combeffect/scr