**Set environment**

In [1]:
### set environment
import sys
sys.path.append('../config')
from config_sing import *
show_env()

### import more tools
import csv
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc

Library imported:
    numpy, pandas, itertools,
    os, sys, time, gzip, glob,
    functools.partial/reduce,
    collections.Counter,
    matplotlib.pyplot



In [2]:
#PREFIX = "KS91_K562_ASTARRseq"
PREFIX  = "A001_K562_WSTARRseq"
#PREFIX = "Tewhey_K562_TileMPRA"

In [3]:
def gen_lines(file, n_lines=10, remove_header=False):
    """generate lines lines from the file"""
    ### read lines using csv reader
    lines = csv.reader(file, delimiter='\t', quotechar='|')
    
    ### remove file header if needed
    if remove_header:
        next(lines)
    
    ### specify number of lines to generate
    lines = it.islice(lines, 0, n_lines)
    for line in lines:
        yield line

## Sample
```
query_table = ("""CREATE TABLE IF NOT EXISTS Sample(
    sample    TEXT PRIMARY KEY, 
    treatment TEXT,
    replicate TEXT,
    size      INTEGER
);""")

query_insert = ("""
    INSERT OR IGNORE INTO Sample
        (sample, treatment, replicate, size) 
    VALUES 
        (?,?,?,?)
    """)
```

In [4]:
fdiry = os.path.join(FD_RES, PREFIX, "coverage", "library_size")
fname = "library_size_summary.csv"
fpath = os.path.join(fdiry, fname)

dat_sample = pd.read_table(fpath, sep=",")
print(dat_sample.dtypes)
dat_sample

Sample    object
Group     object
Rep       object
Count      int64
Fpath     object
dtype: object


Unnamed: 0,Sample,Group,Rep,Count,Fpath
0,Input_rep1,Input,rep1,26908970,A001-input-K562-rep1.masked.dedup.fragments.co...
1,Input_rep2,Input,rep2,99899775,A001-input-K562-rep2.masked.dedup.fragments.co...
2,Input_rep3,Input,rep3,105623984,A001-input-K562-rep3.masked.dedup.fragments.co...
3,Input_rep4,Input,rep4,108635002,A001-input-K562-rep4.masked.dedup.fragments.co...
4,Output_rep1,Output,rep1,160349140,A001-K562-rep1.masked.dedup.fragments.counts.t...
5,Output_rep2,Output,rep2,157326312,A001-K562-rep2.masked.dedup.fragments.counts.t...
6,Output_rep3,Output,rep3,328185275,A001-K562-rep3.masked.dedup.fragments.counts.t...


## Fragments (NUC)
```
query_reset_table = "DROP TABLE IF EXISTS Fragment"
query_table = ("""
    CREATE TABLE IF NOT EXISTS Fragment(
        fragment TEXT PRIMARY KEY, 
        chrom    TEXT,
        start    INTEGER,
        end      INTEGER,
        pct_at   REAL,
        pct_gc   REAL,
        num_A    INTEGER,
        num_C    INTEGER,
        num_G    INTEGER,
        num_T    INTEGER,
        num_N    INTEGER,
        num_oth  INTEGER
    );""")
    
query_insert = ("""
    INSERT OR IGNORE INTO Fragment
        (fragment, chrom, start, end, pct_at, pct_gc,
         num_A, num_C, num_G, num_T, num_N, num_oth) 
    VALUES 
        (?,?,?,?,?,?,?,?,?,?,?,?)
    """)
```

In [5]:
%%bash
source ../config/config_sing.sh
PREFIX="A001_K562_WSTARRseq"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"
FOLDER="fragment_nuc"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment_nuc/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed.gz
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_pct_gc	9_num_A	10_num_C	11_num_G	12_num_T	13_num_N	14_num_oth	15_seq_len
chrX	47787533	47787772	chrX_47787533_47787772	1	.	0.615063	0.384937	48	43	49	99	0	0	239
chrX	47787569	47787735	chrX_47787569_47787735	1	.	0.638554	0.361446	38	25	35	68	0	0	166
chrX	47787714	47788125	chrX_47787714_47788125	1	.	0.569343	0.430657	72	88	89	162	0	0	411
chrX	47787767	47788062	chrX_47787767_47788062	1	.	0.566102	0.433898	53	59	69	114	0	0	295
chrX	47788341	47788759	chrX_47788341_47788759	1	.	0.581340	0.418660	69	104	71	174	0	0	418
chrX	47788406	47788749	chrX_47788406_47788749	1	.	0.565598	0.434402	60	87	62	134	0	0	343
chrX	47788733	47789143	chrX_47788733_47789143	1	.	0.490244	0.509756	64	95	114	137	0	0	410
chrX	47788882	47789418	chrX_47788882_47789418	1	.	0.421642	0.578358	94	152	158	132	0	0	536
chrX	47788

In [6]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, *line[0:3], *line[6:14]]
    return val

In [7]:
FOLDER="fragment_nuc"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment_nuc/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['#1_usercol', '2_usercol', '3_usercol', '4_usercol', '5_usercol', '6_usercol', '7_pct_at', '8_pct_gc', '9_num_A', '10_num_C', '11_num_G', '12_num_T', '13_num_N', '14_num_oth', '15_seq_len']
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', '0.615063', '0.384937', '48', '43', '49', '99', '0', '0', '239']
['chrX', '47787569', '47787735', 'chrX_47787569_47787735', '1', '.', '0.638554', '0.361446', '38', '25', '35', '68', '0', '0', '166']
['chrX', '47787714', '47788125', 'chrX_47787714_47788125', '1', '.', '0.569343', '0.430657', '72', '88', '89', '162', '0', '0', '411']
['chrX', '47787767', '47788062', 'chrX_47787767_47788062', '1', '.', '0.566102', '0.433898', '53', '59', '69', '114', '0', '0', '295']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47787533_47787772', 'chrX

## Counts

In [8]:
%%bash
source ../config/config_sing.sh 
PREFIX="A001_K562_WSTARRseq"
FOLDER="fragment"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
head ${FPATH}

/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed
chrX	47787533	47787772	chrX_47787533_47787772	1	.
chrX	47787569	47787735	chrX_47787569_47787735	1	.
chrX	47787714	47788125	chrX_47787714_47788125	1	.
chrX	47787767	47788062	chrX_47787767_47788062	1	.
chrX	47788341	47788759	chrX_47788341_47788759	1	.
chrX	47788406	47788749	chrX_47788406_47788749	1	.
chrX	47788733	47789143	chrX_47788733_47789143	1	.
chrX	47788882	47789418	chrX_47788882_47789418	1	.
chrX	47788984	47789338	chrX_47788984_47789338	1	.
chrX	47789379	47790004	chrX_47789379_47790004	1	.


In [9]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, *line[0:3], line[4]]
    return val

In [10]:
FOLDER="fragment"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.']
['chrX', '47787569', '47787735', 'chrX_47787569_47787735', '1', '.']
['chrX', '47787714', '47788125', 'chrX_47787714_47788125', '1', '.']
['chrX', '47787767', '47788062', 'chrX_47787767_47788062', '1', '.']
['chrX', '47788341', '47788759', 'chrX_47788341_47788759', '1', '.']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47787569_47787735', 'chrX', '47787569', '47787735', '1']
['chrX_47787714_47788125', 'chrX', '47787714', '47788125', '1']
['chrX_47787767_47788062', 'chrX', '47787767', '47788062', '1']
['chrX_47788341_47788759', 'chrX', '47788341', '47788759', '1']
['chrX_47788406_47788749', 'chrX', '47788406', '47788749', '1']


## Motif
```
query_reset_table = "DROP TABLE IF EXISTS Motif"
query_table = """
    CREATE TABLE IF NOT EXISTS Motif(
        binding TEXT PRIMARY KEY, 
        chrom   TEXT,
        start   INTEGER,
        end     INTEGER,
        motif   TEXT,
        score   REAL
    );"""
query_insert = """
    INSERT OR IGNORE INTO Motif 
        (binding,chrom,start,end,motif,score)
    VALUES 
        (?,?,?,?,?,?)
    """
```

In [11]:
%%bash
source ../config/config_sing.sh 
CHROM="chrX"
FDIRY="${FD_ANN}/motif_cluster_jvierstra/hg38_archetype_motifs_v1"
FNAME="${CHROM}_rm_mouse_merge.bed.gz"
FPATH="${FDIRY}/${FNAME}"
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
chrX	10006	10041	KLF/SP/2	3.257775
chrX	10018	10038	GC-tract	6.314
chrX	10025	10039	NR/3	7.0957
chrX	10035	10046	PRDM1	8.1562
chrX	10046	10061	MAF	7.1564
chrX	10047	10057	NFY	7.6173
chrX	10053	10086	KLF/SP/2	7.89956
chrX	10055	10073	TBX/3	7.6294
chrX	10055	10066	ETS/2	10.5061
chrX	10056	10078	GC-tract	7.8744


In [12]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    key = f"{line[0]}_{line[1]}_{line[2]}_{line[3]}"
    val = [key, *line]
    return val

In [13]:
CHROM="chrX"
fpath  = os.path.join(
    FD_ANN, 
    "motif_cluster_jvierstra", 
    "hg38_archetype_motifs_v1", 
    f"{CHROM}_rm_mouse_merge.bed.gz")
print(fpath)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '10006', '10041', 'KLF/SP/2', '3.257775']
['chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX', '10046', '10061', 'MAF', '7.1564']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_10018_10038_GC-tract', 'chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX_10025_10039_NR/3', 'chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX_10035_10046_PRDM1', 'chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX_10046_10061_MAF', 'chrX', '10046', '10061', 'MAF', '7.1564']
['chrX_10047_10057_NFY', 'chrX', '10047', '10057', 'NFY', '7.6173']


## Annotation
```
query_reset_table = "DROP TABLE IF EXISTS Annotation"

query_table = """
    CREATE TABLE IF NOT EXISTS Annotation (
        fragment TEXT, 
        binding  TEXT,
        FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
        FOREIGN KEY (binding)  REFERENCES Motif    (binding),
        UNIQUE (fragment, binding) ON CONFLICT IGNORE
    );"""

query_insert = """
    INSERT OR IGNORE INTO Annotation
        (fragment, binding)
    VALUES 
        (?,?)
    """
```

In [14]:
%%bash
source ../config/config_sing.sh
PREFIX="A001_K562_WSTARRseq"
FOLDER="annotation"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/annotation/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed.gz
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787544	47787554	MEF2	9.2122	10
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787566	47787580	OCT4+SOX2	7.768	14
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787574	47787589	ZNF528	5.9885	15
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787580	47787601	ZNF490	17.1545	21
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787597	47787611	GLIS	2.9258	14
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787597	47787610	ZIC	7.0157	13
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787606	47787618	NR/4	5.1724	12
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787612	47787624	NR/14	7.7273	12
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787629	47787642	ZSCAN4	4.9527	13
chrX	47787533	47787772	chrX_47787533_47787772	1	.	chrX	47787644	4778

In [15]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    fragment = f"{line[0]}_{line[1]}_{line[2]}"
    motif    = f"{line[6]}_{line[7]}_{line[8]}_{line[9]}"
    return fragment, motif

In [17]:
FOLDER="annotation"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/annotation/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', 'chrX', '47787544', '47787554', 'MEF2', '9.2122', '10']
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', 'chrX', '47787566', '47787580', 'OCT4+SOX2', '7.768', '14']
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', 'chrX', '47787574', '47787589', 'ZNF528', '5.9885', '15']
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', 'chrX', '47787580', '47787601', 'ZNF490', '17.1545', '21']
['chrX', '47787533', '47787772', 'chrX_47787533_47787772', '1', '.', 'chrX', '47787597', '47787611', 'GLIS', '2.9258', '14']
+++++++++++++++++++++++++++++++++++++++++++++++++++
('chrX_47787533_47787772', 'chrX_47787566_47787580_OCT4+SOX2')
('chrX_47787533_47787772', 'chrX_47787574_47787589_ZNF528')
('chrX

## Coverage
```
query_reset_table = "DROP TABLE IF EXISTS Coverage"

query_table = """CREATE TABLE IF NOT EXISTS Coverage(
    chrom    TEXT,
    location INTEGER,
    depth    INTEGER,
    sample   TEXT,
    FOREIGN KEY (sample) REFERENCES Sample (sample)
    );"""

query_insert = ("""
    INSERT OR IGNORE INTO Coverage
        (chrom, location, depth, sample)
    VALUES 
        (?,?,?,?)""")
```

In [18]:
%%bash
source ../config/config_sing.sh
PREFIX="A001_K562_WSTARRseq"
FOLDER="coverage"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz /mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.total_count.tsv
chrX	47786880	47786881	.
chrX	47786881	47786882	.
chrX	47786882	47786883	.
chrX	47786883	47786884	.
chrX	47786884	47786885	.
chrX	47786885	47786886	.
chrX	47786886	47786887	.
chrX	47786887	47786888	.
chrX	47786888	47786889	.
chrX	47786889	47786890	.


In [19]:
### helper function to process each row
def prep_line(line, sample):
    """Function to process each line"""
    return [*line, SAMPLE]

In [20]:
FOLDER="coverage"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*perbase*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = partial(prep_line, sample=SAMPLE)
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786880', '47786881', '.']
['chrX', '47786881', '47786882', '.']
['chrX', '47786882', '47786883', '.']
['chrX', '47786883', '47786884', '.']
['chrX', '47786884', '47786885', '.']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786881', '47786882', '.', 'Input_rep1']
['chrX', '47786882', '47786883', '.', 'Input_rep1']
['chrX', '47786883', '47786884', '.', 'Input_rep1']
['chrX', '47786884', '47786885', '.', 'Input_rep1']
['chrX', '47786885', '47786886', '.', 'Input_rep1']
