**Set environment**

In [1]:
### set environment
import sys
sys.path.append('../config')
from config_sing import *
show_env()

### import more tools
import csv
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc

Library imported:
    numpy, pandas, itertools,
    os, sys, time, gzip, glob,
    functools.partial/reduce,
    collections.Counter,
    matplotlib.pyplot



In [2]:
#PREFIX = "KS91_K562_ASTARRseq"
#PREFIX = "A001_K562_WSTARRseq"
PREFIX  = "Tewhey_K562_TileMPRA"

In [3]:
def gen_lines(file, n_lines=10, remove_header=False):
    """generate lines lines from the file"""
    ### read lines using csv reader
    lines = csv.reader(file, delimiter='\t', quotechar='|')
    
    ### remove file header if needed
    if remove_header:
        next(lines)
    
    ### specify number of lines to generate
    lines = it.islice(lines, 0, n_lines)
    for line in lines:
        yield line

## Sample
```
query_table = ("""CREATE TABLE IF NOT EXISTS Sample(
    sample    TEXT PRIMARY KEY, 
    treatment TEXT,
    replicate TEXT,
    size      INTEGER
);""")

query_insert = ("""
    INSERT OR IGNORE INTO Sample
        (sample, treatment, replicate, size) 
    VALUES 
        (?,?,?,?)
    """)
```

In [6]:
fdiry = os.path.join(FD_RES, PREFIX, "coverage", "library_size")
fname = "region_total_counts.tsv"
fpath = os.path.join(fdiry, fname)

dat_sample = pd.read_table(fpath, sep="\t")
print(dat_sample.dtypes)
dat_sample

Region     object
Strand     object
Group      object
Size      float64
dtype: object


Unnamed: 0,Region,Strand,Group,Size
0,GATA1,stranded_pos,Input,26777810.0
1,GATA1,stranded_pos,Output,91437830.0
2,MYC,stranded_pos,Input,22526030.0
3,MYC,stranded_pos,Output,42384060.0
4,FADS,stranded_pos,Input,6408832.0
5,FADS,stranded_pos,Output,21584870.0
6,FADS,stranded_neg,Input,6318496.0
7,FADS,stranded_neg,Output,20165760.0


## Fragments (NUC)
```
query_reset_table = "DROP TABLE IF EXISTS Fragment"
query_table = ("""
    CREATE TABLE IF NOT EXISTS Fragment(
        fragment TEXT PRIMARY KEY, 
        chrom    TEXT,
        start    INTEGER,
        end      INTEGER,
        pct_at   REAL,
        pct_gc   REAL,
        num_A    INTEGER,
        num_C    INTEGER,
        num_G    INTEGER,
        num_T    INTEGER,
        num_N    INTEGER,
        num_oth  INTEGER
    );""")
    
query_insert = ("""
    INSERT OR IGNORE INTO Fragment
        (fragment, chrom, start, end, pct_at, pct_gc,
         num_A, num_C, num_G, num_T, num_N, num_oth) 
    VALUES 
        (?,?,?,?,?,?,?,?,?,?,?,?)
    """)
```

In [8]:
%%bash
source ../config/config_sing.sh
PREFIX="Tewhey_K562_TileMPRA"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"
FOLDER="fragment_nuc"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_pct_gc	9_num_A	10_num_C	11_num_G	12_num_T	13_num_N	14_num_oth	15_seq_len
chrX	47786401	47786600	X:47786401-47786600	.	+	0.668342	0.331658	33	27	39	100	0	0	199
chrX	47786451	47786650	X:47786451-47786650	.	+	0.658291	0.341709	36	30	38	95	0	0	199
chrX	47786501	47786700	X:47786501-47786700	.	+	0.673367	0.326633	34	35	30	100	0	0	199
chrX	47786551	47786750	X:47786551-47786750	.	+	0.663317	0.336683	29	38	29	103	0	0	199
chrX	47786601	47786800	X:47786601-47786800	.	+	0.653266	0.346734	30	35	34	100	0	0	199
chrX	47786651	47786850	X:47786651-47786850	.	+	0.633166	0.366834	27	38	35	99	0	0	199
chrX	47786701	47786900	X:47786701-47786900	.	+	0.613065	0.386935	32	36	41	90	0	0	199
chrX	47786751	47786950	X:47786751-47786950	.	+	0.603015	0.396985	38	39	40	82	0	0	199
chrX	47786801	47787000	X:47786801-47787000	.

In [9]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, *line[0:3], *line[6:14]]
    return val

In [10]:
FOLDER="fragment_nuc"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['#1_usercol', '2_usercol', '3_usercol', '4_usercol', '5_usercol', '6_usercol', '7_pct_at', '8_pct_gc', '9_num_A', '10_num_C', '11_num_G', '12_num_T', '13_num_N', '14_num_oth', '15_seq_len']
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', '0.668342', '0.331658', '33', '27', '39', '100', '0', '0', '199']
['chrX', '47786451', '47786650', 'X:47786451-47786650', '.', '+', '0.658291', '0.341709', '36', '30', '38', '95', '0', '0', '199']
['chrX', '47786501', '47786700', 'X:47786501-47786700', '.', '+', '0.673367', '0.326633', '34', '35', '30', '100', '0', '0', '199']
['chrX', '47786551', '47786750', 'X:47786551-47786750', '.', '+', '0.663317', '0.336683', '29', '38', '29', '103', '0', '0', '199']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47786401_47786600', 'chrX', '47786401', 

## Counts

In [11]:
%%bash
source ../config/config_sing.sh 
PREFIX="Tewhey_K562_TileMPRA"
FOLDER="fragment"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${REGION}*${STRAND}*)
echo ${FPATH}
head ${FPATH}

/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed
chrX	47786401	47786600	X:47786401-47786600	.	+
chrX	47786451	47786650	X:47786451-47786650	.	+
chrX	47786501	47786700	X:47786501-47786700	.	+
chrX	47786551	47786750	X:47786551-47786750	.	+
chrX	47786601	47786800	X:47786601-47786800	.	+
chrX	47786651	47786850	X:47786651-47786850	.	+
chrX	47786701	47786900	X:47786701-47786900	.	+
chrX	47786751	47786950	X:47786751-47786950	.	+
chrX	47786801	47787000	X:47786801-47787000	.	+
chrX	47786851	47787050	X:47786851-47787050	.	+


In [9]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, *line[0:3], line[4]]
    return val

In [12]:
FOLDER="fragment"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+']
['chrX', '47786451', '47786650', 'X:47786451-47786650', '.', '+']
['chrX', '47786501', '47786700', 'X:47786501-47786700', '.', '+']
['chrX', '47786551', '47786750', 'X:47786551-47786750', '.', '+']
['chrX', '47786601', '47786800', 'X:47786601-47786800', '.', '+']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47786451_47786650', 'chrX', '47786451', '47786650']
['chrX_47786501_47786700', 'chrX', '47786501', '47786700']
['chrX_47786551_47786750', 'chrX', '47786551', '47786750']
['chrX_47786601_47786800', 'chrX', '47786601', '47786800']
['chrX_47786651_47786850', 'chrX', '47786651', '47786850']


## Motif
```
query_reset_table = "DROP TABLE IF EXISTS Motif"
query_table = """
    CREATE TABLE IF NOT EXISTS Motif(
        binding TEXT PRIMARY KEY, 
        chrom   TEXT,
        start   INTEGER,
        end     INTEGER,
        motif   TEXT,
        score   REAL
    );"""
query_insert = """
    INSERT OR IGNORE INTO Motif 
        (binding,chrom,start,end,motif,score)
    VALUES 
        (?,?,?,?,?,?)
    """
```

In [13]:
%%bash
source ../config/config_sing.sh 
CHROM="chrX"
FDIRY="${FD_ANN}/motif_cluster_jvierstra/hg38_archetype_motifs_v1"
FNAME="${CHROM}_rm_mouse_merge.bed.gz"
FPATH="${FDIRY}/${FNAME}"
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
chrX	10006	10041	KLF/SP/2	3.257775
chrX	10018	10038	GC-tract	6.314
chrX	10025	10039	NR/3	7.0957
chrX	10035	10046	PRDM1	8.1562
chrX	10046	10061	MAF	7.1564
chrX	10047	10057	NFY	7.6173
chrX	10053	10086	KLF/SP/2	7.89956
chrX	10055	10073	TBX/3	7.6294
chrX	10055	10066	ETS/2	10.5061
chrX	10056	10078	GC-tract	7.8744


In [14]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    key = f"{line[0]}_{line[1]}_{line[2]}_{line[3]}"
    val = [key, *line]
    return val

In [15]:
CHROM="chrX"
fpath  = os.path.join(
    FD_ANN, 
    "motif_cluster_jvierstra", 
    "hg38_archetype_motifs_v1", 
    f"{CHROM}_rm_mouse_merge.bed.gz")
print(fpath)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '10006', '10041', 'KLF/SP/2', '3.257775']
['chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX', '10046', '10061', 'MAF', '7.1564']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_10018_10038_GC-tract', 'chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX_10025_10039_NR/3', 'chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX_10035_10046_PRDM1', 'chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX_10046_10061_MAF', 'chrX', '10046', '10061', 'MAF', '7.1564']
['chrX_10047_10057_NFY', 'chrX', '10047', '10057', 'NFY', '7.6173']


## Annotation
```
query_reset_table = "DROP TABLE IF EXISTS Annotation"

query_table = """
    CREATE TABLE IF NOT EXISTS Annotation (
        fragment TEXT, 
        binding  TEXT,
        FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
        FOREIGN KEY (binding)  REFERENCES Motif    (binding),
        UNIQUE (fragment, binding) ON CONFLICT IGNORE
    );"""

query_insert = """
    INSERT OR IGNORE INTO Annotation
        (fragment, binding)
    VALUES 
        (?,?)
    """
```

In [24]:
%%bash
source ../config/config_sing.sh
PREFIX="Tewhey_K562_TileMPRA"
FOLDER="annotation"
REGION="GATA1"
#SAMPLE="Merged"
STRAND="stranded_pos"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/annotation/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786401	47786418	KLF/SP/2	6.7001	17
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786424	47786443	ZNF28	8.2497	19
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786449	47786459	FOX/4	8.4743	10
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786460	47786481	ZNF382	7.8726	21
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786461	47786483	ZNF136	2.6078	22
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786479	47786501	ZNF41	7.1286	22
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786487	47786499	IRF/1	5.3024	12
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786499	47786513	RUNX/2	8.9312	14
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786499	47786518	ZNF320	5.6607	19
chrX	47786401	47786600	X:47786401-47786600	.	+	chrX	47786500	47786521	ZNF382	16.7936	21


In [17]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    fragment = f"{line[0]}_{line[1]}_{line[2]}"
    motif    = f"{line[6]}_{line[7]}_{line[8]}_{line[9]}"
    return fragment, motif

In [18]:
FOLDER="annotation"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/annotation/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', 'chrX', '47786401', '47786418', 'KLF/SP/2', '6.7001', '17']
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', 'chrX', '47786424', '47786443', 'ZNF28', '8.2497', '19']
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', 'chrX', '47786449', '47786459', 'FOX/4', '8.4743', '10']
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', 'chrX', '47786460', '47786481', 'ZNF382', '7.8726', '21']
['chrX', '47786401', '47786600', 'X:47786401-47786600', '.', '+', 'chrX', '47786461', '47786483', 'ZNF136', '2.6078', '22']
+++++++++++++++++++++++++++++++++++++++++++++++++++
('chrX_47786401_47786600', 'chrX_47786424_47786443_ZNF28')
('chrX_47786401_47786600', 'chrX_47786449_47786459_FOX/4')
('chrX_47786401_47786600', 'c

## Coverage
```
query_reset_table = "DROP TABLE IF EXISTS Coverage"

query_table = """CREATE TABLE IF NOT EXISTS Coverage(
    chrom    TEXT,
    location INTEGER,
    depth    INTEGER,
    sample   TEXT,
    FOREIGN KEY (sample) REFERENCES Sample (sample)
    );"""

query_insert = ("""
    INSERT OR IGNORE INTO Coverage
        (chrom, location, depth, sample)
    VALUES 
        (?,?,?,?)""")
```

In [19]:
%%bash
source ../config/config_sing.sh
PREFIX="Tewhey_K562_TileMPRA"
FOLDER="coverage"
REGION="GATA1"
#SAMPLE="Input_rep1"
STRAND="stranded_pos"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_hg38_20210130.GATA1.stranded_pos.Input.perbase.tsv.gz /mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_hg38_20210130.GATA1.stranded_pos.Output.perbase.tsv.gz
chrX	47786400	47786401	.
chrX	47786401	47786402	2117.787
chrX	47786402	47786403	2117.787
chrX	47786403	47786404	2117.787
chrX	47786404	47786405	2117.787
chrX	47786405	47786406	2117.787
chrX	47786406	47786407	2117.787
chrX	47786407	47786408	2117.787
chrX	47786408	47786409	2117.787
chrX	47786409	47786410	2117.787


In [21]:
### helper function to process each row
def prep_line(line, sample):
    """Function to process each line"""
    return [*line, SAMPLE]

In [22]:
FOLDER="coverage"
REGION="GATA1"
SAMPLE="Merged"
STRAND="stranded_pos"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{REGION}*{STRAND}*perbase*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = partial(prep_line, sample=SAMPLE)
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_hg38_20210130.GATA1.stranded_pos.Input.perbase.tsv.gz', '/mount/work/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_hg38_20210130.GATA1.stranded_pos.Output.perbase.tsv.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786400', '47786401', '.']
['chrX', '47786401', '47786402', '2117.787']
['chrX', '47786402', '47786403', '2117.787']
['chrX', '47786403', '47786404', '2117.787']
['chrX', '47786404', '47786405', '2117.787']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786401', '47786402', '2117.787', 'Merged']
['chrX', '47786402', '47786403', '2117.787', 'Merged']
['chrX', '47786403', '47786404', '2117.787', 'Merged']
['chrX', '47786404', '47786405', '2117.787', 'Merged']
['chrX', '47786405', '47786406', '2117.787', 'Merged']
