**Set environment**

In [1]:
### set environment
import sys
sys.path.append('../config')
from config_sing import *
show_env()

### import more tools
import csv
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc

Library imported:
    numpy, pandas, itertools,
    os, sys, time, gzip, glob,
    functools.partial/reduce,
    collections.Counter,
    matplotlib.pyplot



In [2]:
PREFIX  = "KS91_K562_ASTARRseq"
#PREFIX = "A001_K562_WSTARRseq"
#PREFIX = "Tewhey_K562_TileMPRA"

In [3]:
def gen_lines(file, n_lines=10, remove_header=False):
    """generate lines lines from the file"""
    ### read lines using csv reader
    lines = csv.reader(file, delimiter='\t', quotechar='|')
    
    ### remove file header if needed
    if remove_header:
        next(lines)
    
    ### specify number of lines to generate
    lines = it.islice(lines, 0, n_lines)
    for line in lines:
        yield line

## Sample
```
query_table = ("""CREATE TABLE IF NOT EXISTS Sample(
    sample    TEXT PRIMARY KEY, 
    treatment TEXT,
    replicate TEXT,
    size      INTEGER
);""")

query_insert = ("""
    INSERT OR IGNORE INTO Sample
        (sample, treatment, replicate, size) 
    VALUES 
        (?,?,?,?)
    """)
```

In [4]:
fdiry = os.path.join(FD_RES, PREFIX, "coverage", "library_size")
fname = "library_size_summary.csv"
fpath = os.path.join(fdiry, fname)

dat_sample = pd.read_table(fpath, sep=",")
print(dat_sample.dtypes)
dat_sample

Sample    object
Group     object
Rep       object
Count      int64
Fpath     object
dtype: object


Unnamed: 0,Sample,Group,Rep,Count,Fpath
0,Input_rep1,Input,rep1,348695063,KS91_K562_hg38_ASTARRseq_Input_rep1.masked.ded...
1,Input_rep2,Input,rep2,451369741,KS91_K562_hg38_ASTARRseq_Input_rep2.masked.ded...
2,Input_rep3,Input,rep3,487579055,KS91_K562_hg38_ASTARRseq_Input_rep3.masked.ded...
3,Input_rep4,Input,rep4,456246254,KS91_K562_hg38_ASTARRseq_Input_rep4.masked.ded...
4,Input_rep5,Input,rep5,444268950,KS91_K562_hg38_ASTARRseq_Input_rep5.masked.ded...
5,Input_rep6,Input,rep6,397333562,KS91_K562_hg38_ASTARRseq_Input_rep6.masked.ded...
6,Output_rep1,Output,rep1,44103844,KS91_K562_hg38_ASTARRseq_Output_rep1.f3q10.fra...
7,Output_rep2,Output,rep2,97471282,KS91_K562_hg38_ASTARRseq_Output_rep2.f3q10.fra...
8,Output_rep3,Output,rep3,84103298,KS91_K562_hg38_ASTARRseq_Output_rep3.f3q10.fra...
9,Output_rep4,Output,rep4,183115379,KS91_K562_hg38_ASTARRseq_Output_rep4.f3q10.fra...


## Fragments (NUC)
```
query_reset_table = "DROP TABLE IF EXISTS Fragment"
query_table = ("""
    CREATE TABLE IF NOT EXISTS Fragment(
        fragment TEXT PRIMARY KEY, 
        chrom    TEXT,
        start    INTEGER,
        end      INTEGER,
        pct_at   REAL,
        pct_gc   REAL,
        num_A    INTEGER,
        num_C    INTEGER,
        num_G    INTEGER,
        num_T    INTEGER,
        num_N    INTEGER,
        num_oth  INTEGER
    );""")
    
query_insert = ("""
    INSERT OR IGNORE INTO Fragment
        (fragment, chrom, start, end, pct_at, pct_gc,
         num_A, num_C, num_G, num_T, num_N, num_oth) 
    VALUES 
        (?,?,?,?,?,?,?,?,?,?,?,?)
    """)
```

In [5]:
%%bash
source ../config/config_sing.sh
PREFIX="KS91_K562_ASTARRseq"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"
FOLDER="fragment_nuc"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_nuc/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed.gz
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_pct_gc	9_num_A	10_num_C	11_num_G	12_num_T	13_num_N	14_num_oth	15_seq_len
chrX	47787165	47787363	chrX_47787165_47787363	1	.	0.560606	0.439394	33	34	53	78	0	0	198
chrX	47787557	47787773	chrX_47787557_47787773	1	.	0.615741	0.384259	45	35	48	88	0	0	216
chrX	47787593	47787783	chrX_47787593_47787783	1	.	0.636842	0.363158	42	32	37	79	0	0	190
chrX	47787598	47787772	chrX_47787598_47787772	1	.	0.637931	0.362069	37	28	35	74	0	0	174
chrX	47787661	47788058	chrX_47787661_47788058	1	.	0.594458	0.405542	76	72	89	160	0	0	397
chrX	47787743	47787849	chrX_47787743_47787849	1	.	0.528302	0.471698	15	24	26	41	0	0	106
chrX	47787998	47788231	chrX_47787998_47788231	1	.	0.536481	0.463519	35	57	51	90	0	0	233
chrX	47788008	47788157	chrX_47788008_47788157	1	.	0.543624	0.456376	23	34	34	58	0	0	149
chrX	47788008	

In [6]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, *line[0:3], *line[6:14]]
    return val

In [7]:
FOLDER="fragment_nuc"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_nuc/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['#1_usercol', '2_usercol', '3_usercol', '4_usercol', '5_usercol', '6_usercol', '7_pct_at', '8_pct_gc', '9_num_A', '10_num_C', '11_num_G', '12_num_T', '13_num_N', '14_num_oth', '15_seq_len']
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', '0.560606', '0.439394', '33', '34', '53', '78', '0', '0', '198']
['chrX', '47787557', '47787773', 'chrX_47787557_47787773', '1', '.', '0.615741', '0.384259', '45', '35', '48', '88', '0', '0', '216']
['chrX', '47787593', '47787783', 'chrX_47787593_47787783', '1', '.', '0.636842', '0.363158', '42', '32', '37', '79', '0', '0', '190']
['chrX', '47787598', '47787772', 'chrX_47787598_47787772', '1', '.', '0.637931', '0.362069', '37', '28', '35', '74', '0', '0', '174']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47787165_47787363', 'c

## Motif
```
query_reset_table = "DROP TABLE IF EXISTS Motif"
query_table = """
    CREATE TABLE IF NOT EXISTS Motif(
        binding TEXT PRIMARY KEY, 
        chrom   TEXT,
        start   INTEGER,
        end     INTEGER,
        motif   TEXT,
        score   REAL
    );"""
query_insert = """
    INSERT OR IGNORE INTO Motif 
        (binding,chrom,start,end,motif,score)
    VALUES 
        (?,?,?,?,?,?)
    """
```

In [11]:
%%bash
source ../config/config_sing.sh 
CHROM="chrX"
FDIRY="${FD_ANN}/motif_cluster_jvierstra/hg38_archetype_motifs_v1"
FNAME="${CHROM}_rm_mouse_merge.bed.gz"
FPATH="${FDIRY}/${FNAME}"
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
chrX	10006	10041	KLF/SP/2	3.257775
chrX	10018	10038	GC-tract	6.314
chrX	10025	10039	NR/3	7.0957
chrX	10035	10046	PRDM1	8.1562
chrX	10046	10061	MAF	7.1564
chrX	10047	10057	NFY	7.6173
chrX	10053	10086	KLF/SP/2	7.89956
chrX	10055	10073	TBX/3	7.6294
chrX	10055	10066	ETS/2	10.5061
chrX	10056	10078	GC-tract	7.8744


In [25]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Motif, Score
    key = f"{line[0]}_{line[1]}_{line[2]}_{line[3]}"
    val = [key, *line]
    return val

In [26]:
CHROM="chrX"
fpath  = os.path.join(
    FD_ANN, 
    "motif_cluster_jvierstra", 
    "hg38_archetype_motifs_v1", 
    f"{CHROM}_rm_mouse_merge.bed.gz")
print(fpath)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=False)
    lines = map(fun, lines)
    for line in lines:
        print(line)

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chrX_rm_mouse_merge.bed.gz
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '10006', '10041', 'KLF/SP/2', '3.257775']
['chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX', '10046', '10061', 'MAF', '7.1564']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_10006_10041_KLF/SP/2', 'chrX', '10006', '10041', 'KLF/SP/2', '3.257775']
['chrX_10018_10038_GC-tract', 'chrX', '10018', '10038', 'GC-tract', '6.314']
['chrX_10025_10039_NR/3', 'chrX', '10025', '10039', 'NR/3', '7.0957']
['chrX_10035_10046_PRDM1', 'chrX', '10035', '10046', 'PRDM1', '8.1562']
['chrX_10046_10061_MAF', 'chrX', '10046', '10061', 'MAF', '7.1564']


## Counts
```
query_reset_table = "DROP TABLE IF EXISTS Count"

query_table = """
    CREATE TABLE IF NOT EXISTS Count (
        fragment TEXT, 
        sample   TEXT,
        count    INTEGER,
        FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
        FOREIGN KEY (sample)   REFERENCES Sample   (sample)
    );"""

query_insert = ("""INSERT OR IGNORE INTO Count
    (fragment, sample, count)
    VALUES 
    (?,?,?)""")
```

In [8]:
%%bash
source ../config/config_sing.sh 
PREFIX="KS91_K562_ASTARRseq"
FOLDER="fragment"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"
FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
head ${FPATH}

/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed
chrX	47787165	47787363	chrX_47787165_47787363	1	.
chrX	47787557	47787773	chrX_47787557_47787773	1	.
chrX	47787593	47787783	chrX_47787593_47787783	1	.
chrX	47787598	47787772	chrX_47787598_47787772	1	.
chrX	47787661	47788058	chrX_47787661_47788058	1	.
chrX	47787743	47787849	chrX_47787743_47787849	1	.
chrX	47787998	47788231	chrX_47787998_47788231	1	.
chrX	47788008	47788157	chrX_47788008_47788157	1	.
chrX	47788008	47788181	chrX_47788008_47788181	1	.
chrX	47788140	47788547	chrX_47788140_47788547	1	.


In [27]:
### helper function to process each row
def prep_line(line, sample):
    """Function to process each line"""
    key = f"{line[0]}_{line[1]}_{line[2]}"
    val = [key, sample, line[4]]
    return val

In [28]:
FOLDER="fragment"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = partial(prep_line, sample=SAMPLE)
with open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.']
['chrX', '47787557', '47787773', 'chrX_47787557_47787773', '1', '.']
['chrX', '47787593', '47787783', 'chrX_47787593_47787783', '1', '.']
['chrX', '47787598', '47787772', 'chrX_47787598_47787772', '1', '.']
['chrX', '47787661', '47788058', 'chrX_47787661_47788058', '1', '.']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX_47787557_47787773', 'Input_rep1', '1']
['chrX_47787593_47787783', 'Input_rep1', '1']
['chrX_47787598_47787772', 'Input_rep1', '1']
['chrX_47787661_47788058', 'Input_rep1', '1']
['chrX_47787743_47787849', 'Input_rep1', '1']


## Annotation
```
query_reset_table = "DROP TABLE IF EXISTS Annotation"

query_table = """
    CREATE TABLE IF NOT EXISTS Annotation (
        fragment TEXT, 
        binding  TEXT,
        FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
        FOREIGN KEY (binding)  REFERENCES Motif    (binding),
        UNIQUE (fragment, binding) ON CONFLICT IGNORE
    );"""

query_insert = """
    INSERT OR IGNORE INTO Annotation
        (fragment, binding)
    VALUES 
        (?,?)
    """
```

In [14]:
%%bash
source ../config/config_sing.sh
PREFIX="KS91_K562_ASTARRseq"
FOLDER="annotation"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed.gz
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787180	47787190	FOX/8	5.1151	10
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787182	47787192	HINFP1/1	8.8688	10
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787194	47787216	ZNF146	1.8019	22
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787196	47787211	NR/15	8.0507	15
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787229	47787243	NR/20	8.1423	14
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787257	47787267	FOX/4	8.8403	10
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787260	47787276	TBX/1	11.3184	16
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787261	47787294	TBX/4	7.42815	33
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	47787261	47787274	ZIC	6.6872	13
chrX	47787165	47787363	chrX_47787165_47787363	1	.	chrX	477872

In [29]:
### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### parse info: Chrom, Start, Stop, Name, Score, Strand
    fragment = f"{line[0]}_{line[1]}_{line[2]}"
    binding  = f"{line[6]}_{line[7]}_{line[8]}_{line[9]}"
    return fragment, binding

In [30]:
PREFIX="KS91_K562_ASTARRseq"
FOLDER="annotation"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = prep_line
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.bed.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', 'chrX', '47787180', '47787190', 'FOX/8', '5.1151', '10']
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', 'chrX', '47787182', '47787192', 'HINFP1/1', '8.8688', '10']
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', 'chrX', '47787194', '47787216', 'ZNF146', '1.8019', '22']
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', 'chrX', '47787196', '47787211', 'NR/15', '8.0507', '15']
['chrX', '47787165', '47787363', 'chrX_47787165_47787363', '1', '.', 'chrX', '47787229', '47787243', 'NR/20', '8.1423', '14']
+++++++++++++++++++++++++++++++++++++++++++++++++++
('chrX_47787165_47787363', 'chrX_47787182_47787192_HINFP1/1')
('chrX_47787165_47787363', 'chrX_47787194_47787216_ZNF146')
('

## Coverage
```
query_reset_table = "DROP TABLE IF EXISTS Coverage"

query_table = """CREATE TABLE IF NOT EXISTS Coverage(
    chrom    TEXT,
    location INTEGER,
    depth    INTEGER,
    sample   TEXT,
    FOREIGN KEY (sample) REFERENCES Sample (sample)
    );"""

query_insert = """
    INSERT OR IGNORE INTO Coverage
        (chrom, location, depth, sample)
    VALUES 
        (?,?,?,?)
    """
```

In [17]:
%%bash
source ../config/config_sing.sh
PREFIX="KS91_K562_ASTARRseq"
FOLDER="coverage"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

FPATH=$(ls ${FD_RES}/${PREFIX}/${FOLDER}/*${SAMPLE}*${REGION}*${STRAND}*)
echo ${FPATH}
zcat ${FPATH} | head

/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz /mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.total_count.tsv
chrX	47786500	47786501	.
chrX	47786501	47786502	.
chrX	47786502	47786503	.
chrX	47786503	47786504	.
chrX	47786504	47786505	.
chrX	47786505	47786506	.
chrX	47786506	47786507	.
chrX	47786507	47786508	.
chrX	47786508	47786509	.
chrX	47786509	47786510	.


In [35]:
### helper function to process each row
def prep_line(line, sample):
    """Function to process each line"""
    return [line[0], line[1], line[3], sample]

In [36]:
FOLDER="coverage"
REGION="GATA1"
SAMPLE="Input_rep1"
STRAND="unstranded"

fglob  = os.path.join(FD_RES, PREFIX, FOLDER, f"*{SAMPLE}*{REGION}*{STRAND}*perbase*")
fpaths = glob.glob(fglob)
fpath  = fpaths[0]
print(fpaths)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5)
    for line in lines:
        print(line)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
fun = partial(prep_line, sample=SAMPLE)
with gzip.open(fpath, "rt") as file:
    lines = gen_lines(file, n_lines=5, remove_header=True)
    lines = map(fun, lines)
    for line in lines:
        print(line)

['/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786500', '47786501', '.']
['chrX', '47786501', '47786502', '.']
['chrX', '47786502', '47786503', '.']
['chrX', '47786503', '47786504', '.']
['chrX', '47786504', '47786505', '.']
+++++++++++++++++++++++++++++++++++++++++++++++++++
['chrX', '47786501', '.', 'Input_rep1']
['chrX', '47786502', '.', 'Input_rep1']
['chrX', '47786503', '.', 'Input_rep1']
['chrX', '47786504', '.', 'Input_rep1']
['chrX', '47786505', '.', 'Input_rep1']


In [60]:
import re
string="/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_nuc/KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed.gz"
pattern="(Input|Output)_rep."
result = re.search(pattern, string)
print(result)
result.group()

<re.Match object; span=(109, 120), match='Output_rep2'>


'Output_rep2'

In [65]:
def get_sample(string):
    pattern = "(Input|Output)_rep."
    result  = re.search(pattern, string)
    if result:
        return result.group()
    else:
        return None

string="/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_nuc/KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed.gz"
get_sample(string)

'Output_rep2'

In [66]:

string="/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/"
get_sample(string)

In [68]:
def prep_line_coverage(line, sample=None):
    """Function to process each line"""
    return [line[0], line[1], line[3], sample]
prep_line_coverage([1,2,3,4,5])

[1, 2, 4, None]