# Database: fragment table

**Set Environment**

In [1]:
### basic
import sys
sys.path.append('../')
from config_sing import *

### specific tools
from functools import reduce
import itertools as it
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

### global variables
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment_chr17.db"
FPATH_DB = os.path.join(fdiry, fname)

### Set Samples
fun = np.core.defchararray.add
idx = np.arange(1,6).astype("str")

INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT, INPUT20X, TFX_DMSO, TFX_DEX])

You are on Duke Server: Singularity: Proj CombEffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect



**Check data location**

In [2]:
fdiry = os.path.join(FD_RES, "nuc")
print(os.listdir(fdiry))

['Input5_20x', 'Input1_20x', 'TFX2_DMSO', 'Input3', 'TFX5_Dex', 'Input4_20x', 'TFX2_Dex', 'TFX5_DMSO', 'TFX4_Dex', 'Input4', 'Input5', 'Input1', 'Input2', 'TFX3_Dex', 'TFX4_DMSO', 'Input2_20x', 'TFX3_DMSO', 'Input3_20x']


In [3]:
fdiry = os.path.join(FD_RES, "nuc", "Input1_20x")
os.listdir(fdiry)

['target_PER1.bed.gz',
 'chr20.bed.gz',
 'chr12.bed.gz',
 'chr3.bed.gz',
 'chr6.bed.gz',
 'chrY.bed.gz',
 'chr5.bed.gz',
 'chr2.bed.gz',
 'chr19.bed.gz',
 'chr22.bed.gz',
 'chr9.bed.gz',
 'chr17.bed.gz',
 'chr16.bed.gz',
 'chr4.bed.gz',
 'chr1.bed.gz',
 'chr15.bed.gz',
 'chr10.bed.gz',
 'chr8.bed.gz',
 'chr18.bed.gz',
 'chr13.bed.gz',
 'chrX.bed.gz',
 'chr7.bed.gz',
 'chr11.bed.gz',
 'chr14.bed.gz',
 'chr21.bed.gz']

In [4]:
%%script env FDIRY=$fdiry bash
zcat ${FDIRY}/chr17.bed.gz | head -3

#1_usercol	2_usercol	3_usercol	4_usercol	5_pct_at	6_pct_gc	7_num_A	8_num_C	9_num_G	10_num_T	11_num_N	12_num_oth	13_seq_len
chr17	107410	108464	1	0.512334	0.487666	342	243	271	198	0	0	1054
chr17	159026	160040	1	0.506903	0.493097	286	259	241	228	0	0	1014


## Test read in data

In [5]:
fdiry = os.path.join(FD_RES, "nuc", "Input1_20x")
fname = "chr17.bed.gz"
fpath = os.path.join(fdiry, fname)

n_lines = 5
with gzip.open(fpath, "rb") as file:
    header = file.readline().decode('ASCII').strip().split('\t')
    print(header)
    
    lines  = it.islice(file, n_lines)
    for line in lines:
        lst = line.decode('ASCII').strip().split('\t')  
        print(lst)

['#1_usercol', '2_usercol', '3_usercol', '4_usercol', '5_pct_at', '6_pct_gc', '7_num_A', '8_num_C', '9_num_G', '10_num_T', '11_num_N', '12_num_oth', '13_seq_len']
['chr17', '107410', '108464', '1', '0.512334', '0.487666', '342', '243', '271', '198', '0', '0', '1054']
['chr17', '159026', '160040', '1', '0.506903', '0.493097', '286', '259', '241', '228', '0', '0', '1014']
['chr17', '159426', '160303', '1', '0.491448', '0.508552', '224', '247', '199', '207', '0', '0', '877']
['chr17', '159510', '160362', '1', '0.497653', '0.502347', '222', '231', '197', '202', '0', '0', '852']
['chr17', '159977', '160849', '1', '0.458716', '0.541284', '198', '229', '243', '202', '0', '0', '872']


In [6]:
def prep_line(lst):
    key = "_".join(lst[0:3])
    val = lst[0:3] + lst[4:-1]
    return [key] + val

In [7]:
fdiry = os.path.join(FD_RES, "nuc", "Input1_20x")
fname = "target_PER1.bed.gz"
fpath = os.path.join(fdiry, fname)

n_lines = 5

with gzip.open(fpath, "rb") as file:
    header = file.readline()
    lines  = it.islice(file, n_lines)
    for line in lines:
        lst = line.decode('ASCII').strip().split('\t')  
        print(lst)
        print(prep_line(lst))
        print()

['chr17', '8148003', '8148983', '3', '0.411224', '0.588776', '213', '288', '289', '190', '0', '0', '980']
['chr17_8148003_8148983', 'chr17', '8148003', '8148983', '0.411224', '0.588776', '213', '288', '289', '190', '0', '0']

['chr17', '8148004', '8148925', '1', '0.412595', '0.587405', '200', '272', '269', '180', '0', '0', '921']
['chr17_8148004_8148925', 'chr17', '8148004', '8148925', '0.412595', '0.587405', '200', '272', '269', '180', '0', '0']

['chr17', '8148004', '8148962', '1', '0.412317', '0.587683', '208', '283', '280', '187', '0', '0', '958']
['chr17_8148004_8148962', 'chr17', '8148004', '8148962', '0.412317', '0.587683', '208', '283', '280', '187', '0', '0']

['chr17', '8148004', '8148963', '1', '0.411887', '0.588113', '208', '284', '280', '187', '0', '0', '959']
['chr17_8148004_8148963', 'chr17', '8148004', '8148963', '0.411887', '0.588113', '208', '284', '280', '187', '0', '0']

['chr17', '8148005', '8149014', '1', '0.412289', '0.587711', '222', '297', '296', '194', '0', '0

note: plan next: unique constraint and unique index?

In [8]:
query_reset_table = "DROP TABLE IF EXISTS Fragment"
query_reset_index = "DROP INDEX IF EXISTS idx_frag_loc"

query_table = ("""
    CREATE TABLE IF NOT EXISTS Fragment(
        fragment TEXT PRIMARY KEY, 
        chrom    TEXT,
        start    INTEGER,
        end      INTEGER,
        pct_at   REAL,
        pct_gc   REAL,
        num_A    INTEGER,
        num_C    INTEGER,
        num_G    INTEGER,
        num_T    INTEGER,
        num_N    INTEGER,
        num_oth  INTEGER
    );""")

query_index  = """CREATE INDEX idx_frag_loc ON Fragment (start, end)"""
query_insert = ("""
    INSERT OR IGNORE INTO Fragment
        (fragment, chrom, start, end, pct_at, pct_gc,
         num_A, num_C, num_G, num_T, num_N, num_oth) 
    VALUES 
        (?,?,?,?,?,?,?,?,?,?,?,?)
    """)

In [9]:
fdiry = os.path.join(FD_RES, "nuc", "Input1_20x")
fname = "target_PER1.bed.gz"
fpath_gz = os.path.join(fdiry, fname)
fpath_db = FPATH_DB

with sqlite3.connect(fpath_db) as conn, gzip.open(fpath_gz, "rb") as file:
    ### reset
    cursor = conn.cursor()
    query  = query_reset_table
    cursor.execute(query)
    query  = query_reset_index
    cursor.execute(query)
    
    ### create table
    cursor = conn.cursor()
    query  = query_table
    cursor.execute(query)
    
    ### insert values
    query  = query_insert
    n_lines = 5
    lines = it.islice(file, n_lines)
    for line in lines:
        lst = line.decode('ASCII').strip().split('\t')  
        lst = prep_line(lst)
        cursor.execute(query, lst)
    
    ### show that the table is created
    cursor.execute("SELECT * FROM Fragment")
    for row in cursor.fetchall():
        print(row)

('#1_usercol_2_usercol_3_usercol', '#1_usercol', '2_usercol', '3_usercol', '5_pct_at', '6_pct_gc', '7_num_A', '8_num_C', '9_num_G', '10_num_T', '11_num_N', '12_num_oth')
('chr17_8148003_8148983', 'chr17', 8148003, 8148983, 0.411224, 0.588776, 213, 288, 289, 190, 0, 0)
('chr17_8148004_8148925', 'chr17', 8148004, 8148925, 0.412595, 0.587405, 200, 272, 269, 180, 0, 0)
('chr17_8148004_8148962', 'chr17', 8148004, 8148962, 0.412317, 0.587683, 208, 283, 280, 187, 0, 0)
('chr17_8148004_8148963', 'chr17', 8148004, 8148963, 0.411887, 0.588113, 208, 284, 280, 187, 0, 0)


## Insert the whole files

In [10]:
print(SAMPLES)

['Input1' 'Input2' 'Input3' 'Input4' 'Input5' 'Input1_20x' 'Input2_20x'
 'Input3_20x' 'Input4_20x' 'Input5_20x' 'TFX2_DMSO' 'TFX3_DMSO'
 'TFX4_DMSO' 'TFX5_DMSO' 'TFX2_Dex' 'TFX3_Dex' 'TFX4_Dex' 'TFX5_Dex']


In [11]:
fdiry = os.path.join(FD_RES, "nuc")
fname = "chr17.bed.gz"

#samples = np.sort(os.listdir(fdiry))
samples = SAMPLES
for sam in samples:
    fpath = os.path.join(fdiry, sam, fname)
    print(fpath)

/mount/work/out/proj_combeffect/nuc/Input1/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input2/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input3/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input4/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input5/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input1_20x/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input2_20x/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input3_20x/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input4_20x/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/Input5_20x/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX2_DMSO/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX3_DMSO/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX4_DMSO/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX5_DMSO/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX2_Dex/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX3_Dex/chr17.bed.gz
/mount/work/out/proj_combeffect/nuc/TFX4_Dex/chr17.bed.gz
/mount/wor

In [12]:
%%time

fdiry = os.path.join(FD_RES, "nuc")
fname = "chr17.bed.gz"
samples   = SAMPLES #np.sort(os.listdir(fdiry))
fpaths_gz = [os.path.join(fdiry, sam, fname) for sam in samples]
fpath_db  = FPATH_DB


with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    count  = 0
    
    ### reset
    cursor = conn.cursor()
    query  = query_reset_table
    cursor.execute(query)
    query  = query_reset_index
    cursor.execute(query)

    ### create table
    cursor = conn.cursor()
    query  = query_table
    cursor.execute(query)
    
    for fpath_gz in fpaths_gz:
        print(fpath_gz)
        
        with gzip.open(fpath_gz, "rb") as file:

            ### insert values
            query  = query_insert
            header = file.readline()
            lines  = file #it.islice(file, n_lines)
            for idx, line in enumerate(lines):
                lst = line.decode('ASCII').strip().split('\t')  
                lst = prep_line(lst)
                cursor.execute(query, lst)
                
            print("# Fragments:", idx + 1)
            count += (idx + 1)
            
    print("\n# Fragments (Total):", count)

    
    ### create index
    print("Create index")
    cursor = conn.cursor()
    query  = query_index
    cursor = cursor.execute(query)
    print("Done!")

/mount/work/out/proj_combeffect/nuc/Input1/chr17.bed.gz
# Fragments: 618142
/mount/work/out/proj_combeffect/nuc/Input2/chr17.bed.gz
# Fragments: 667423
/mount/work/out/proj_combeffect/nuc/Input3/chr17.bed.gz
# Fragments: 774503
/mount/work/out/proj_combeffect/nuc/Input4/chr17.bed.gz
# Fragments: 625978
/mount/work/out/proj_combeffect/nuc/Input5/chr17.bed.gz
# Fragments: 502807
/mount/work/out/proj_combeffect/nuc/Input1_20x/chr17.bed.gz
# Fragments: 9004450
/mount/work/out/proj_combeffect/nuc/Input2_20x/chr17.bed.gz
# Fragments: 8581707
/mount/work/out/proj_combeffect/nuc/Input3_20x/chr17.bed.gz
# Fragments: 8651887
/mount/work/out/proj_combeffect/nuc/Input4_20x/chr17.bed.gz
# Fragments: 9926587
/mount/work/out/proj_combeffect/nuc/Input5_20x/chr17.bed.gz
# Fragments: 8446304
/mount/work/out/proj_combeffect/nuc/TFX2_DMSO/chr17.bed.gz
# Fragments: 1412903
/mount/work/out/proj_combeffect/nuc/TFX3_DMSO/chr17.bed.gz
# Fragments: 880072
/mount/work/out/proj_combeffect/nuc/TFX4_DMSO/chr17.bed.

In [12]:
%%time

fdiry = os.path.join(FD_RES, "nuc")
fname = "chr17.bed.gz"
samples   = SAMPLES #np.sort(os.listdir(fdiry))
fpaths_gz = [os.path.join(fdiry, sam, fname) for sam in samples]
fpath_db  = FPATH_DB


with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    count  = 0
    
    ### reset
    cursor = conn.cursor()
    query  = query_reset_table
    cursor.execute(query)
    query  = query_reset_index
    cursor.execute(query)

    ### create table
    cursor = conn.cursor()
    query  = query_table
    cursor.execute(query)
    
    for fpath_gz in fpaths_gz:
        print(fpath_gz)
        
        with gzip.open(fpath_gz, "rb") as file:

            ### insert values
            query  = query_insert
            header = file.readline()
            lines  = file #it.islice(file, n_lines)
            for idx, line in enumerate(lines):
                lst = line.decode('ASCII').strip().split('\t')  
                lst = prep_line(lst)
                cursor.execute(query, lst)
                
            print("# Fragments:", idx + 1)
            count += (idx + 1)
            
    print("\n# Fragments (Total):", count)

    
    ### create index
    print("Create index")
    cursor = conn.cursor()
    query  = query_index
    cursor = cursor.execute(query)
    print("Done!")

/mount/work/out/proj_combeffect/nuc/Input1/chr17.bed.gz
# Fragments 618142
/mount/work/out/proj_combeffect/nuc/Input2/chr17.bed.gz
# Fragments 667423
/mount/work/out/proj_combeffect/nuc/Input3/chr17.bed.gz
# Fragments 774503
/mount/work/out/proj_combeffect/nuc/Input4/chr17.bed.gz
# Fragments 625978
/mount/work/out/proj_combeffect/nuc/Input5/chr17.bed.gz
# Fragments 502807
/mount/work/out/proj_combeffect/nuc/Input1_20x/chr17.bed.gz
# Fragments 9004450
/mount/work/out/proj_combeffect/nuc/Input2_20x/chr17.bed.gz
# Fragments 8581707
/mount/work/out/proj_combeffect/nuc/Input3_20x/chr17.bed.gz
# Fragments 8651887
/mount/work/out/proj_combeffect/nuc/Input4_20x/chr17.bed.gz
# Fragments 9926587
/mount/work/out/proj_combeffect/nuc/Input5_20x/chr17.bed.gz
# Fragments 8446304
/mount/work/out/proj_combeffect/nuc/TFX2_DMSO/chr17.bed.gz
# Fragments 1412903
/mount/work/out/proj_combeffect/nuc/TFX3_DMSO/chr17.bed.gz
# Fragments 880072
/mount/work/out/proj_combeffect/nuc/TFX4_DMSO/chr17.bed.gz
# Fragmen

**Check**

In [5]:
fpath_db = FPATH_DB
print(fpath_db)
with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    query = "select count(*) from Fragment"
    cursor.execute(query)
    print(cursor.fetchall())

/mount/work/out/proj_combeffect/database/fragment_chr17.db
[(1425854,)]


In [7]:
%%time
query_reset_index = "DROP INDEX IF EXISTS idx_frag_loc"
query_index       = "CREATE INDEX idx_frag_loc ON Fragment (start, end)"

with sqlite3.connect(fpath_db) as conn:
    cursor = conn.cursor()
    print("reset index")
    query  = query_reset_index
    cursor = cursor.execute(query)
    
    print("create index")
    query  = query_index
    cursor = cursor.execute(query)
    
    print("Done!")

reset index
create index
Done!
CPU times: user 958 ms, sys: 1.24 s, total: 2.2 s
Wall time: 45 s


In [20]:
%%bash
FDIRY="/mount/work/out/proj_combeffect/nuc"
FD_BEDS=($(ls -d ${FDIRY}/{Input?,Input?_20x,TFX?_DMSO,TFX?_Dex}/))
FN_BED="chr17.bed.gz"

sum=0
for FD_BED in ${FD_BEDS[@]}; do
    count=$(zcat ${FD_BED}/${FN_BED} | wc -l)
    echo $count
    sum=$((sum + count))
done
echo
echo $sum

169
3030
191
2934
259
2965
178
3273
128
2813
1720
734
1114
495
1520
510
1832
639

24504


In [22]:
%%bash
FD_BED="/mount/work/out/proj_combeffect/nuc"
FN_BED="target_PER1.bed.gz"

zcat ${FD_BED}/*/${FN_BED} | wc -l

24504


**Check**

In [33]:
fdiry = os.path.join(FD_RES, "nuc")
fname = "target_PER1.bed.gz"
samples = np.sort(os.listdir(fdiry))
fpaths = [os.path.join(fdiry, sam, fname) for sam in samples]

###
lst = list()
for fpath in fpaths:
    dat = pd.read_table(fpath)
    lst.append(dat)
    print(fpath)
    print(dat.shape)

###
dat = pd.concat(lst)
print(dat.shape)

###
tmp = dat.drop(columns="4_usercol").drop_duplicates()
print(tmp.shape)

/mount/work/out/proj_combeffect/nuc/Input1/target_PER1.bed.gz
(168, 13)
/mount/work/out/proj_combeffect/nuc/Input1_20x/target_PER1.bed.gz
(3029, 13)
/mount/work/out/proj_combeffect/nuc/Input2/target_PER1.bed.gz
(190, 13)
/mount/work/out/proj_combeffect/nuc/Input2_20x/target_PER1.bed.gz
(2933, 13)
/mount/work/out/proj_combeffect/nuc/Input3/target_PER1.bed.gz
(258, 13)
/mount/work/out/proj_combeffect/nuc/Input3_20x/target_PER1.bed.gz
(2964, 13)
/mount/work/out/proj_combeffect/nuc/Input4/target_PER1.bed.gz
(177, 13)
/mount/work/out/proj_combeffect/nuc/Input4_20x/target_PER1.bed.gz
(3272, 13)
/mount/work/out/proj_combeffect/nuc/Input5/target_PER1.bed.gz
(127, 13)
/mount/work/out/proj_combeffect/nuc/Input5_20x/target_PER1.bed.gz
(2812, 13)
/mount/work/out/proj_combeffect/nuc/TFX2_DMSO/target_PER1.bed.gz
(733, 13)
/mount/work/out/proj_combeffect/nuc/TFX2_Dex/target_PER1.bed.gz
(1719, 13)
/mount/work/out/proj_combeffect/nuc/TFX3_DMSO/target_PER1.bed.gz
(494, 13)
/mount/work/out/proj_combeffec