# Count table: linking fragments and samples

**Set environment**

In [1]:
import sys
sys.path.append('../')
from config_sing import *

You are on Duke Server: Singularity: Proj CombEffect
BASE DIRECTORY:     /mount/work
PATH OF SOURCE:     /mount/work/source
PATH OF EXECUTABLE: /mount/work/exe
PATH OF ANNOTATION: /mount/work/annotation
PATH OF PROJECT:    /mount/project
PATH OF RESULTS:    /mount/work/out/proj_combeffect


In [2]:
import sqlite3
import itertools as it

## Test: check where the data is

In [3]:
# Top-level results folder for the fragment counts; listing it shows which
# per-sample sub-folders and summary files are present.
fdiry = os.path.join(FD_RES, "count_fragment")
os.listdir(fdiry)

['TFX_DMSO',
 'Input5_20x',
 'Input1_20x',
 'TFX2_DMSO',
 'TFX_Dex',
 'dat_cnt_dex_chr17.csv',
 'Input3',
 '.ipynb_checkpoints',
 'TFX5_Dex',
 'dat_cnt_dmso_chr17.csv',
 'dat_cnt_input_dex_PER1.csv',
 'Input4_20x',
 'dat_cnt_input_per1.csv',
 'TFX2_Dex',
 'TFX5_DMSO',
 'TFX4_Dex',
 'Input4',
 'Input5',
 'dat_cnt_input_dex_PER1.tsv',
 'dat_cnt_input_dmso_PER1.csv',
 'Input1',
 'Input2',
 'dat_cnt_input_chr17.csv',
 'Input',
 'TFX3_Dex',
 'dat_cnt_dex_per1.csv',
 'dat_cnt_dmso_per1.csv',
 'TFX4_DMSO',
 'Input2_20x',
 'TFX3_DMSO',
 'Input3_20x',
 'dat_cnt_input_dmso_PER1.tsv']

In [4]:
# Look inside one sample folder (Input1_20x) to see which files it contains.
fdiry = os.path.join(FD_RES, "count_fragment", "Input1_20x")
os.listdir(fdiry)

['target_PER1.bed.gz',
 'chr20.bed.gz',
 'chr12.bed.gz',
 'chr3.bed.gz',
 'chr6.bed.gz',
 'chrY.bed.gz',
 'chr5.bed.gz',
 'chr2.bed.gz',
 'chr19.bed.gz',
 'chr22.bed.gz',
 'chr9.bed.gz',
 'chr17.bed.gz',
 'chr16.bed.gz',
 'chr4.bed.gz',
 'chr1.bed.gz',
 'chr15.bed.gz',
 'chr10.bed.gz',
 'chr8.bed.gz',
 'chr18.bed.gz',
 'chr13.bed.gz',
 'chrX.bed.gz',
 'chr7.bed.gz',
 'chr11.bed.gz',
 'chr14.bed.gz',
 'chr21.bed.gz']

In [5]:
### folder-name patterns, one per sample group
### (glob's "?" stands for exactly one character, so e.g. "Input?" does not match "Input")
samples = ["Input?", "Input?_20x", "TFX?_DMSO", "TFX?_Dex"]

### expand each pattern; sorting is done per pattern, so the groups keep the order above
fdirys = []
for sam in samples:
    fglob = os.path.join(FD_RES, "count_fragment", sam)
    lst   = sorted(glob.glob(fglob))
    fdirys.extend(lst)

### show the collected sample folders
for fdiry in fdirys:
    print(fdiry)

/mount/work/out/proj_combeffect/count_fragment/Input1
/mount/work/out/proj_combeffect/count_fragment/Input2
/mount/work/out/proj_combeffect/count_fragment/Input3
/mount/work/out/proj_combeffect/count_fragment/Input4
/mount/work/out/proj_combeffect/count_fragment/Input5
/mount/work/out/proj_combeffect/count_fragment/Input1_20x
/mount/work/out/proj_combeffect/count_fragment/Input2_20x
/mount/work/out/proj_combeffect/count_fragment/Input3_20x
/mount/work/out/proj_combeffect/count_fragment/Input4_20x
/mount/work/out/proj_combeffect/count_fragment/Input5_20x
/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX3_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX4_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX5_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX2_Dex
/mount/work/out/proj_combeffect/count_fragment/TFX3_Dex
/mount/work/out/proj_combeffect/count_fragment/TFX4_Dex
/mount/work/out/proj_combeffect/count_fragme

In [6]:
%%bash
# Peek at the first 10 records of the PER1 target file of sample Input1
FPATH="/mount/work/out/proj_combeffect/count_fragment/Input1/target_PER1.bed.gz"
zcat "${FPATH}" | head -n 10

chr17	8148117	8149012	1
chr17	8148122	8149107	1
chr17	8148178	8149194	1
chr17	8148188	8149154	1
chr17	8148190	8149151	1
chr17	8148220	8149108	1
chr17	8148280	8149232	1
chr17	8148401	8149372	1
chr17	8148548	8149585	1
chr17	8148913	8149953	1


## Test: read in the data

In [10]:
def prep_line(lst, sam):
    """Turn the fields of one split line into a (fragment, sample, count) row.

    Parameters
    ----------
    lst : list of str
        Fields of one line; the first three are joined with "_" to form the
        fragment ID and the last one is taken as the count.
    sam : str
        Sample name, passed through unchanged.

    Returns
    -------
    tuple
        (fragment, sam, count); count is returned exactly as found in `lst`
        (no numeric conversion is done here).
    """
    coords = lst[:3]
    return "_".join(coords), sam, lst[-1]

In [13]:
### first sample folder and its PER1 target file; the folder name doubles as sample name
fdiry = fdirys[0]
sam   = os.path.basename(fdiry)
fname = "target_PER1.bed.gz"
fpath = os.path.join(fdiry, fname)
print(fdiry, fpath, sep="\n")

### parse only the first few records as a smoke test
n_lines = 5
with gzip.open(fpath, "rb") as file:
    for line in it.islice(file, n_lines):
        # the file is opened in binary mode, so decode before splitting on tabs
        lst = line.decode('ASCII').strip().split('\t')
        print(prep_line(lst, sam))

/mount/work/out/proj_combeffect/count_fragment/Input1
/mount/work/out/proj_combeffect/count_fragment/Input1/target_PER1.bed.gz
('chr17_8148117_8149012', 'Input1', '1')
('chr17_8148122_8149107', 'Input1', '1')
('chr17_8148178_8149194', 'Input1', '1')
('chr17_8148188_8149154', 'Input1', '1')
('chr17_8148190_8149151', 'Input1', '1')


## Test: Generate table
```
Count Table
- sample
- fragment 
- count
```

**Query**

In [17]:
### drop the Count table so the notebook can be re-run from scratch
query_reset = "DROP TABLE IF EXISTS Count"

### one row per (fragment, sample) pair with its count
### NOTE(review): the Fragment and Sample tables referenced below are not created
### in this notebook -- presumably they are built elsewhere; confirm.
query_table = """CREATE TABLE IF NOT EXISTS Count (
    fragment TEXT, 
    sample   TEXT,
    count    INTEGER,
    FOREIGN KEY (fragment) REFERENCES Fragment (fragment),
    FOREIGN KEY (sample)   REFERENCES Sample   (sample)
);"""

### parameterized insert; values are bound in the order (fragment, sample, count)
### NOTE(review): the table above declares no PRIMARY KEY / UNIQUE / NOT NULL / CHECK
### constraint, so there appears to be nothing for OR IGNORE to skip -- confirm
### whether a uniqueness constraint on (fragment, sample) was intended.
query_insert = """INSERT OR IGNORE INTO Count
    (fragment, sample, count)
    VALUES 
    (?,?,?)"""

**Insert**

In [18]:
def prep_line(lst, sam):
    """Build the value tuple for one Count-table insert.

    `lst` holds the string fields of one split line: its first three fields
    are joined with "_" into the fragment ID and its last field is used as
    the count (returned as-is, without numeric conversion). `sam` is the
    sample name and is passed through unchanged.

    Returns the tuple (fragment, sam, count).
    """
    fragment_id = "_".join(lst[:3])
    row = (fragment_id, sam, lst[-1])
    return row

In [19]:
### database file that receives the Count table
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment.db"
fpath_db = os.path.join(fdiry, fname)

### test input: PER1 target file of the first sample folder
fdiry = fdirys[0]
fname = "target_PER1.bed.gz"
fpath_gz = os.path.join(fdiry, fname)
sam   = os.path.basename(fdiry)

### only a handful of rows are inserted in this test
n_lines = 5

conn = sqlite3.connect(fpath_db)
try:
    # `with conn` commits on success and rolls back on error, but it does not
    # close the connection -- hence the explicit close() in `finally`.
    # The gzip file is opened once here, from `fpath_gz` defined in this cell,
    # so the cell no longer depends on a path variable left over from an
    # earlier cell.
    with conn, gzip.open(fpath_gz, "rb") as file:
        cursor = conn.cursor()

        ### reset
        cursor.execute(query_reset)

        ### create table
        cursor.execute(query_table)

        ### insert values
        for line in it.islice(file, n_lines):
            lst = line.decode('ASCII').strip().split('\t')
            row = prep_line(lst, sam)
            cursor.execute(query_insert, row)

        ### show that the table is created
        cursor.execute("SELECT * FROM Count")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()

('chr17_8148117_8149012', 'Input1', 1)
('chr17_8148122_8149107', 'Input1', 1)
('chr17_8148178_8149194', 'Input1', 1)
('chr17_8148188_8149154', 'Input1', 1)
('chr17_8148190_8149151', 'Input1', 1)


## Insert the whole table

In [20]:
### sample groups written as glob patterns ("?" = any single character)
samples = "Input?,Input?_20x,TFX?_DMSO,TFX?_Dex".split(",")

### gather the matching folders group by group, alphabetically within each group
fdirys = list()
for sam in samples:
    fglob  = os.path.join(FD_RES, "count_fragment", sam)
    lst    = sorted(glob.glob(fglob))
    fdirys = fdirys + lst

### list every folder that will be loaded
for fdiry in fdirys:
    print(fdiry)

/mount/work/out/proj_combeffect/count_fragment/Input1
/mount/work/out/proj_combeffect/count_fragment/Input2
/mount/work/out/proj_combeffect/count_fragment/Input3
/mount/work/out/proj_combeffect/count_fragment/Input4
/mount/work/out/proj_combeffect/count_fragment/Input5
/mount/work/out/proj_combeffect/count_fragment/Input1_20x
/mount/work/out/proj_combeffect/count_fragment/Input2_20x
/mount/work/out/proj_combeffect/count_fragment/Input3_20x
/mount/work/out/proj_combeffect/count_fragment/Input4_20x
/mount/work/out/proj_combeffect/count_fragment/Input5_20x
/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX3_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX4_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX5_DMSO
/mount/work/out/proj_combeffect/count_fragment/TFX2_Dex
/mount/work/out/proj_combeffect/count_fragment/TFX3_Dex
/mount/work/out/proj_combeffect/count_fragment/TFX4_Dex
/mount/work/out/proj_combeffect/count_fragme

In [22]:
%%time

### database file that receives the Count table
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment.db"
fpath_db = os.path.join(fdiry, fname)

### every sample folder contributes its PER1 target file
fname = "target_PER1.bed.gz"

counter_tot = 0
conn = sqlite3.connect(fpath_db)
try:
    # `with conn` commits on success and rolls back on error, but it does not
    # close the connection -- hence the explicit close() in `finally`.
    with conn:
        cursor = conn.cursor()

        ### reset, then create the table once up front
        ### (not once per file; the table also exists when `fdirys` is empty)
        cursor.execute(query_reset)
        cursor.execute(query_table)

        for fdiry in fdirys:
            ### init, show progress
            fpath_gz = os.path.join(fdiry, fname)
            sam      = os.path.basename(fdiry)
            print(fpath_gz, flush=True)

            ### insert values
            ### `counter` counts the lines submitted to INSERT for this file
            counter = 0
            with gzip.open(fpath_gz, "rb") as file:
                for line in file:
                    lst = line.decode('ASCII').strip().split('\t')
                    row = prep_line(lst, sam)
                    cursor.execute(query_insert, row)
                    counter += 1

            counter_tot += counter
            print("#Rows Inserted:", counter, flush=True)
finally:
    conn.close()

print()
print("#Rows Total:", counter_tot)

/mount/work/out/proj_combeffect/count_fragment/Input1/target_PER1.bed.gz
#Rows Inserted: 168
/mount/work/out/proj_combeffect/count_fragment/Input2/target_PER1.bed.gz
#Rows Inserted: 190
/mount/work/out/proj_combeffect/count_fragment/Input3/target_PER1.bed.gz
#Rows Inserted: 258
/mount/work/out/proj_combeffect/count_fragment/Input4/target_PER1.bed.gz
#Rows Inserted: 177
/mount/work/out/proj_combeffect/count_fragment/Input5/target_PER1.bed.gz
#Rows Inserted: 127
/mount/work/out/proj_combeffect/count_fragment/Input1_20x/target_PER1.bed.gz
#Rows Inserted: 3029
/mount/work/out/proj_combeffect/count_fragment/Input2_20x/target_PER1.bed.gz
#Rows Inserted: 2933
/mount/work/out/proj_combeffect/count_fragment/Input3_20x/target_PER1.bed.gz
#Rows Inserted: 2964
/mount/work/out/proj_combeffect/count_fragment/Input4_20x/target_PER1.bed.gz
#Rows Inserted: 3272
/mount/work/out/proj_combeffect/count_fragment/Input5_20x/target_PER1.bed.gz
#Rows Inserted: 2812
/mount/work/out/proj_combeffect/count_fragmen

**Check**

In [24]:
### sanity check: number of rows now stored in the Count table
with sqlite3.connect(fpath_db) as conn:
    query  = "select count(*) from Count"
    cursor = conn.execute(query)
    print(cursor.fetchall())

[(24486,)]
