# Database: motif table

**Set Environment**

In [26]:
### basic
import sys
sys.path.append('../')
from config_sing import *

### specific tools
import itertools as it
import sqlite3
# sqlite3 stores numpy integers as BLOBs unless they are adapted to plain int:
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

### global variables
# Location of the SQLite database file used throughout this notebook
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment_chr17.db"
FPATH_DB = os.path.join(fdiry, fname)

**Check data location**

In [3]:
# Show what is stored under the annotation directory
fdiry = FD_ANN
os.listdir(path=fdiry)

['motif_cluster_jvierstra', 'genome', '.ipynb_checkpoints', 'log']

In [4]:
# Find every entry of the archetype motif annotation whose name starts with chr17
fglob = os.path.join(
    FD_ANN,
    "motif_cluster_jvierstra",
    "hg38_archetype_motifs_v1",
    "chr17*",
)
glob.glob(fglob)

['/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_KI270730v1_random.bed.gz',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_KI270729v1_random.bed.gz',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_GL000205v2_random.bed.gz',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17.bed.gz',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse.bed.gz',
 '/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse_merge.bed.gz']

In [5]:
# Full path of the chr17 motif file inspected in the following cells
fname = "chr17_rm_mouse_merge.bed.gz"
fdiry = os.path.join(FD_ANN, "motif_cluster_jvierstra", "hg38_archetype_motifs_v1")
fpath = os.path.join(fdiry, fname)
print(fpath)

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse_merge.bed.gz


In [9]:
%%script env FPATH="$fpath" bash
# Preview the first records of the gzipped file. FPATH is quoted so that a
# path containing spaces or glob characters reaches zcat as one argument.
zcat "${FPATH}" | head

chr17	60004	60022	ZNF140	5.6897
chr17	60004	60022	ZNF667	8.024
chr17	60006	60015	Ebox/CAGCTG	7.9275
chr17	60011	60031	GC-tract	12.122
chr17	60012	60025	PRDM4	1.3083
chr17	60017	60028	NR/19	9.668
chr17	60019	60035	HEN1	5.5854
chr17	60023	60042	ZNF680	6.3901
chr17	60027	60037	SMARCA1	7.5566
chr17	60027	60040	LEF1	7.1402


## Test read motif data

In [10]:
def prep_line(lst):
    """Prepend a row key to a list of fields.

    The key is every field except the last one, joined with "_".

    Parameters
    ----------
    lst : list of str
        Fields of one record (e.g. a tab-split line).

    Returns
    -------
    list of str
        ``[key] + lst``; the input list itself is not modified.
    """
    key_fields = lst[:-1]
    return ["_".join(key_fields)] + lst

In [11]:
fname = "chr17_rm_mouse_merge.bed.gz"
fdiry = os.path.join(FD_ANN, "motif_cluster_jvierstra", "hg38_archetype_motifs_v1")
fpath = os.path.join(fdiry, fname)

# Number of leading records to preview (a later cell reads this value too)
n_lines = 10
with gzip.open(fpath, "rb") as file:
    for raw in it.islice(file, n_lines):
        # bytes -> str, drop surrounding whitespace/newline, split into tab-separated fields
        lst = raw.decode('ASCII').strip().split('\t')
        print(lst)
        print(prep_line(lst))

['chr17', '60004', '60022', 'ZNF140', '5.6897']
['chr17_60004_60022_ZNF140', 'chr17', '60004', '60022', 'ZNF140', '5.6897']
['chr17', '60004', '60022', 'ZNF667', '8.024']
['chr17_60004_60022_ZNF667', 'chr17', '60004', '60022', 'ZNF667', '8.024']
['chr17', '60006', '60015', 'Ebox/CAGCTG', '7.9275']
['chr17_60006_60015_Ebox/CAGCTG', 'chr17', '60006', '60015', 'Ebox/CAGCTG', '7.9275']
['chr17', '60011', '60031', 'GC-tract', '12.122']
['chr17_60011_60031_GC-tract', 'chr17', '60011', '60031', 'GC-tract', '12.122']
['chr17', '60012', '60025', 'PRDM4', '1.3083']
['chr17_60012_60025_PRDM4', 'chr17', '60012', '60025', 'PRDM4', '1.3083']
['chr17', '60017', '60028', 'NR/19', '9.668']
['chr17_60017_60028_NR/19', 'chr17', '60017', '60028', 'NR/19', '9.668']
['chr17', '60019', '60035', 'HEN1', '5.5854']
['chr17_60019_60035_HEN1', 'chr17', '60019', '60035', 'HEN1', '5.5854']
['chr17', '60023', '60042', 'ZNF680', '6.3901']
['chr17_60023_60042_ZNF680', 'chr17', '60023', '60042', 'ZNF680', '6.3901']
['c

## Create motif table

In [19]:
# Statements that remove a previously created table / index so the build can be re-run
query_reset_table = "DROP TABLE IF EXISTS Motif"
query_reset_index = "DROP INDEX IF EXISTS idx_motif_loc"

# One row per record; `binding` is the unique text key of the row
query_table = """
    CREATE TABLE IF NOT EXISTS Motif(
        binding TEXT PRIMARY KEY, 
        chrom   TEXT,
        start   INTEGER,
        end     INTEGER,
        motif   TEXT,
        score   REAL
    );"""

# IF NOT EXISTS keeps index creation idempotent, consistent with the
# CREATE TABLE statement above (re-running it no longer raises).
query_index  = """CREATE INDEX IF NOT EXISTS idx_motif_loc ON Motif (chrom, start, end)"""

# OR IGNORE: a row whose `binding` key already exists is skipped silently
query_insert = """
    INSERT OR IGNORE INTO Motif 
        (binding,chrom,start,end,motif,score)
    VALUES 
        (?,?,?,?,?,?)
    """

In [21]:
fdiry = os.path.join(FD_ANN, "motif_cluster_jvierstra", "hg38_archetype_motifs_v1")
fname = "chr17_rm_mouse_merge.bed.gz"
fpath_gz = os.path.join(fdiry, fname)
fpath_db = FPATH_DB

# `with conn:` only commits (or rolls back on error) the transaction; it does
# not close the connection, so the connection is closed explicitly in `finally`.
conn = sqlite3.connect(fpath_db)
try:
    with conn, gzip.open(fpath_gz, "rb") as file:
        cursor = conn.cursor()

        ### reset
        cursor.execute(query_reset_table)
        cursor.execute(query_reset_index)

        ### create table
        cursor.execute(query_table)

        ### create index
        cursor.execute(query_index)

        ### insert values (only the first n_lines records, as a test)
        rows = (
            prep_line(line.decode('ASCII').strip().split('\t'))
            for line in it.islice(file, n_lines)
        )
        cursor.executemany(query_insert, rows)

        ### show that the table is created
        cursor.execute("SELECT * FROM Motif")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()

('chr17_60004_60022_ZNF140', 'chr17', 60004, 60022, 'ZNF140', 5.6897)
('chr17_60004_60022_ZNF667', 'chr17', 60004, 60022, 'ZNF667', 8.024)
('chr17_60006_60015_Ebox/CAGCTG', 'chr17', 60006, 60015, 'Ebox/CAGCTG', 7.9275)
('chr17_60011_60031_GC-tract', 'chr17', 60011, 60031, 'GC-tract', 12.122)
('chr17_60012_60025_PRDM4', 'chr17', 60012, 60025, 'PRDM4', 1.3083)
('chr17_60017_60028_NR/19', 'chr17', 60017, 60028, 'NR/19', 9.668)
('chr17_60019_60035_HEN1', 'chr17', 60019, 60035, 'HEN1', 5.5854)
('chr17_60023_60042_ZNF680', 'chr17', 60023, 60042, 'ZNF680', 6.3901)
('chr17_60027_60037_SMARCA1', 'chr17', 60027, 60037, 'SMARCA1', 7.5566)
('chr17_60027_60040_LEF1', 'chr17', 60027, 60040, 'LEF1', 7.1402)


## Insert the whole file

In [22]:
%%time

fdiry = os.path.join(FD_ANN, "motif_cluster_jvierstra", "hg38_archetype_motifs_v1")
fname = "chr17_rm_mouse_merge.bed.gz"
fpath_gz = os.path.join(fdiry, fname)
fpath_db = FPATH_DB

with sqlite3.connect(fpath_db) as conn, gzip.open(fpath_gz, "rb") as file:
    ### reset
    cursor = conn.cursor()
    query  = query_reset_table
    cursor.execute(query)
    query  = query_reset_index
    cursor.execute(query)
    
    ### create table
    cursor = conn.cursor()
    query  = query_table
    cursor.execute(query)
    
    ### create index
    cursor = conn.cursor()
    query  = query_index
    cursor.execute(query)
    
    ### insert values
    query  = query_insert
    lines = file #it.islice(file, n_lines)
    for line in lines:
        lst = line.decode('ASCII').strip().split('\t')  
        lst = prep_line(lst)
        cursor.execute(query, lst)

CPU times: user 2min 17s, sys: 5.75 s, total: 2min 23s
Wall time: 2min 42s


**Check: the number of rows in the `Motif` table should equal the number of lines in the source file**

In [23]:
# Full path of the source file whose line count is compared with the table
fname = "chr17_rm_mouse_merge.bed.gz"
fdiry = os.path.join(FD_ANN, "motif_cluster_jvierstra", "hg38_archetype_motifs_v1")
fpath = os.path.join(fdiry, fname)
print(fpath)

/mount/work/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse_merge.bed.gz


In [24]:
%%script env FPATH="$fpath" bash
# Count the records in the gzipped file. FPATH is quoted so that a path
# containing spaces or glob characters reaches zcat as one argument.
zcat "${FPATH}" | wc -l

14308212


In [25]:
fpath_db = FPATH_DB

# `with conn:` does not close the connection (it only manages the
# transaction), so close it explicitly once the count has been read.
conn = sqlite3.connect(fpath_db)
try:
    with conn:
        cursor = conn.cursor()
        query = "select count(*) from Motif"
        cursor.execute(query)
        print(cursor.fetchall())
finally:
    conn.close()

[(14308212,)]
