# Profile read file function

**Set environment**

In [1]:
#######################################################
### Set environment
###++++++++++++++++++++++++++++++++++++++++++++++++++++

### import common packages
import numpy  as np
import itertools as it
import sys, os, gzip
from   functools import reduce

### update print
from functools import partial
print = partial(print, flush=True)

### set working directories
FD_RES = "/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect"

### import specific packages
import sqlite3
# https://stackoverflow.com/questions/49456158/integer-in-python-pandas-becomes-blob-binary-in-sqlite
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))


### import packages for benchmark performance
import cProfile, pstats, time, timeit
import matplotlib.pyplot as plt

**Global variables of database and file**

In [2]:
#######################################################
### parse arguments
###++++++++++++++++++++++++++++++++++++++++++++++++++++
CHROM   = "chr17"                          #args.chrom
FD_OUT  = os.path.join(FD_RES, "database") #args.fout
FD_INP  = os.path.join(FD_RES, "nuc")      #args.finp
PREFIX  = "test_insert"                    #args.prefix
VERBOSE = True                             #args.verbose

#######################################################
### Global varialbes and I/O
###++++++++++++++++++++++++++++++++++++++++++++++++++++

### file path of fragment database
fdiry  = FD_OUT
fname  = f"{PREFIX}_{CHROM}.db"
FP_DTB = os.path.join(fdiry, fname)

### file path of fragment table
sample  = "Input1_20x"
fdiry   = os.path.join(FD_INP, sample)
fname   = "chr17.bed.gz"
FP_FRG  = os.path.join(fdiry, fname)

### show info
if (VERBOSE):
    print("Global variables:")
    print("Chromsome:  ", CHROM)
    print("Database:   ", FP_DTB)
    print("Input file: ", FP_FRG)
    print()

Global variables:
Chromsome:   chr17
Database:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/database/test_insert_chr17.db
Input file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/nuc/Input1_20x/chr17.bed.gz



## Setup helper function for reading files

In [3]:
##################################################
### Helper functions
### ++++++++++++++++++++++++++++++++++++++++++++++

### helper function to get a chunk of file
def get_chunks(gen, rows=10):
    """Divides the data into chunks with size as rows"""
    iterable = iter(gen)
    while True:
        x = list(it.islice(iterable, rows))
        if not x:
            return
        yield x

### helper function to process each row
def prep_line(line):
    """Function to process each line"""
    ### Decode
    lst = line.decode('ASCII').strip().split('\t') 

    ### parse info
    key = "_".join(lst[0:3])
    val = lst[0:3] + lst[4:-1]
    return [key] + val

def gen_line(file, n_chunksize=None, n_lines=None):
    """generate lines or chunks of lines from the file"""
    ### remove file header
    header = file.readline()
    lines  = file
    
    ### preprocess each line
    fun = prep_line
    gen = map(fun, lines)
    
    ### set number of lines read if specified
    if n_lines is not None:
        gen = it.islice(gen, n_lines)
    
    ### set chunks if specified
    if n_chunksize is not None:
        gen = get_chunks(gen, n_chunksize)

    return gen

## Test the helper functions

In [4]:
fpath = FP_FRG
print(fpath)

/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/nuc/Input1_20x/chr17.bed.gz


In [5]:
with gzip.open(fpath, "rb") as file:
    lines = gen_line(file, n_lines=3)
    for line in lines:
        print(line)

['chr17_107410_108464', 'chr17', '107410', '108464', '0.512334', '0.487666', '342', '243', '271', '198', '0', '0']
['chr17_159026_160040', 'chr17', '159026', '160040', '0.506903', '0.493097', '286', '259', '241', '228', '0', '0']
['chr17_159426_160303', 'chr17', '159426', '160303', '0.491448', '0.508552', '224', '247', '199', '207', '0', '0']


In [6]:
with gzip.open(fpath, "rb") as file:
    chunks = gen_line(file, n_chunksize=3, n_lines=10)
    for chunk in chunks:
        for line in chunk:
            print(line)
        print("++++++++++++++++++++++++++++++")

['chr17_107410_108464', 'chr17', '107410', '108464', '0.512334', '0.487666', '342', '243', '271', '198', '0', '0']
['chr17_159026_160040', 'chr17', '159026', '160040', '0.506903', '0.493097', '286', '259', '241', '228', '0', '0']
['chr17_159426_160303', 'chr17', '159426', '160303', '0.491448', '0.508552', '224', '247', '199', '207', '0', '0']
++++++++++++++++++++++++++++++
['chr17_159510_160362', 'chr17', '159510', '160362', '0.497653', '0.502347', '222', '231', '197', '202', '0', '0']
['chr17_159977_160849', 'chr17', '159977', '160849', '0.458716', '0.541284', '198', '229', '243', '202', '0', '0']
['chr17_174938_176057', 'chr17', '174938', '176057', '0.468275', '0.531725', '294', '300', '295', '230', '0', '0']
++++++++++++++++++++++++++++++
['chr17_175103_176248', 'chr17', '175103', '176248', '0.468122', '0.531878', '282', '308', '301', '254', '0', '0']
['chr17_177613_178563', 'chr17', '177613', '178563', '0.454737', '0.545263', '221', '256', '262', '211', '0', '0']
['chr17_180289_181

## Profiling

In [7]:
def read_null(fpath, n_lines):
    pass

In [8]:
def read_line_by_line(fpath, n_lines):
    with gzip.open(fpath, "rb") as file:
        lines = gen_line(file, n_lines=3)
        for line in lines:
            print(line)

In [9]:
def read_line_by_chunk(fpath, n_lines, n_chunksize):
    with gzip.open(fpath, "rb") as file:
        chunks = gen_line(file, n_chunksize=3, n_lines=10)
        for chunk in chunks:
            for line in chunk:
                print(line)
            print("++++++++++++++++++++++++++++++")

In [10]:
n_lines = 10
n_chunksize=3

In [11]:
### start the profiler
pr = cProfile.Profile()
pr.enable()

### execution
read_null(fpath, n_lines)

### end the profiler
pr.disable()
ps = pstats.Stats(pr).print_stats()

         48 function calls in 0.000 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 {built-in method builtins.compile}
        2    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
        4    0.000    0.000    0.000    0.000 {built-in method builtins.getattr}
        4    0.000    0.000    0.000    0.000 {built-in method builtins.next}
        1    0.000    0.000    0.000    0.000 /tmp/ipykernel_13762/1932013334.py:6(<module>)
        1    0.000    0.000    0.000    0.000 /tmp/ipykernel_13762/1932013334.py:9(<module>)
        1    0.000    0.000    0.000    0.000 /tmp/ipykernel_13762/1439239155.py:1(read_null)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        2    0.000    0.000    0.000    0.000 /gpfs/fs1/data/reddylab/Kuei/venv/env_jupyter/lib/python3.7/site-packages/IPython/core/hooks.py:103(__call__)


In [12]:
### start the profiler
pr = cProfile.Profile()
pr.enable()

### execution
read_line_by_line(fpath, n_lines)

### end the profiler
pr.disable()
ps = pstats.Stats(pr).print_stats()

['chr17_107410_108464', 'chr17', '107410', '108464', '0.512334', '0.487666', '342', '243', '271', '198', '0', '0']
['chr17_159026_160040', 'chr17', '159026', '160040', '0.506903', '0.493097', '286', '259', '241', '228', '0', '0']
['chr17_159426_160303', 'chr17', '159426', '160303', '0.491448', '0.508552', '224', '247', '199', '207', '0', '0']
         358 function calls in 0.017 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 {method 'close' of '_io.BufferedReader' objects}
        3    0.010    0.003    0.010    0.003 {method 'read' of '_io.BufferedReader' objects}
        4    0.000    0.000    0.011    0.003 {method 'readline' of '_io.BufferedReader' objects}
        1    0.000    0.000    0.000    0.000 {built-in method io.open}
       30    0.005    0.000    0.005    0.000 {method 'acquire' of '_thread.lock' objects}
        3    0.000    0.000    0.000    0.000 {method

In [13]:
### start the profiler
pr = cProfile.Profile()
pr.enable()

### execution
read_line_by_chunk(fpath, n_lines, n_chunksize)

### end the profiler
pr.disable()
ps = pstats.Stats(pr).print_stats()

['chr17_107410_108464', 'chr17', '107410', '108464', '0.512334', '0.487666', '342', '243', '271', '198', '0', '0']
['chr17_159026_160040', 'chr17', '159026', '160040', '0.506903', '0.493097', '286', '259', '241', '228', '0', '0']
['chr17_159426_160303', 'chr17', '159426', '160303', '0.491448', '0.508552', '224', '247', '199', '207', '0', '0']
++++++++++++++++++++++++++++++
['chr17_159510_160362', 'chr17', '159510', '160362', '0.497653', '0.502347', '222', '231', '197', '202', '0', '0']
['chr17_159977_160849', 'chr17', '159977', '160849', '0.458716', '0.541284', '198', '229', '243', '202', '0', '0']
['chr17_174938_176057', 'chr17', '174938', '176057', '0.468275', '0.531725', '294', '300', '295', '230', '0', '0']
++++++++++++++++++++++++++++++
['chr17_175103_176248', 'chr17', '175103', '176248', '0.468122', '0.531878', '282', '308', '301', '254', '0', '0']
['chr17_177613_178563', 'chr17', '177613', '178563', '0.454737', '0.545263', '221', '256', '262', '211', '0', '0']
['chr17_180289_181