**Set environment**

In [1]:
import sys
sys.path.append('../')
from config_sing import *
show_env()

You are in: Singularity | singularity_proj_combeffect
    BASE DIRECTORY:     /mount/work
    PATH OF SOURCE:     /mount/work/source
    PATH OF EXECUTABLE: /mount/work/exe
    PATH OF ANNOTATION: /mount/work/annotation
    PATH OF PROJECT:    /mount/project
    PATH OF RESULTS:    /mount/work/out/proj_combeffect

Library imported:
    numpy, pandas, matplotlib.pyplot
    os, sys, time, gzip, glob



In [10]:
import sqlite3
import itertools as it
from collections import defaultdict
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import make_axes_locatable
plt.style.use("seaborn-whitegrid")

import math
import random
from sklearn.manifold import TSNE
from functools import partial, reduce
print = partial(print, flush=True)

### Set Samples
fun = np.core.defchararray.add
idx = np.arange(1,6).astype("str")

INPUT    = reduce(fun, ["Input", idx])
INPUT20X = reduce(fun, ["Input", idx,     "_20x"])
TFX_DMSO = reduce(fun, ["TFX",   idx[1:], "_DMSO"])
TFX_DEX  = reduce(fun, ["TFX",   idx[1:], "_Dex"])
SAMPLES  = np.concatenate([INPUT20X, TFX_DMSO, TFX_DEX])
GROUPS   = ["Input", "Input_20x", "TFX_DMSO", "TFX_Dex"]

### file path of database
fdiry = os.path.join(FD_RES, 'database')
fname = "fragment_chr17.db"
FPATH_DB = os.path.join(fdiry, fname)

## Import library size

In [3]:
fdiry = os.path.join(FD_RES, "source")
fname = "library_size.tsv"
fpath = os.path.join(fdiry, fname)

dat = pd.read_csv(fpath, sep="\t")
dat = dat.assign(SizeM = lambda x: x.Size / 10**6)
dat

Unnamed: 0,Sample,Group,Size,SizeM
0,Input1_20x,Input_20x,371718546,371.718546
1,Input1,Input,18666630,18.66663
2,Input2_20x,Input_20x,347635732,347.635732
3,Input2,Input,20167924,20.167924
4,Input3_20x,Input_20x,349994051,349.994051
5,Input3,Input,23280988,23.280988
6,Input4_20x,Input_20x,413508358,413.508358
7,Input4,Input,19003938,19.003938
8,Input5_20x,Input_20x,341110487,341.110487
9,Input5,Input,15325016,15.325016


## Import fragments

In [4]:
def prep_line(line):
    cnames = (
        "Frag_Chrom", 
        "Frag_Start", 
        "Frag_End", 
        "Frag_Count", 
        "Region_Chrom", 
        "Region_Start", 
        "Region_End",
        "Region_Name", 
        "Region_Score",
        "Region_Strand",
        "Overlap")
    lst = line.decode('ASCII').strip().split('\t')  
    dct = dict(zip(cnames, lst))
    fragment = "_".join([dct["Frag_Chrom"],   dct["Frag_Start"],   dct["Frag_End"]]) 
    region   = "_".join([dct["Region_Chrom"], dct["Region_Start"], dct["Region_End"]]) 
    return fragment, region

### helper function to get a chunk of file
def get_chunks(gen, rows=10):
    """Divides the data into #rows in each list"""
    iterable = iter(gen)
    while True:
        x = list(it.islice(iterable, rows))
        if not x:
            return
        yield x

In [72]:
fdiry = os.path.join(FD_RES, "count_fragment", "TFX2_DMSO")
fname = "region_dex_GR_P300_dnase_chr17.bed.gz"
fpath = os.path.join(fdiry, fname)
print(fpath)

frgs = []
with gzip.open(fpath, 'rb') as file:
    chunks = get_chunks(file)
    chunk  = next(chunks)
    for line in chunk:
        print(line)
        frg, reg = prep_line(line)
        frgs.append(frg)
        
frgs

/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
b'chr17\t6001570\t6002624\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001571\t6002624\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t63\n'
b'chr17\t6001762\t6002691\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t130\n'
b'chr17\t6001762\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001763\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001765\t6002692\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t131\n'
b'chr17\t6001969\t6002895\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t334\n'
b'chr17\t6001969\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6001970\t6002896\t1\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t335\n'
b'chr17\t6002054\t6002943\t2\tchr17\t6002561\t6003866\tchr17:6002561-6003866\t.\t.\t382\

['chr17_6001570_6002624',
 'chr17_6001571_6002624',
 'chr17_6001762_6002691',
 'chr17_6001762_6002692',
 'chr17_6001763_6002692',
 'chr17_6001765_6002692',
 'chr17_6001969_6002895',
 'chr17_6001969_6002896',
 'chr17_6001970_6002896',
 'chr17_6002054_6002943']

In [73]:
%%bash
FPATH=/mount/work/out/proj_combeffect/count_fragment/TFX2_DMSO/region_dex_GR_P300_dnase_chr17.bed.gz
zcat ${FPATH} | wc -l 

3452


## Annotation: get depth

In [64]:
### get locations from 
def get_locs(frg):
    chrom, start, end = frg.split("_")
    loc = (int(start) + int(end)) // 2
    return loc

def get_depth(frgs, fpath_db = FPATH_DB):
    """get"""
    ### get location
    locs = list(map(get_locs, frgs))
    
    ### set query
    txt   = ', '.join('?' for _ in locs)
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, locs)
    
    ### generate each row
    for row in cursor:
        grp = "loc", "sam", "depth", "trt", "size"
        dct = {k:v for k, v in zip(grp, row)}
        yield dct

In [65]:
frgs

['chr17_6001570_6002624',
 'chr17_6001571_6002624',
 'chr17_6001762_6002691',
 'chr17_6001762_6002692',
 'chr17_6001763_6002692',
 'chr17_6001765_6002692',
 'chr17_6001969_6002895',
 'chr17_6001969_6002896',
 'chr17_6001970_6002896',
 'chr17_6002054_6002943']

In [88]:
gen = get_depth(frgs)
dct = defaultdict(lambda: {k:0 for k in GROUPS})

for row in gen:
    loc, trt, val, size = row["loc"], row["trt"], row["depth"], row["size"]
    cpm = float(val) * (10**6) / float(size)
    dct[loc][trt] += cpm
    #print(loc, trt, val, size)

tmp = dict()
for loc, vals in dct.items():
    print(loc)
    print(vals)
    tmp[loc] = {key:np.round(val, decimals=3) for key, val in vals.items()}
tmp

6002097
{'Input': 4.131060293685776, 'Input_20x': 4.8790073424684, 'TFX_DMSO': 2.4135642891968883, 'TFX_Dex': 2.250582974942897}
6002226
{'Input': 4.794910003920724, 'Input_20x': 5.518106426207691, 'TFX_DMSO': 2.871054199327967, 'TFX_Dex': 2.536841220529087}
6002227
{'Input': 4.794910003920724, 'Input_20x': 5.515249234786458, 'TFX_DMSO': 2.871054199327967, 'TFX_Dex': 2.536841220529087}
6002228
{'Input': 4.794910003920724, 'Input_20x': 5.515249234786458, 'TFX_DMSO': 2.871054199327967, 'TFX_Dex': 2.536841220529087}
6002432
{'Input': 5.96962273353042, 'Input_20x': 6.360210894073804, 'TFX_DMSO': 2.891257549734757, 'TFX_Dex': 5.238523085451046}
6002433
{'Input': 5.96962273353042, 'Input_20x': 6.354347691163833, 'TFX_DMSO': 2.891257549734757, 'TFX_Dex': 5.216503220405955}
6002498
{'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}


{6002097: {'Input': 4.131,
  'Input_20x': 4.879,
  'TFX_DMSO': 2.414,
  'TFX_Dex': 2.251},
 6002226: {'Input': 4.795,
  'Input_20x': 5.518,
  'TFX_DMSO': 2.871,
  'TFX_Dex': 2.537},
 6002227: {'Input': 4.795,
  'Input_20x': 5.515,
  'TFX_DMSO': 2.871,
  'TFX_Dex': 2.537},
 6002228: {'Input': 4.795,
  'Input_20x': 5.515,
  'TFX_DMSO': 2.871,
  'TFX_Dex': 2.537},
 6002432: {'Input': 5.97,
  'Input_20x': 6.36,
  'TFX_DMSO': 2.891,
  'TFX_Dex': 5.239},
 6002433: {'Input': 5.97,
  'Input_20x': 6.354,
  'TFX_DMSO': 2.891,
  'TFX_Dex': 5.217},
 6002498: {'Input': 6.134,
  'Input_20x': 6.45,
  'TFX_DMSO': 2.598,
  'TFX_Dex': 6.497}}

In [81]:
%%time
fun  = get_locs
locs = map(fun, frgs)
for loc, frg in zip(locs, frgs):
    print(frg, loc, dct[k])

chr17_6001570_6002624 6002097 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001571_6002624 6002097 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001762_6002691 6002226 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001762_6002692 6002227 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001763_6002692 6002227 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001765_6002692 6002228 {'Input': 6.134042891136207, 'Input_20x': 6.450315561502998, 'TFX_DMSO': 2.5982076753057246, 'TFX_Dex': 6.496798374225601}
chr17_6001969_6002895 6002432 {'Input': 6.134042891136207, 'Input_20x': 6.45031556

## Annotation: GC content

In [37]:
def get_pct(frgs, fpath_db=FPATH_DB):
    """get GC content of the given fragments"""
    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        ORDER BY Frg.fragment
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, frgs)
    
    ### generate each row
    for row in cursor:
        yield row

In [38]:
gen = get_pct(frgs)
list(gen)

[('chr17_6001570_6002624', 0.541746),
 ('chr17_6001571_6002624', 0.54226),
 ('chr17_6001762_6002691', 0.528525),
 ('chr17_6001762_6002692', 0.527957),
 ('chr17_6001763_6002692', 0.528525),
 ('chr17_6001765_6002692', 0.528587),
 ('chr17_6001969_6002895', 0.534557),
 ('chr17_6001969_6002896', 0.533981),
 ('chr17_6001970_6002896', 0.533477),
 ('chr17_6002054_6002943', 0.524184)]

## Annotation: motif

In [43]:
def get_annot(frgs, fpath_db=FPATH_DB):
    """get annotation from the given fragments"""
    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """
    
    ## query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()    
        cursor = cursor.execute(query, frgs)
        
    ### summarize the motif annotation scores
    dct_ann_count = defaultdict(lambda: defaultdict(lambda: 0))
    dct_ann_score = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for row in cursor:
        ### parse info
        frg, mtf, val = row
        
        ### count and sum the annotation scores
        dct_ann_count[frg][mtf] += 1
        dct_ann_score[frg][mtf] += val
    
    ### arrange and return
    dct_ann = dict()
    dct_ann["count"] = dct_ann_count
    dct_ann["score"] = dct_ann_score
    return dct_ann

In [57]:
frg = frgs[0]
dct_tot = get_annot([frg])

dct_cnt = dct_tot["count"]
dct_cnt = {k:dict(v) for k,v in dct_cnt.items()}
dct_ann = dct_tot["score"]
dct_ann = {k:dict(v) for k,v in dct_ann.items()}

In [54]:
key = frg
val = str(dict(dct_cnt[key]))
val

"{'HD/23': 1, 'SOX/1': 2, 'EVI1/MECOM': 1, 'ZNF768': 1, 'NFY': 1, 'EBF1': 1, 'GC-tract': 8, 'ZNF320': 2, 'CTCF': 2, 'KLF/SP/2': 9, 'INSM1': 1, 'ETS/2': 4, 'ZBTB48': 1, 'Ebox/CAGATGG': 2, 'HEN1': 4, 'OSR2': 2, 'Ebox/CAGCTG': 3, 'NR/3': 5, 'NR/15': 2, 'KLF/SP/1': 3, 'GLI': 1, 'ZNF53': 2, 'TFAP2/1': 4, 'NR/16': 2, 'PRDM14': 1, 'ZIC': 1, 'Ebox/CACCTG': 1, 'IRF/2': 4, 'ZIM3': 1, 'ZNF354': 2, 'YY1': 3, 'HD/14': 2, 'RFX/1': 2, 'ZNF549': 2, 'FEZF1': 2, 'GCM': 1, 'SIX/1': 1, 'HIC/1': 3, 'P53-like/1': 2, 'ZNF449': 2, 'ZNF143': 3, 'NR/17': 5, 'TBX/4': 2, 'AP1/1': 2, 'SMAD': 4, 'PAX/2': 2, 'MAF': 1, 'HD/20': 3, 'HD/22': 4, 'TBX/3': 2, 'ZFX': 3, 'PLAG1': 1, 'ZNF324': 2, 'AP1/2': 2, 'ZNF554': 3, 'NR/18': 1, 'NFKB/2': 1, 'Ebox/CACGTG/1': 3, 'SREBF1': 2, 'GRHL': 1, 'ZFN121': 2, 'NR/19': 2, 'MEF2': 1, 'FOX/5': 1, 'HD/2': 3, 'EGR': 1, 'REST/NRSF': 1, 'ZNF335': 1, 'PAX/1': 1, 'POU/3': 1, 'ZNF257': 1, 'E2F/2': 3, 'MFZ1': 2, 'HINFP1/1': 1, 'GFI': 1, 'ZNF146': 1, 'E2F/1': 1, 'CCAAT/CEBP': 1, 'ZNF140': 1, 'Z

In [56]:
import json

In [58]:
key = frg
val = dct_ann[frg]

In [60]:
tmp = json.dumps(val)
tmp

'{"HD/23": 10.0246, "SOX/1": 15.424949999999999, "EVI1/MECOM": 7.0973, "ZNF768": 7.0552, "NFY": 7.4972, "EBF1": 8.399, "GC-tract": 78.256257774, "ZNF320": 17.3581, "CTCF": 14.9269, "KLF/SP/2": 83.21311666999999, "INSM1": 5.6777, "ETS/2": 33.2254, "ZBTB48": 8.3812, "Ebox/CAGATGG": 16.7208, "HEN1": 25.345899999999997, "OSR2": 16.9187, "Ebox/CAGCTG": 26.96235, "NR/3": 43.372, "NR/15": 16.793799999999997, "KLF/SP/1": 29.400049999999997, "GLI": 8.4084, "ZNF53": 14.709, "TFAP2/1": 33.9674, "NR/16": 18.346249999999998, "PRDM14": 9.5001, "ZIC": 9.4232, "Ebox/CACCTG": 9.4328, "IRF/2": 28.356849999999998, "ZIM3": 7.9898, "ZNF354": 14.3681, "YY1": 23.2647, "HD/14": 15.2789, "RFX/1": 14.2249, "ZNF549": 13.8719, "FEZF1": 15.210899999999999, "GCM": 7.80295, "SIX/1": 8.5294, "HIC/1": 24.9856, "P53-like/1": 18.1449, "ZNF449": 20.6387, "ZNF143": 21.957099999999997, "NR/17": 35.9963, "TBX/4": 8.9748, "AP1/1": 15.593, "SMAD": 36.3176, "PAX/2": 18.354, "MAF": 8.3036, "HD/20": 21.9543, "HD/22": 33.28740000

In [61]:
tmp = json.dumps(val)
tmp = json.loads(tmp)
tmp

{'HD/23': 10.0246,
 'SOX/1': 15.424949999999999,
 'EVI1/MECOM': 7.0973,
 'ZNF768': 7.0552,
 'NFY': 7.4972,
 'EBF1': 8.399,
 'GC-tract': 78.256257774,
 'ZNF320': 17.3581,
 'CTCF': 14.9269,
 'KLF/SP/2': 83.21311666999999,
 'INSM1': 5.6777,
 'ETS/2': 33.2254,
 'ZBTB48': 8.3812,
 'Ebox/CAGATGG': 16.7208,
 'HEN1': 25.345899999999997,
 'OSR2': 16.9187,
 'Ebox/CAGCTG': 26.96235,
 'NR/3': 43.372,
 'NR/15': 16.793799999999997,
 'KLF/SP/1': 29.400049999999997,
 'GLI': 8.4084,
 'ZNF53': 14.709,
 'TFAP2/1': 33.9674,
 'NR/16': 18.346249999999998,
 'PRDM14': 9.5001,
 'ZIC': 9.4232,
 'Ebox/CACCTG': 9.4328,
 'IRF/2': 28.356849999999998,
 'ZIM3': 7.9898,
 'ZNF354': 14.3681,
 'YY1': 23.2647,
 'HD/14': 15.2789,
 'RFX/1': 14.2249,
 'ZNF549': 13.8719,
 'FEZF1': 15.210899999999999,
 'GCM': 7.80295,
 'SIX/1': 8.5294,
 'HIC/1': 24.9856,
 'P53-like/1': 18.1449,
 'ZNF449': 20.6387,
 'ZNF143': 21.957099999999997,
 'NR/17': 35.9963,
 'TBX/4': 8.9748,
 'AP1/1': 15.593,
 'SMAD': 36.3176,
 'PAX/2': 18.354,
 'MAF': 8

## Preprocess

In [10]:
def get_depth(frgs, fpath_db = FPATH_DB):
    """get"""
    
    ### get locations from 
    def fun(frg):
        chrom, start, end = frg.split("_")
        loc = (int(start) + int(end)) // 2
        return loc
    
    locs = list(map(fun, frgs))
    
    ### set query
    txt   = ', '.join('?' for _ in locs)
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, locs)
    
    ### generate each row
    for row in cursor:
        yield row
        
def get_pct(frgs, fpath_db=FPATH_DB):
    """get GC content of the given fragments"""
    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        ORDER BY Frg.fragment
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, frgs)
    
    ### generate each row
    for row in cursor:
        yield row
        
def get_annot(frgs, fpath_db):
    """get annotation from the given fragments"""
    ### set query
    txt   = ', '.join('?' for _ in frgs)
    query = f"""
        SELECT   Ant.Fragment, Mtf.motif, Mtf.score
        FROM     Annotation Ant
        JOIN     Motif      Mtf 
        ON       Ant.binding = Mtf.binding
        WHERE    Ant.Fragment IN ({txt})
        ORDER BY Ant.Fragment
        """
    
    ## query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()    
        cursor = cursor.execute(query, frgs)
        
    ### summarize the motif annotation scores
    dct_ann_count = defaultdict(lambda: defaultdict(lambda: 0))
    dct_ann_score = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for row in cursor:
        ### parse info
        frg, mtf, val = row
        
        ### count and sum the annotation scores
        dct_ann_count[frg][mtf] += 1
        dct_ann_score[frg][mtf] += val
    
    ### arrange and return
    dct_ann = dict()
    dct_ann["count"] = dct_ann_count
    dct_ann["score"] = dct_ann_score
    return dct_ann

In [None]:
query = f"""
        SELECT   Frg.fragment, Frg.pct_gc
        FROM     Fragment Frg
        WHERE    Frg.fragment IN ({txt})
        ORDER BY Frg.fragment
        """

In [None]:
def get_depth(frgs, fpath_db = FPATH_DB):
    """get"""
    
    ### get locations from 
    def fun(frg):
        chrom, start, end = frg.split("_")
        loc = (int(start) + int(end)) // 2
        return loc
    
    locs = list(map(fun, frgs))
    
    ### set query
    txt   = ', '.join('?' for _ in locs)
    query = f"""
        SELECT   Cov.location, Cov.sample, Cov.depth, Sam.treatment, Sam.size
        FROM     Coverage Cov
        JOIN     Sample   Sam 
        ON       Cov.sample = Sam.sample
        WHERE    Cov.location IN ({txt})
        ORDER BY Cov.location
        """
    
    ### query out from database
    with sqlite3.connect(fpath_db) as conn:
        cursor = conn.cursor()
        cursor = cursor.execute(query, locs)
    
    ### generate each row
    for row in cursor:
        yield row

In [12]:
frgs

['chr17_6001570_6002624',
 'chr17_6001571_6002624',
 'chr17_6001762_6002691',
 'chr17_6001762_6002692',
 'chr17_6001763_6002692',
 'chr17_6001765_6002692',
 'chr17_6001969_6002895',
 'chr17_6001969_6002896',
 'chr17_6001970_6002896',
 'chr17_6002054_6002943']

In [11]:
gen  = get_depth(frgs)
line = next(gen)
lst  = [line]
loc  = line[0]

for line in gen:
    tmp = line[0]
    if loc != tmp:
        print(lst)
        print("+++++++++++++++++")
        lst = [line]
        loc = tmp
    else:
        lst.append(line)
print(lst)

[(6002097, 'Input1', 8, 'Input', 18666630), (6002097, 'Input2', 21, 'Input', 20167924), (6002097, 'Input3', 18, 'Input', 23280988), (6002097, 'Input4', 21, 'Input', 19003938), (6002097, 'Input5', 12, 'Input', 15325016), (6002097, 'Input1_20x', 395, 'Input_20x', 371718546), (6002097, 'Input2_20x', 342, 'Input_20x', 347635732), (6002097, 'Input3_20x', 309, 'Input_20x', 349994051), (6002097, 'Input4_20x', 388, 'Input_20x', 413508358), (6002097, 'Input5_20x', 345, 'Input_20x', 341110487), (6002097, 'TFX2_DMSO', 27, 'TFX_DMSO', 43844606), (6002097, 'TFX3_DMSO', 18, 'TFX_DMSO', 26819569), (6002097, 'TFX4_DMSO', 22, 'TFX_DMSO', 30951533), (6002097, 'TFX5_DMSO', 12, 'TFX_DMSO', 28859151), (6002097, 'TFX2_Dex', 25, 'TFX_Dex', 45413539), (6002097, 'TFX3_Dex', 29, 'TFX_Dex', 26400671), (6002097, 'TFX4_Dex', 11, 'TFX_Dex', 34590086), (6002097, 'TFX5_Dex', 12, 'TFX_Dex', 42310249)]
+++++++++++++++++
[(6002226, 'Input1', 11, 'Input', 18666630), (6002226, 'Input2', 21, 'Input', 20167924), (6002226, '

In [56]:
gen = get_pct(frgs)
list(gen)

[('chr17_6001570_6002624', 0.541746),
 ('chr17_6001571_6002624', 0.54226),
 ('chr17_6001762_6002691', 0.528525),
 ('chr17_6001762_6002692', 0.527957),
 ('chr17_6001763_6002692', 0.528525)]

In [25]:
dct = get_annot([frg])
key = frg
val = str(dict(dct[key]))

In [27]:
"\t".join([key, val])

"chr17_6001763_6002692\t{'ZNF549': 13.8719, 'FEZF1': 15.210899999999999, 'GCM': 7.80295, 'SIX/1': 8.5294, 'HIC/1': 24.9856, 'P53-like/1': 18.1449, 'GC-tract': 59.847296662999995, 'ZNF449': 20.6387, 'ZNF143': 21.957099999999997, 'NR/17': 35.9963, 'TBX/4': 16.6979, 'AP1/1': 15.593, 'SMAD': 36.3176, 'PAX/2': 18.354, 'MAF': 8.3036, 'HD/20': 21.9543, 'HD/22': 33.287400000000005, 'TBX/3': 15.8424, 'KLF/SP/2': 64.118633337, 'ZFX': 28.601799999999997, 'PLAG1': 4.7136, 'ZNF324': 17.449399999999997, 'AP1/2': 16.7908, 'ZNF554': 27.7143, 'NR/18': 7.3832, 'NFKB/2': 5.7384, 'HEN1': 17.3812, 'NR/16': 9.80965, 'Ebox/CACGTG/1': 21.4349, 'SREBF1': 18.7661, 'CTCF': 6.198, 'GRHL': 8.5493, 'ZFN121': 13.9672, 'NR/19': 13.5623, 'MEF2': 9.0723, 'FOX/5': 6.5793, 'HD/2': 22.459, 'EGR': 7.4308, 'REST/NRSF': 6.9359, 'ZNF335': 8.8592, 'KLF/SP/1': 20.67185, 'PAX/1': 7.2091, 'OSR2': 8.879, 'TFAP2/1': 25.5336, 'POU/3': 7.5086, 'ZNF257': 7.4359, 'E2F/2': 32.980850000000004, 'NR/3': 27.859899999999996, 'MFZ1': 16.3626,

In [24]:
str(dict(dct['chr17_6001763_6002692']))

"{'ZNF549': 13.8719, 'FEZF1': 15.210899999999999, 'GCM': 7.80295, 'SIX/1': 8.5294, 'HIC/1': 24.9856, 'P53-like/1': 18.1449, 'GC-tract': 59.847296662999995, 'ZNF449': 20.6387, 'ZNF143': 21.957099999999997, 'NR/17': 35.9963, 'TBX/4': 16.6979, 'AP1/1': 15.593, 'SMAD': 36.3176, 'PAX/2': 18.354, 'MAF': 8.3036, 'HD/20': 21.9543, 'HD/22': 33.287400000000005, 'TBX/3': 15.8424, 'KLF/SP/2': 64.118633337, 'ZFX': 28.601799999999997, 'PLAG1': 4.7136, 'ZNF324': 17.449399999999997, 'AP1/2': 16.7908, 'ZNF554': 27.7143, 'NR/18': 7.3832, 'NFKB/2': 5.7384, 'HEN1': 17.3812, 'NR/16': 9.80965, 'Ebox/CACGTG/1': 21.4349, 'SREBF1': 18.7661, 'CTCF': 6.198, 'GRHL': 8.5493, 'ZFN121': 13.9672, 'NR/19': 13.5623, 'MEF2': 9.0723, 'FOX/5': 6.5793, 'HD/2': 22.459, 'EGR': 7.4308, 'REST/NRSF': 6.9359, 'ZNF335': 8.8592, 'KLF/SP/1': 20.67185, 'PAX/1': 7.2091, 'OSR2': 8.879, 'TFAP2/1': 25.5336, 'POU/3': 7.5086, 'ZNF257': 7.4359, 'E2F/2': 32.980850000000004, 'NR/3': 27.859899999999996, 'MFZ1': 16.3626, 'ETS/2': 16.0095, 'HIN