In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
#import statsmodels.api as sm
#import itertools as it
#from statsmodels.sandbox.stats.multicomp import multipletests
#from itertools import compress
from pyBedGraph import BedGraph
from pybedtools import BedTool
import itertools

In [2]:
def read_cf(directory, file_name):
    with open(directory + file_name) as f:
        chrom = {}
        for line in f:
            tmp = line.strip().split("\t")
            if tmp[0] != 'chrM':
                chrom[tmp[0]] = int(tmp[1])
    return chrom

In [3]:
def read_motif(directory, file_name, chromfile):
    with open(directory + file_name) as f:
        motif = []
        for line in f:
            tmp = line.strip().split("\t")
            if tmp[0] in chromfile.keys():
                motif.append([tmp[0], int(tmp[1]), int(tmp[2]), tmp[3]])
    return motif

In [4]:
def write_result(directory, out_list, out_name):
    with open(directory+out_name, 'a') as file1:
        for i in range(len(out_list)):
            file1.write('\t'.join(map(str, out_list[i])) + '\n')
    file1.close()

In [5]:
def annot_domainid(directory, domain_file):
    domains = {}
    with open(directory + domain_file) as f:
        for line in f:
            tmp = line.strip().split("\t")
            domains[tmp[12]+":"+str(int(tmp[13])-4000)+"-"+str(int(tmp[14])+4000)] = [tmp[11], "L"]
            domains[tmp[17]+":"+str(int(tmp[18])-4000)+"-"+str(int(tmp[19])+4000)] = [tmp[11], "R"]
    return domains

In [6]:
def get_domainid(domains, motif_bed):
    motif_str = motif_bed[0]+":"+str(motif_bed[1])+"-"+str(motif_bed[2])
    if motif_str in domains.keys():
        domainid = domains[motif_str]
    else:
        domainid = ['.','.']
    return domainid

In [7]:
def get_intensity_dmid(onemotif, bg_cpet_ctcf, domains):
    #tlist = []
    tlist = onemotif
    dmid = get_domainid(domains, tlist[0:3])
    tlist.append(dmid[0])
    tlist.append(dmid[1])
    tlist.append(int(bg_cpet_ctcf.stats(intervals = [tlist[0:3]], stat = 'max')[0]))
    return tlist

In [8]:
def extract_supermotif(spmt_list):
    final_list = []
    prev = spmt_list[0]
    cnt = 1
    for i in range(1, len(spmt_list)):
        curr = spmt_list[i]
        if curr[3] == prev[3]: # same orientation
            prev[2] = curr[2] # extending end position
            if curr[4] != '.': # domain IDs
                prev[4] = curr[4]
            if curr[5] != '.':
                prev[5] = curr[5]
            prev[6] = max(prev[6], curr[6])
            cnt += 1
        else: # conflicting orientation
            if cnt == 1: # only two motifs conflicting
#                if curr[4] != '.' or prev[4] != '.': # if one of the two motifs in domain boundary, write both
#                    final_list.append([prev, cnt]) # dump 
#                    prev = curr
#                    cnt = 1
                if prev[4] != '.' and curr[4] != '.': # both in the domain boundary, then write both
                    final_list.append([prev, cnt]) # dump 
                    prev = curr
                elif prev[4] == '.' and curr[4] != '.': # if current is in boundary, then write current
                    prev = curr
                elif prev[4] != '.' and curr[4] == '.': # if prev is in boundary, then write previous
                    prev = prev
                else: # both not in boundary, then pick one with higher binding
                    if curr[6] > prev[6]:
                        prev = curr
            else: # previous are supermotifs, should write both 
                final_list.append([prev, cnt]) # dump 
                prev = curr
                cnt = 1
    final_list.append([prev, cnt])
    return final_list

In [11]:
directory='/Users/kimm/Desktop/GM12878_files/'
#cdrop_ctcf_cov='CDH0002NR_hg38_CTCF.bedgraph'
cdrop_ctcf_cov='GM12878-CTCF-pooled.bedgraph'
cpet_ctcf_cov='LHG0052H.for.BROWSER.sorted.bedgraph'
#cdrop_cohesin_cov='SHG0180-181-182NR_hg38_cohesin.bedgraph'
cdrop_cohesin_cov='GM12878-cohesin-pooled.bedgraph'
cpet_cohesin_cov='LHG0051H.for.BROWSER.sorted.bedgraph'
motif_file='CTCF_motifs_STORM_hg38_Ext4kbBoth.sorted.bed'
domain_file='LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'

In [12]:
chromfile = read_cf(directory, 'hg38.chrom.sizes')
#repet = BedTool(directory+'hg38PAM.sameChr.tx.sorted.legal.6To12Copies.within5kb_col1-4.bed')

In [13]:
#del motif
#del domains

In [14]:
motif = read_motif(directory, motif_file, chromfile)

In [15]:
bg_cpet_ctcf = BedGraph(directory+'hg38.chrom.sizes', directory+cpet_ctcf_cov)

In [16]:
for key,val in chromfile.items():
    #bgcd.load_chrom_data(key)
    bg_cpet_ctcf.load_chrom_data(key)

In [17]:
domains = annot_domainid(directory, domain_file)

In [18]:
chromcnt = {}
chromcnt[motif[0][0]] = 0 # initialize
past_info = get_intensity_dmid(motif[0], bg_cpet_ctcf, domains)
tmp_smtf = []
single_motif_list = []
super_motif_list = []
tmp_smtf = []
for i in range(1, len(motif)):
    if motif[i][0] not in chromcnt.keys(): # new chromosome
        if len(tmp_smtf) == 0: # previous was not supermotif
            single_motif_list.append(past_info)
        else: # previous was supermotif
            super_motif_list.append(tmp_smtf)
        past_info = get_intensity_dmid(motif[i], bg_cpet_ctcf, domains)
        tmp_smtf = []
        chromcnt[motif[i][0]] = 0
    else: # same chromosome
        curr_info = get_intensity_dmid(motif[i], bg_cpet_ctcf, domains)
        if curr_info[1] > past_info[2]: # individual motif since no overlap
            if len(tmp_smtf) == 0: # previous was not supermotif
                single_motif_list.append(past_info)
            else: # previous was supermotif
                super_motif_list.append(tmp_smtf)
            tmp_smtf = []
        else: # super motif
            if len(tmp_smtf) == 0:
                tmp_smtf.append(past_info)
            tmp_smtf.append(curr_info)
        past_info = curr_info
if len(tmp_smtf) == 0: # previous was not supermotif
    single_motif_list.append(past_info)
else: # previous was supermotif
    super_motif_list.append(tmp_smtf)

In [20]:
len(motif)

22643

In [22]:
len(single_motif_list)

17875

In [26]:
listcnt = [len(x) for x in super_motif_list]
sum(listcnt)

4768

In [27]:
len(single_motif_list) + sum(listcnt)

22643

In [32]:
motif_annotated = []
for i in range(len(super_motif_list)):
    extracted = extract_supermotif(super_motif_list[i])
    for j in range(len(extracted)):
        tmp = extracted[j][0][0:4]
        tmp.append('smt'+str(i)+':'+str(j)+"-"+str(extracted[j][1]))
        tmp.extend(extracted[j][0][4:])
        motif_annotated.append(tmp)

In [33]:
len(motif_annotated)

2376

In [34]:
for i in range(len(single_motif_list)):
    tmp = single_motif_list[i][0:4]
    tmp.append('mot'+str(i))
    tmp.extend(single_motif_list[i][4:])
    motif_annotated.append(tmp)

In [35]:
len(motif_annotated)

20251

In [36]:
l = [int(x[4][-1]) for x in motif_annotated[0:2319]]

In [37]:
[[x,l.count(x)] for x in set(l)]

[[1, 953], [2, 1289], [3, 70], [4, 6], [5, 1]]

In [38]:
sum([l.count(x) for x in set(l)])

2319

In [39]:
dm = [x[4] for x in motif_annotated if x[5] != '.']

In [40]:
tmp_annot = []
for x in dm:
    if x[0:3] == 'mot':
        tmp_annot.append(0)
    else:
        tmp_annot.append(int(x[-1]))

In [41]:
[[x,tmp_annot.count(x)] for x in set(tmp_annot)]

[[0, 3660], [1, 365], [2, 511], [3, 37], [4, 4], [5, 1]]

In [42]:
tm_list = [x[5] for x in motif_annotated]
for y in [[x,tm_list.count(x)] for x in set(tm_list)]:
    if y[1] != 2:
        print(y)

['.', 15673]


In [43]:
tm_list2 = [x[6] for x in motif_annotated]
[[x,tm_list2.count(x)] for x in set(tm_list2)]

[['R', 2289], ['L', 2289], ['.', 15673]]

In [44]:
bg_cdrop_ctcf = BedGraph(directory+'hg38.chrom.sizes', directory+cdrop_ctcf_cov)

In [45]:
del bg_cpet_ctcf

In [47]:
bg_cpet_cohesin = BedGraph(directory+'hg38.chrom.sizes', directory+cpet_cohesin_cov)

In [48]:
bg_cdrop_cohesin = BedGraph(directory+'hg38.chrom.sizes', directory+cdrop_cohesin_cov)

In [49]:
for key,val in chromfile.items():
    #bgcd.load_chrom_data(key)
    bg_cdrop_ctcf.load_chrom_data(key)
    bg_cdrop_cohesin.load_chrom_data(key)
    bg_cpet_cohesin.load_chrom_data(key)

In [50]:
for x in motif_annotated:
    if x[0] in chromfile.keys():
        cdrop_ctcf = int(bg_cdrop_ctcf.stats(intervals = [x[0:3]], stat = 'max')[0])
        cpet_cohesin = int(bg_cpet_cohesin.stats(intervals = [x[0:3]], stat = 'max')[0])
        cdrop_cohesin = int(bg_cdrop_cohesin.stats(intervals = [x[0:3]], stat = 'max')[0])
        x.append(cdrop_ctcf)
        x.append(cpet_cohesin)
        x.append(cdrop_cohesin)

In [51]:
motif_annotated[0:5]

[['chr1', 1286331, 1296037, '-', 'smt0:0-2', '.', '.', 333, 133, 120, 600],
 ['chr1', 2544186, 2552205, '-', 'smt1:0-1', '.', '.', 217, 264, 134, 2142],
 ['chr1', 3487388, 3495407, '+', 'smt2:0-1', '.', '.', 185, 125, 42, 302],
 ['chr1', 3611320, 3625350, '-', 'smt3:0-3', 'dm2', 'R', 715, 510, 232, 2569],
 ['chr1', 3720583, 3731486, '-', 'smt4:0-2', '.', '.', 587, 440, 182, 2144]]

In [52]:
write_result(directory, motif_annotated, 'CTCF_motifs_STORM_hg38_Ext4kbBoth_with_supermotif_domain_id_v8.bed')

In [116]:
del bg_cpet_cohesin
del bg_cdrop_cohesin