In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
#import statsmodels.api as sm
#import itertools as it
#from statsmodels.sandbox.stats.multicomp import multipletests
#from itertools import compress
from pyBedGraph import BedGraph
from pybedtools import BedTool
import scipy.stats
from collections import Counter

In [2]:
def read_cf(directory, file_name):
    with open(directory + file_name) as f:
        chrom = {}
        for line in f:
            tmp = line.strip().split("\t")
            if tmp[0] != 'chrM':
                chrom[tmp[0]] = int(tmp[1])
    return chrom

In [3]:
def read_anchorfile(directory, file_name):
    with open(directory + file_name) as f:
        crnpk = {}
        #next(f)
        for line in f:
            tmp = line.strip().split("\t")[:-1]
            region = tmp[0]+":"+tmp[1]+"-"+tmp[2] + ";" + tmp[9]
            tmp[1] = int(tmp[1]) # peak start
            tmp[2] = int(tmp[2]) # peak end
            tmp[7] = int(tmp[7]) # gene start
            tmp[8] = int(tmp[8]) # gene end
            tmp[13] = float(tmp[13]) # TPM
            tmp[14] = float(tmp[14]) # RPKM
            mtstrand = tmp[3]
            gnstrand = tmp[9]
            if gnstrand == '+': # positive strand
                dist = min(abs(tmp[7]-tmp[2]), abs(tmp[7]-tmp[1]))
            else: # negative strand
                dist = min(abs(tmp[8]-tmp[2]), abs(tmp[8]-tmp[1]))
            tmp.append(dist) #distance to promoter;  tmp[15]
            tmp.append(tmp[8]-tmp[7]) # gene length; tmp[16]
            if tmp[15] < 5000 and tmp[13] > 0.5 and tmp[16] > 5000: # distance to promoter < 5kb and TMP > 0.5 & length > 5kb
                if region not in crnpk.keys():
                    crnpk[region] = [tmp]
                else:
                    crnpk[region].append(tmp)
            else:
                tmp = [tmp[0], tmp[1], tmp[2], tmp[3], '.', '.', '.', 0, 0, '.', '.', '.', '.', 0, 0, 0, 0]
                if region not in crnpk.keys():
                    crnpk[region] = [tmp]
                else:
                    if tmp not in crnpk[region]:
                        crnpk[region].append(tmp)
    return crnpk

In [4]:
def write_result(directory, out_list, out_name):
    with open(directory+out_name, 'a') as file1:
        for i in range(len(out_list)):
            file1.write('\t'.join(map(str, out_list[i])) + '\n')
    file1.close()

In [5]:
directory = '/Users/kimm/Desktop/GM12878_files/'
#gtf_file='hg38.ensGene.gtf'
#gtf_file = 'Homo_sapiens.GRCh38.100.gtf'
#anchor_file = 'RNAPII-peaks-overlap_CTCF_motif_cohesin_annot_ENCFF879FKF_20200711.bed'
anchor_file = 'RNAPII-peaks-overlap_cohesin-within25kb_CTCF_motif_annot_ENCFF879FKF_20200713.bed'
#loading_file = 'RNAPII-peaks-overlap_NIPBL_cohesin-notoverlap_CTCF_motif_annot_ENCFF879FKF_20200711.bed'
#comp_file='Cohesin_0.2Pass_List1.txt'
#comp_file='Cohesin_All_20200512_List1.txt'

In [6]:
chromsize = read_cf(directory, 'hg38.chrom.sizes')

In [7]:
anchor = read_anchorfile(directory, anchor_file)

In [8]:
len(Counter([x.split(";")[0] for x in anchor.keys()]).keys())

5521

In [9]:
filtered = {}
for key, val in anchor.items():
    #print(key)
    #print(val)
    #maxtpm = val[0][11]
    #maxlength = val[0][14]
    #geneid = val[0][9]
    final = val[0]
    for x in val:
        if x[13] > 1.5*final[13]: ## tpm
            if x[11] == final[11]: ## gene id same
                if x[16] > final[16]: # max length of gene 
                    final = x
                else: 
                    final[13] = x[13]
            else: 
                final = x
    #print("final is: ")
    #print(final)
    #print("\n")
    #bed = [final[0], ]
    if key.split(';')[0]+';'+final[3] not in filtered.keys():
        filtered[key.split(';')[0]+';'+final[3]] = [[],[]]
    if x[9] == '+':
        filtered[key.split(';')[0]+';'+final[3]][0] = final
    elif x[9] == '-':
        filtered[key.split(';')[0]+';'+final[3]][1] = final
    #crnpk[key] = final
    #print("=====")

In [10]:
cnts = [str(len(x[0]))+','+str(len(x[1])) for x in filtered.values()]

In [11]:
Counter(cnts)

Counter({'0,0': 3885, '0,17': 690, '17,17': 127, '17,0': 819})

In [12]:
sum(Counter(cnts).values())

5521

In [13]:
#### Gene body length ####
forward = []
reverse = []
none = []
region_size = []
cnt = 0
for key, val in filtered.items():
    if len(val[1]) == 0 and len(val[0]) > 0: # forward direction
        #print("forward")
        x = val[0]
        start = x[7]
        end = x[8]
        dist = end - start
        bed = [x[0], start-dist, end, x[3], x[9], x[10], x[11], x[12], x[13], x[16], '.', '.', '.', '.', '.', '.']
        forward.append(bed)
    elif len(val[0]) == 0 and len(val[1]) > 0: # reverse direction
        #print("reverse")
        x = val[1]
        start = x[7]
        end = x[8]
        dist = end - start
        bed = [x[0], start, end + dist, x[3], '.', '.', '.', '.', '.', '.', x[9], x[10], x[11], x[12], x[13], x[16]]
        reverse.append(bed)
    elif len(val[0]) > 0 and len(val[1]) > 0:  # could be both 
        if val[0][10] > 2*val[1][10]: # forward
            #print("forward")
            x = val[0]
            start = x[7]
            end = x[8]
            dist = end - start
            bed = [x[0], start-dist, end, x[3], x[9], x[10], x[11], x[12], x[13], x[16], '.', '.', '.', '.', '.', '.']
            forward.append(bed)
        elif val[1][10] > 2*val[0][10]: # reverse
            #print("reverse")
            x = val[1]
            start = x[7]
            end = x[8]
            dist = end - start
            bed = [x[0], start, end + dist, x[3], '.', '.', '.', '.', '.', '.', x[9], x[10], x[11], x[12], x[13], x[16]]
            reverse.append(bed)
        else:
            cnt += 1
    else: # none
        chrom = key.split(':')[0]
        start = int(key.split(':')[1].split("-")[0])
        end = int(key.split(':')[1].split("-")[1].split(";")[0])
        motifstrand = key.split(";")[1]
        bed = [chrom, max(1, start-150000), min(chromsize[chrom], end+150000), motifstrand, '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
        none.append(bed)
    region_size.append(bed[2]-bed[1])

In [14]:
print("forward : " + str(len(forward)))
print("reverse : " + str(len(reverse)))
print("none : " + str(len(none)))

forward : 879
reverse : 757
none : 3885


In [15]:
len(forward) + len(reverse) + len(none) + cnt

5521

In [16]:
cnt

0

In [19]:
len([x for x in reverse if x[3]=='-' and x[2]-x[1]>100000])

138

In [20]:
len([x for x in forward if x[3]=='-'])

456

In [21]:
#print([x[0] + ':' + str(x[1]) + "-" + str(x[2]) for x in none if x[3]=='-' and (x[2]-x[1])>100000])

In [77]:
len([x[0] + ':' + str(x[1]) + "-" + str(x[2]) for x in forward if (x[2]-x[1])>50000])

229

In [22]:
write_result(directory, [x for x in forward if x[3]=='+'], 'RNAPII-peaks-anchor_mt-forward_TSS-forward_20200713.bed')
write_result(directory, [x for x in forward if x[3]=='-'], 'RNAPII-peaks-anchor_mt-reverse_TSS-forward_20200713.bed')
write_result(directory, [x for x in reverse if x[3]=='+'], 'RNAPII-peaks-anchor_mt-forward_TSS-reverse_20200713.bed')
write_result(directory, [x for x in reverse if x[3]=='-'], 'RNAPII-peaks-anchor_mt-reverse_TSS-reverse_20200713.bed')
write_result(directory, [x for x in none if x[3]=='+'], 'RNAPII-peaks-anchor_mt-forward_non-TSS_20200713.bed')
write_result(directory, [x for x in none if x[3]=='-'], 'RNAPII-peaks-anchor_mt-reverse_non-TSS_20200713.bed')

In [23]:
[x for x in none if x[3]!='-' and x[3]!='+']

[]

In [24]:
np.median(region_size)

300697.0