In [3]:
from tqdm import tqdm
import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt
import glob

In [120]:
%%writefile make_nz_snipr.py

import numpy as np
import sys

# Command-line arguments: chromosome name and strand label, in that order.
chrom, strand = sys.argv[1], sys.argv[2]
name_parts = (chrom, strand)

# Load the full score array for this chromosome/strand and write out only
# its strictly positive entries (order preserved) to a sibling "_NZ" file.
all_scores = np.load('../data/snipr/%s_%s_scores.npy' % name_parts)
positive_only = all_scores[all_scores > 0]
np.save('../data/snipr/%s_%s_scores_NZ.npy' % name_parts, positive_only)

Overwriting make_nz_snipr.py


In [310]:
# Start from an empty log directory: any logs from a previous run are deleted.
!rm -rf snipr_nz_logs
!mkdir snipr_nz_logs
# chr1..chr22 plus chrX, chrY and chrM.
chroms = ['chr'+str(x) for x in range(1,23)] + ['chrX', 'chrY', 'chrM']
# Submit one make_nz_snipr.py job per (chromosome, strand) pair through `bsub`
# (presumably the LSF batch scheduler -- confirm for this cluster).
for chrom in chroms:
    for strand in ['positive', 'negative']:
        # Job signature, e.g. "chr1_p" / "chr1_n": chromosome plus the first
        # letter of the strand. It is used both as the job name (-J) and as
        # the log file name (-o) inside snipr_nz_logs.
        sig = "%s_%s" % (chrom, strand[0])
        !bsub -q smp -J $sig -o ./snipr_nz_logs/$sig python make_nz_snipr.py $chrom $strand
# Wipe the submission messages printed by the shell commands from the cell output.
clear_output()

In [None]:
%%writefile TFBS_snipr.py

import numpy as np
import sys
from tqdm import tqdm

def bed_to_intervals(fn):
    """Read the (start, end) coordinates from a tab-separated BED file.

    Parameters
    ----------
    fn : str
        Path to a BED file whose 2nd and 3rd tab-separated columns hold the
        integer start and end of each record.

    Returns
    -------
    list of (int, int)
        One ``(start, end)`` tuple per data line, in file order.

    Notes
    -----
    Blank lines and BED header lines (first token ``track`` or ``browser``,
    or starting with ``#``) are skipped instead of raising
    ``IndexError``/``ValueError``. Malformed data lines still raise.
    """
    intervals = []
    with open(fn) as h:
        for l in h:
            tokens = l.split(None, 1)
            if not tokens:
                # Blank / whitespace-only line (often a trailing newline).
                continue
            first = tokens[0]
            if first in ('track', 'browser') or first.startswith('#'):
                # Header or comment line, not an interval record.
                continue
            c = l.rstrip('\n').split('\t')
            intervals.append((int(c[1]), int(c[2])))
    return intervals

def random_window_sums(nz_scores, window_len, n_draws=1000):
    """Sum `n_draws` randomly placed windows of consecutive entries.

    Parameters
    ----------
    nz_scores : 1-D array-like supporting slicing (e.g. a numpy memmap)
        Pool of scores to draw windows from.
    window_len : int
        Number of consecutive entries in every window (may be 0).
    n_draws : int, optional
        How many windows to draw (default 1000).

    Returns
    -------
    list
        `n_draws` window sums, one per randomly chosen start position.

    Raises
    ------
    ValueError
        If `window_len` exceeds ``len(nz_scores)``.
    """
    # Valid starts are 0 .. len - window_len inclusive. randint's upper bound
    # is exclusive, hence the +1 (otherwise the last window is never drawn and
    # window_len == len(nz_scores) fails).
    n_starts = len(nz_scores) - window_len + 1
    if n_starts < 1:
        raise ValueError('window of length %d does not fit into %d scores'
                         % (window_len, len(nz_scores)))
    starts = np.random.randint(0, n_starts, size=n_draws)
    return [nz_scores[start:start + window_len].sum() for start in starts]


if __name__ == '__main__':

    # Usage: python TFBS_snipr.py <chrom> <tf>
    chrom = sys.argv[1]
    tf = sys.argv[2]

    tf_dir = "/home/parashar/scratch/quadcomb/data/REMAP_TFBS/tf_wise_files"
    snipr_dir = '/home/parashar/scratch/quadcomb/data/snipr'

    # Memory-map both arrays so only the slices actually touched are read.
    # Only the positive-strand score files are used here.
    snipr = np.load('%s/%s_positive_scores.npy' % (snipr_dir, chrom), mmap_mode='r')
    snipr_nz = np.load('%s/%s_positive_scores_NZ.npy' % (snipr_dir, chrom), mmap_mode='r')

    scores = []       # one observed sum per interval
    rand_scores = []  # 1000 random-window sums per interval
    intervals = bed_to_intervals("%s/%s/%s.bed" % (tf_dir, tf, chrom))
    for interval in tqdm(intervals):
        s = snipr[interval[0]:interval[1]]
        s = s[s > 0]
        scores.append(s.sum())
        # Background: windows holding as many non-zero scores as the interval,
        # drawn from the chromosome-wide pool of non-zero scores.
        rand_scores.append(random_window_sums(snipr_nz, len(s)))

    np.save('../data/REMAP_TFBS/snipr_tf/%s_%s_scores' % (chrom, tf), np.array(scores))
    np.save('../data/REMAP_TFBS/snipr_tf/%s_%s_random_scores' % (chrom, tf), np.array(rand_scores))


In [8]:
%%writefile TFBS_snipr_wrapper.py

import os
import numpy as np
import glob

if __name__ == '__main__':
    # Submit one TFBS_snipr.py batch job per (chromosome, transcription factor).
    chroms = ['chr'+str(x) for x in range(1,23)] + ['chrX', 'chrY']
    base_dir = "/home/parashar/scratch/quadcomb/data/REMAP_TFBS/tf_wise_files"
    # Each entry directly under base_dir is treated as one TF name.
    tfs = np.array([x.split('/')[-1] for x in sorted(glob.glob("%s/*" % base_dir))])

    script = 'TFBS_snipr.py'

    # The jobs write their logs into ./tfbs_snipr_logs; make sure it exists
    # before submitting instead of relying on the scheduler to create it.
    log_dir = './tfbs_snipr_logs'
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    for chrom in chroms:
        for tf in tfs:
            # "<chrom>_<tf>" doubles as job name and log file stem.
            sig = "%s_%s" % (chrom, tf)
            cmd = 'bsub -q debugq -J %s -o ./tfbs_snipr_logs/%s.log python %s %s %s' % (
                    sig, sig, script, chrom, tf)
            print (cmd)
            os.system(cmd)

Overwriting TFBS_snipr_wrapper.py


In [9]:
%%writefile TFBS_calc_pval.py

import numpy as np
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
import sys

def load_scores(chrom, tf):
    """Memory-map the observed and random score arrays saved for one chrom/TF.

    Returns a 2-tuple ``(observed, random)``, where the first array comes from
    ``<chrom>_<tf>_scores.npy`` and the second from
    ``<chrom>_<tf>_random_scores.npy``. Both are opened read-only with
    ``mmap_mode='r'``.
    """
    key = (chrom, tf)
    observed_path = '../data/REMAP_TFBS/snipr_tf/%s_%s_scores.npy' % key
    random_path = '../data/REMAP_TFBS/snipr_tf/%s_%s_random_scores.npy' % key
    observed = np.load(observed_path, mmap_mode='r')
    randomized = np.load(random_path, mmap_mode='r')
    return observed, randomized

def resample_with_replacement(values, size=1000):
    """Draw `size` entries from `values` uniformly at random, with replacement.

    Parameters
    ----------
    values : 1-D array-like
        Pool to sample from; must not be empty.
    size : int, optional
        Number of entries to draw (default 1000).

    Returns
    -------
    numpy.ndarray
        Array of length `size` whose entries all come from `values`.

    Raises
    ------
    ValueError
        If `values` is empty.
    """
    values = np.asarray(values)
    if len(values) == 0:
        raise ValueError('cannot resample from an empty set of scores')
    # One vectorised draw of indices instead of `size` separate Python-level
    # randint calls and element look-ups.
    return values[np.random.randint(len(values), size=size)]


def calc_pval(a, b):
    """Compare observed scores with every random replicate via Mann-Whitney U.

    Parameters
    ----------
    a : 1-D array-like
        Observed scores.
    b : iterable of 1-D array-likes
        Random replicates; each element yielded by iterating `b` is tested
        against `a` separately.

    Returns
    -------
    numpy.ndarray
        Array with two rows and one column per replicate: row 0 holds the
        Mann-Whitney p-values, row 1 the same p-values after Holm correction.

    Raises
    ------
    ValueError
        If `a` or any replicate in `b` is empty.
    """
    pvals = []
    for replicate in b:
        # Both sides are resampled to 1000 values so every test uses equally
        # sized samples regardless of how many scores were supplied.
        r_a = resample_with_replacement(a, 1000)
        r_b = resample_with_replacement(replicate, 1000)
        pvals.append(mannwhitneyu(r_a, r_b)[1])
    corrected_pvals = multipletests(pvals, alpha=0.05, method='holm',
                                    is_sorted=False, returnsorted=False)[1]
    return np.array([pvals, corrected_pvals])

if __name__ == '__main__':

    # Usage: python TFBS_calc_pval.py <tf>
    tf = sys.argv[1]

    autosomes = ['chr'+str(n) for n in range(1, 23)]
    chroms = autosomes + ['chrX', 'chrY']

    # Pool the per-chromosome results: observed scores element by element,
    # random scores row by row.
    scores, rand_scores = [], []
    for chrom in chroms:
        observed, randomized = load_scores(chrom, tf)
        scores.extend(observed)
        rand_scores.extend(randomized)

    scores = np.array(scores)
    # Transpose so that iterating the array walks over what were the columns
    # of the stacked per-chromosome rows.
    rand_scores = np.array(rand_scores).T

    result = calc_pval(scores, rand_scores)
    np.save('../data/REMAP_TFBS/pvals/%s.npy' % tf, result)

Overwriting TFBS_calc_pval.py


In [10]:
# Start from an empty log directory: any logs from a previous run are deleted.
! rm -rf tfbs_pval_logs
! mkdir tfbs_pval_logs
base_dir = "/home/parashar/scratch/quadcomb/data/REMAP_TFBS/tf_wise_files"
# Each entry directly under base_dir is treated as one TF name (last path component).
tfs = np.array([x.split('/')[-1] for x in sorted(glob.glob("%s/*" % base_dir))])
# Submit one TFBS_calc_pval.py job per TF through `bsub` (presumably the LSF
# batch scheduler -- confirm for this cluster). The TF name is used both as
# the job name (-J) and as the log file name (-o) inside tfbs_pval_logs.
for tf in tfs:
    !bsub -q debugq -J $tf -o ./tfbs_pval_logs/$tf python TFBS_calc_pval.py $tf
# Wipe the submission messages printed by the shell commands from the cell output.
clear_output()