Convert Tombo WIG and coverage bedGraph files to BED files for downstream bedtools analysis

In [None]:
pip install wiggelen


In [None]:
pip install pybedtools

In [1]:
import os
import wiggelen
import pybedtools
from pybedtools import BedTool
import pandas as pd
import subprocess

In [2]:
##define functions
def wig_to_bed(input_wig, out_bed):
    """Convert a WIG file from tombo into a 6-column BED file for bedtools analysis.

    The file name of ``input_wig`` is split on '.': the second field is
    written to the BED name column (e.g. '5mC') and the second-to-last field
    selects the strand ('plus' -> '+', 'minus' -> '-').

    Every position yielded by ``wiggelen.walk`` becomes one tab-separated
    line: region, position - 1, position, modification, value, strand.

    Parameters
    ----------
    input_wig : str
        Path of the WIG file to read.
    out_bed : str
        Path of the BED file to write; an existing file is overwritten.

    Raises
    ------
    ValueError
        If the second-to-last file name field is neither 'plus' nor 'minus'.
        The check runs before any file is opened, so no partial output is
        left behind.
    """
    name_fields = os.path.basename(input_wig).split('.')
    mod = name_fields[1]
    strand_token = name_fields[-2]

    strand_by_token = {'plus': '+', 'minus': '-'}
    if strand_token not in strand_by_token:
        # Previously this case only failed later with an UnboundLocalError,
        # after the output file had already been created/truncated.
        raise ValueError(
            "Cannot determine strand from file name %r: expected 'plus' or "
            "'minus' as the second-to-last '.'-separated field, got %r"
            % (input_wig, strand_token))
    strand = strand_by_token[strand_token]

    print('Writing file %s' % out_bed)
    with open(input_wig) as wig_file, open(out_bed, 'w') as bed_file:
        for x in wiggelen.walk(wig_file):
            region, position, value = x[0], x[1], x[2]
            # BED start is position - 1 and BED end is position, so each
            # line describes a single-base interval.
            print('%s\t%d\t%d\t%s\t%s\t%s'
                  % (region, int(position) - 1, position, mod, value, strand),
                  file=bed_file)

In [3]:
#combine bed files
def combine_and_sort(in_fn_list, out_fn):
    """Concatenate several BED files into ``out_fn`` and sort it in place.

    Lines are copied in the order of ``in_fn_list`` with trailing whitespace
    removed, then the combined file is sorted with the external ``sort``
    program by column 1 and then numerically by column 2.

    Parameters
    ----------
    in_fn_list : list of str
        Paths of the files to concatenate.
    out_fn : str
        Path of the combined file; an existing file is overwritten.

    Raises
    ------
    subprocess.CalledProcessError
        If ``sort`` exits with a non-zero status.
    """
    with open(out_fn, 'w') as out_fh:
        for fn in in_fn_list:
            with open(fn, 'r') as fh:
                for line in fh:
                    print(line.rstrip(), file=out_fh)
    # Pass the arguments as a list instead of a shell string so that paths
    # containing spaces or shell metacharacters are handled correctly, and
    # fail loudly if sorting does not succeed.
    subprocess.run(['sort', '-o', out_fn, '-k1,1', '-k2,2n', out_fn], check=True)

In [4]:
def bed_frac_cov_filter(in_fn, cutoff, genome_file_fn):
    """Split a BED file into two files by the value in its fifth column.

    Lines whose fifth column is >= ``cutoff`` go to the "high" file, lines
    whose fifth column is < ``cutoff`` go to the "low" file. The output names
    are derived from ``in_fn`` by replacing '.all.' with '.g<tag>.' (high) and
    '.s<tag>.' (low), where ``<tag>`` is ``str(cutoff)`` without the dot
    (0.3 -> '03'). Blank lines are skipped.

    Parameters
    ----------
    in_fn : str
        Path of the input BED file; must contain '.all.'.
    cutoff : float
        Threshold applied to the fifth column.
    genome_file_fn : str
        Currently unused; kept for the disabled no-coverage step noted at the
        end of the function.

    Raises
    ------
    ValueError
        If ``in_fn`` does not contain '.all.'. Without this guard both output
        names would equal ``in_fn`` and opening them for writing would
        truncate the input file.
    """
    if '.all.' not in in_fn:
        raise ValueError(
            "Input file name %r must contain '.all.' so that distinct output "
            "names can be derived" % in_fn)

    cutoff_tag = str(cutoff).replace('.', '')
    high_out_fn = in_fn.replace('.all.', '.g%s.' % cutoff_tag)
    low_out_fn = in_fn.replace('.all.', '.s%s.' % cutoff_tag)

    with open(in_fn, 'r') as in_fh, \
            open(high_out_fn, 'w') as high_fh, \
            open(low_out_fn, 'w') as low_fh:
        for line in in_fh:
            line = line.rstrip()
            if not line:
                # An empty line has no fifth column to compare.
                continue
            frac = float(line.split('\t')[4])
            if frac >= cutoff:
                print(line, file=high_fh)
            elif frac < cutoff:
                # A NaN value satisfies neither comparison and is written to
                # neither file.
                print(line, file=low_fh)
    # Disabled no-coverage step (would use genome_file_fn):
    #no_cov_fn = in_fn.replace('.all.', '.nocov.' )
    #Bedtools(in_fn).merge().complement(g=genome_file_fn).saveas(no_cov_fn)
    

In [5]:
def bedgraph_to_bed(in_fn, out_fn):
    """Write the zero-value intervals of a bedGraph file as 6-column BED lines.

    The strand is taken from the second-to-last '.'-separated field of the
    input file name ('plus' -> '+', 'minus' -> '-'). Lines starting with
    'track' and blank lines are skipped. Every remaining line whose fourth
    column is exactly the string '0' is written as:
    chrom, start, end, 'novoc', '0', strand.

    Parameters
    ----------
    in_fn : str
        Path of the bedGraph file to read.
    out_fn : str
        Path of the BED file to write; an existing file is overwritten.

    Raises
    ------
    ValueError
        If the second-to-last file name field is neither 'plus' nor 'minus'.
        The check runs before any file is opened.
    """
    strand_token = os.path.basename(in_fn).split('.')[-2]
    strand_by_token = {'minus': '-', 'plus': '+'}
    if strand_token not in strand_by_token:
        # Previously this case surfaced only as an UnboundLocalError at the
        # first zero-value line, after the output file had been truncated.
        raise ValueError(
            "Cannot determine strand from file name %r: expected 'plus' or "
            "'minus' as the second-to-last '.'-separated field, got %r"
            % (in_fn, strand_token))
    strand = strand_by_token[strand_token]

    with open(in_fn) as in_fh, open(out_fn, 'w') as out_fh:
        for line in in_fh:
            line = line.rstrip()
            if not line or line.startswith('track'):
                continue
            values = line.split('\t')
            if values[3] == '0':
                # NOTE(review): the label 'novoc' looks like a typo for
                # 'nocov'; it is kept as-is because existing output files
                # already carry it -- confirm before renaming.
                print('%s\t%s\t%s\t%s\t%s\t%s'
                      % (values[0], values[1], values[2], 'novoc', '0', strand),
                      file=out_fh)

In [6]:
### Make sure you define your directories
# All analysis directories are derived from BASE (relative to the notebook's
# working directory); key order is BASE, WIG_INPUT, BED_OUT.
DIRS = {'BASE': '../../analyses/methylation_calling/'}
DIRS.update(
    WIG_INPUT=os.path.join(DIRS['BASE'], 'germinated_spores'),
    BED_OUT=os.path.join(DIRS['BASE'], 'germinated_spores', 'bed_files'),
)
# Absolute path of the genome size file, resolved against the current working directory.
genome_file_fn = os.path.abspath('../../data/genomic_resources/Pgt_genome_size.txt')

In [8]:
# Echo every configured directory that already exists; report and create the missing ones.
for value in DIRS.values():
    if os.path.exists(value):
        print(value)
    else:
        print('%s does not exist' % value)
        os.makedirs(value)

../../analyses/methylation_calling/
../../analyses/methylation_calling/germinated_spores
../../analyses/methylation_calling/germinated_spores/bed_files


In [9]:
# Input WIG files from tombo, one per modification (5mC, 6mA) and strand.
# NOTE: despite the *_fh suffix, these variables hold absolute paths, not open file handles.
m5c_plus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.5mC.dampened_fraction_modified_reads.plus.wig'))
m5c_minus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.5mC.dampened_fraction_modified_reads.minus.wig'))
m6a_plus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.6mA.dampened_fraction_modified_reads.plus.wig'))
m6a_minus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.6mA.dampened_fraction_modified_reads.minus.wig'))

# Input coverage bedGraph files, one per strand.
bedgraph_minus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.coverage.minus.bedgraph'))
bedgraph_plus_fh = os.path.abspath(
    os.path.join(DIRS['WIG_INPUT'], 'germinated_spores.coverage.plus.bedgraph'))

# List of WIG input paths, in the same order as bed_fh_list below so the two can be zipped.
wig_fh_list = [m5c_plus_fh, m5c_minus_fh, m6a_plus_fh, m6a_minus_fh]

# Output BED paths for the files generated from the tombo WIG files.
m5c_plus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.5mC.plus.tombo.bed'))
m5c_minus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.5mC.minus.tombo.bed'))
m6a_plus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.6mA.plus.tombo.bed'))
m6a_minus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.6mA.minus.tombo.bed'))

# Output BED paths for the no-coverage intervals, one per strand.
no_cov_plus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.nocov.plus.bed'))
no_cov_minus = os.path.abspath(
    os.path.join(DIRS['BED_OUT'], 'germinated_spores.nocov.minus.bed'))

bed_fh_list = [m5c_plus, m5c_minus, m6a_plus, m6a_minus]
                            
                            

In [10]:
# Sanity check: show the list of WIG input paths
print(wig_fh_list)

['/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.5mC.dampened_fraction_modified_reads.plus.wig', '/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.5mC.dampened_fraction_modified_reads.minus.wig', '/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.6mA.dampened_fraction_modified_reads.plus.wig', '/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.6mA.dampened_fraction_modified_reads.minus.wig']


In [11]:
# Convert every WIG file to BED format; wig_fh_list and bed_fh_list are
# paired by position, so each input is written to its matching output path.
for in_fh, out_fh in zip(wig_fh_list, bed_fh_list):
    print(in_fh, out_fh)
    wig_to_bed(in_fh, out_fh)

/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.5mC.dampened_fraction_modified_reads.plus.wig /home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/bed_files/germinated_spores.5mC.plus.tombo.bed
Writing file /home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/bed_files/germinated_spores.5mC.plus.tombo.bed
/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.5mC.dampened_fraction_modified_reads.minus.wig /home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/bed_files/germinated_spores.5mC.minus.tombo.bed
Writing file /home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/bed_files/germinated_spores.5mC.minus.tombo.bed
/home/jamila/jamila_Storage/analyses/methylation_calling/germinated_spores/germinated_spores.6mA.dampened_fraction_modified_reads.plus.wig /home/jamila/jamila_Storage/analyses/methylation_calling/germinate

In [12]:
# Name the strand-merged files by swapping '.plus.' for '.all.' in the plus-strand paths.
m5c_all = m5c_plus.replace('.plus.', '.all.')
m6a_all = m6a_plus.replace('.plus.', '.all.')

# Merge the plus and minus BED files of each modification into one sorted file.
combine_and_sort([m5c_plus, m5c_minus], m5c_all)
combine_and_sort([m6a_plus, m6a_minus], m6a_all)

In [13]:
# Split the combined 5mC and 6mA BED files at a cutoff of 0.3 on the fifth column.
bed_frac_cov_filter(m5c_all, 0.3, genome_file_fn)
bed_frac_cov_filter(m6a_all, 0.3, genome_file_fn)

In [14]:
# Write the zero-coverage intervals of each strand's coverage bedGraph to a BED file.
bedgraph_to_bed(bedgraph_plus_fh, no_cov_plus)
bedgraph_to_bed(bedgraph_minus_fh, no_cov_minus)

In [15]:
# Preview the first lines of the plus-strand no-coverage BED file.
!head {no_cov_plus}

tig00001174	0	2	novoc	0	+
chr8A	0	414	novoc	0	+
chr8A	370570	370802	novoc	0	+
chr8A	737711	740163	novoc	0	+
chr8A	2652627	2654918	novoc	0	+
chr8A	2845639	2846955	novoc	0	+
chr8A	3486207	3488740	novoc	0	+
chr8A	4190986	4195605	novoc	0	+
chr8A	4686480	4686827	novoc	0	+
tig00001154	0	2	novoc	0	+


In [16]:
# Path of the strand-merged no-coverage file ('.plus.' replaced by '.all.').
nocov_all = no_cov_plus.replace('.plus.', '.all.')

In [17]:
# Merge the plus and minus no-coverage BED files into one sorted file.
combine_and_sort([no_cov_plus, no_cov_minus], nocov_all)

In [18]:
# Split the combined 5mC and 6mA BED files again, this time at a cutoff of 0.5.
bed_frac_cov_filter(m5c_all, 0.5, genome_file_fn)
bed_frac_cov_filter(m6a_all, 0.5, genome_file_fn)