### Notebook to do methylation calling with tombo

In [1]:
import os
from joblib import Parallel, delayed

In [2]:
#####Base directory of the initial mapping; its sub-directories are scanned for '*.mappedids.txt' files in Section 1
INITIAL_MAPPED_BASEDIR = os.path.abspath('../../analyses/mapping/infected_leaves')
#####One IN_DIR per treatment. This should be one for germinated spores and one for infected leaves
#####All paths are made absolute here because a later cell changes the working directory to OUT_DIR
IN_DIR = os.path.abspath('../../analyses/single_fast5s/infected_leaves/mapped_fast5s')
#####One OUT_DIR per treatment. This should be one for germinated spores and one for infected leaves
OUT_DIR = os.path.abspath('../../analyses/methylation_calling/infected_leaves')


In [3]:
#reference genome fasta that is handed to `tombo resquiggle`
ref_genome = os.path.abspath('../../data/genomic_resources/chr_A_B_unassigned.fasta')
#number of processes passed to the tombo commands via --processes
n_threads = 20

# Section 1 checking the input

In [4]:
#collect one '*.mappedids.txt' file from every sub-directory of INITIAL_MAPPED_BASEDIR
mappingid_fns = []
#sorted() makes the result independent of the arbitrary order returned by os.listdir
for entry in sorted(os.listdir(INITIAL_MAPPED_BASEDIR)):
    mapped_dir = os.path.join(INITIAL_MAPPED_BASEDIR, entry)
    if not os.path.isdir(mapped_dir):
        continue
    candidates = sorted(fn for fn in os.listdir(mapped_dir) if fn.endswith('.mappedids.txt'))
    if not candidates:
        #fail with a message that names the directory instead of a bare IndexError
        raise FileNotFoundError("No '*.mappedids.txt' file found in %s" % mapped_dir)
    mappingid_fns.append(os.path.join(mapped_dir, candidates[0]))

In [5]:
#total number of lines over all mapped-id files (presumably one read id per line)
nummapped_reads = 0
for mappingid_fn in mappingid_fns:
    with open(mappingid_fn, mode='r') as fh:
        nummapped_reads += sum(1 for _ in fh)

In [6]:
#Consistency check: the number of '.fast5' files in IN_DIR has to equal the number of mapped read ids
fast5_fns = [fn for fn in os.listdir(IN_DIR) if fn.endswith('.fast5')]
len(fast5_fns) == nummapped_reads

True

In [7]:
#display the input directory that the tombo commands below read from
IN_DIR

'/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/mapped_fast5s'

# Section 2 tombo methylation calling

In [None]:
#change directory; the tombo output files below are named relative to the working directory
#create OUT_DIR first, otherwise os.chdir raises FileNotFoundError when the directory does not exist yet
os.makedirs(OUT_DIR, exist_ok=True)
os.chdir(OUT_DIR)
#the directory name is used as prefix for the tombo output files
basename = os.path.basename(OUT_DIR)

In [None]:
%%capture cap_out_resquiggle
%time
!tombo resquiggle {IN_DIR} {ref_genome} --processes {n_threads} --num-most-common-error 5 --dna --ignore-read-locks --signal-align-parameters --overwrite  


In [None]:
###print the stdout captured in cap_out_resquiggle
print(cap_out_resquiggle.stdout)

In [None]:
###print the stderr captured in cap_out_resquiggle
print(cap_out_resquiggle.stderr)

In [None]:
%%capture cap_out_detect_modifications
#NOTE(review): a bare %time line times an empty statement, not the tombo call below - confirm if timing is needed
%time
#test for 5mC and 6mA with tombo's alternative models; the statistics files get {basename} as prefix
!tombo detect_modifications alternative_model --fast5-basedirs {IN_DIR} --statistics-file-basename {basename} --alternate-bases 5mC 6mA --processes {n_threads}

In [None]:
###print the stdout captured in cap_out_detect_modifications
print(cap_out_detect_modifications.stdout)

In [None]:
###print the stderr captured in cap_out_detect_modifications
print(cap_out_detect_modifications.stderr)

In [None]:
%%capture cap_out_most_significant_5mC
#NOTE(review): a bare %time line times an empty statement, not the tombo call below - confirm if timing is needed
%time
#plot the most significant 5mC sites from {basename}.5mC.tombo.stats into a pdf
!tombo plot most_significant --fast5-basedirs {IN_DIR} --statistics-filename {basename}.5mC.tombo.stats  --plot-standard-model --plot-alternate-model 5mC --pdf-filename {basename}.most_significant_5mC_sites.pdf 

In [None]:
###print the stdout captured in cap_out_most_significant_5mC
print(cap_out_most_significant_5mC.stdout)

In [None]:
###print the stderr captured in cap_out_most_significant_5mC
print(cap_out_most_significant_5mC.stderr)

In [None]:
%%capture cap_out_most_significant_6mA
%time
!tombo plot most_significant --fast5-basedirs {IN_DIR} --statistics-filename {basename}.6mA.tombo.stats  --plot-standard-model --plot-alternate-model 6mA --pdf-{basename}.most_significant_6mA_sites.pdf 

In [None]:
###print the stdout captured in cap_out_most_significant_6mA
print(cap_out_most_significant_6mA.stdout)

In [None]:
###print the stderr captured in cap_out_most_significant_6mA
print(cap_out_most_significant_6mA.stderr)

In [None]:
##produce wig files with estimated fraction if modified reads at each valid reference site
%%capture cap_out_5mC_wigfile
%time
!tombo text_output browser_files --statistics-filename {basename}.5mC.tombo.stats --file-type dampened_fraction --browser-file-basename {basename}.5mC

In [None]:
###print the stdout captured in cap_out_5mC_wigfile
print(cap_out_5mC_wigfile.stdout)

In [None]:
###print the stderr captured in cap_out_5mC_wigfile
print(cap_out_5mC_wigfile.stderr)

In [None]:
%%capture cap_out_6mA.wig
%time
!tombo text_output browser_files --statistics-filename {basename}.6mA.tombo.stats --file-type dampened_fraction --browser-file-basename {basename}.6mA

In [None]:
###print the stdout of the 6mA wig-file run (expects the capture variable to be named cap_out_6mA_wig)
print(cap_out_6mA_wig.stdout)

In [None]:
###print the stderr of the 6mA wig-file run (expects the capture variable to be named cap_out_6mA_wig)
print(cap_out_6mA_wig.stderr)

In [None]:
###to produce successfully processed reads coverage file for reference
!tombo text_output browser_file --fast5-basedirs {IN_DIR} --file-types coverage --browser-file-basename {basename}

In [None]:
###first check if we have the right amount of fastq entries in our file
###NOTE(review): fastq_entries is only assigned in a later cell (Section 3) and single_fast5_count is not
###assigned in any cell visible here - confirm where these come from before running
int(fastq_entries[0]) == single_fast5_count
###You want this to be True

In [None]:
###Now check if the ids match up
###NOTE(review): fastq_all_fn is not assigned in any cell visible here (Section 3 uses all_fastq_fn) - confirm
fastqids_fn = fastq_all_fn.replace('.fastq', '.fastqids.txt')
###keep the lines containing 'sampleid', take the first space-separated field and remove every '@'
!cat {fastq_all_fn} | grep 'sampleid'|  cut -d ' ' -f 1 | sed 's/@//g' > {fastqids_fn}

In [None]:
###Read the ids in as a set, one id per line with the newline removed
with open(fastqids_fn) as fh:
    fastq_ids = {line.strip('\n') for line in fh}

In [None]:
#count the '.fast5' files, one directory level below FAST5singleIN_DIR, whose name without extension is a fastq id
match_count = 0
for subdir_name in os.listdir(FAST5singleIN_DIR):
    subdir = os.path.join(FAST5singleIN_DIR, subdir_name)
    #plain files at the top level are ignored
    if not os.path.isdir(subdir):
        continue
    match_count += sum(
        1
        for fn in os.listdir(subdir)
        if fn.endswith('.fast5') and fn.replace('.fast5', '') in fastq_ids
    )

In [None]:
####This needs to be true
####chained comparison: all three counts have to be equal
match_count == int(fastq_entries[0]) == single_fast5_count
####This needs to be true

### If the check above is False, go to Section 3 and run it before moving on

# Section 2 mapping the reads and pulling out the mapped fast5s

In [None]:
#the sorted bam is named after the combined fastq file and placed in BAM_DIR
sorted_bam_name = os.path.basename(fastq_all_fn).replace('.fastq', '.sorted.bam')
bam_fn = os.path.join(BAM_DIR, sorted_bam_name)

In [None]:
#map the combined fastq with minimap2 and pipe the alignments into samtools sort to write bam_fn
#NOTE(review): the thread counts are hard-coded to 15 here while n_threads is set to 20 above - confirm this is intended
!minimap2 -t 15 -ax map-ont {minimap_index} {fastq_all_fn} | samtools sort -@ 15 -o {bam_fn} -

In [None]:
#this is only here because the mapping was done on the command line and not in here
#if mapping is done in here don't execute this cell
#built from the absolute INITIAL_MAPPED_BASEDIR: a path relative to the notebook directory would resolve
#against OUT_DIR once the working directory has been changed in Section 2
bam_fn = os.path.join(INITIAL_MAPPED_BASEDIR, 'infected_leaves_1', 'infected_leaves_1.sorted.bam')

In [None]:
##generate the mapped read ID list next to the bam file
mappedids_fn = bam_fn.replace('.bam', '.mappedids.txt')
##-F 4 skips alignments flagged as unmapped; column 1 holds the read name; sort | uniq removes duplicate ids
!samtools  view -F 4  {bam_fn} | cut -f 1 | sort | uniq > {mappedids_fn}

In [None]:
#load the mapped ids into a set, one id per line with trailing whitespace removed
with open(mappedids_fn) as fh:
    mapped_reads = {line.rstrip() for line in fh}

In [None]:
#number of unique mapped read ids
len(mapped_reads)

In [None]:
#move the fast5 files of mapped reads from the sub-directories of FAST5singleIN_DIR into OUT_DIR
#NOTE(review): in this notebook OUT_DIR is the methylation-calling directory - confirm that is the intended target
match_count = 0
for subdir_name in os.listdir(FAST5singleIN_DIR):
    subdir = os.path.join(FAST5singleIN_DIR, subdir_name)
    #only sub-directories are searched for fast5 files
    if not os.path.isdir(subdir):
        continue
    for fn in os.listdir(subdir):
        if not fn.endswith('.fast5'):
            continue
        #the file name without its extension is compared against the mapped read ids
        if fn.replace('.fast5', '') not in mapped_reads:
            continue
        match_count += 1
        #moving is done by renaming the absolute path
        os.replace(os.path.join(subdir, fn), os.path.join(OUT_DIR, fn))
        

In [None]:
##This should be true: every mapped read id has exactly one moved fast5 file
len(mapped_reads) == match_count

### Below are useful code snippets that we keep for now but won't execute

# Section 3 Regenerating fastqs if they don't add up

In [None]:
#Run only if the tests above fail
#-i executes the script in this notebook's namespace
#NOTE(review): the script path is relative, so it is looked up in the current working directory - confirm
#this still works after the os.chdir(OUT_DIR) in Section 2
%run -i infected_leaves_2_fast5_to_fastq.py

In [None]:
#combine all fastqs into one file named after FAST5singleIN_DIR and stored inside it
#NOTE(review): `dirs` is not assigned in any cell visible here - presumably set by the %run script; confirm
all_fastq_fn = os.path.join(FAST5singleIN_DIR, '%s.fastq' % os.path.basename(FAST5singleIN_DIR))
with open(all_fastq_fn, mode='w') as all_fastq_fh:
    for dir_ in dirs:
        #every directory contributes the fastq file that carries the directory's own name
        per_dir_fastq_fn = os.path.join(dir_, os.path.basename(dir_) + '.fastq')
        with open(per_dir_fastq_fn, mode='r') as fh:
            #trailing whitespace is removed and each line is ended with a single newline
            all_fastq_fh.writelines(line.rstrip() + '\n' for line in fh)

In [None]:
#count the lines containing 'sampleid' in the combined fastq; the shell output is captured as a list of lines
fastq_entries = !cat {all_fastq_fn} | grep 'sampleid' | wc -l

In [None]:
#the first captured output line holds the count; it has to equal the number of single fast5 files
#NOTE(review): single_fast5_count is not assigned in any cell visible here - confirm where it comes from
int(fastq_entries[0]) == single_fast5_count

In [None]:
#same combined fastq path as in the combine cell, recomputed so this cell can run on its own
all_fastq_fn = os.path.join(FAST5singleIN_DIR,  '%s.fastq' % os.path.basename(FAST5singleIN_DIR))
fastqids_fn = all_fastq_fn.replace('.fastq', '.fastqids.txt')
#keep the lines containing 'sampleid', take the first space-separated field and remove every '@'
!cat {all_fastq_fn} | grep 'sampleid'|  cut -d ' ' -f 1 | sed 's/@//g' > {fastqids_fn}

In [None]:
#load the fastq ids into a set, one id per line with the newline removed
with open(fastqids_fn) as fh:
    fastq_reads = {line.strip('\n') for line in fh}

In [None]:
#the number of unique fastq ids has to equal the number of single fast5 files
#NOTE(review): single_fast5_count is not assigned in any cell visible here - confirm where it comes from
len(fastq_reads) == single_fast5_count

In [None]:
#count the '.fast5' files, one directory level below TMPOUT_DIR, whose name without extension is in fastq_reads
count = 0
TMPOUT_DIR = FAST5singleIN_DIR
for subdir_name in os.listdir(TMPOUT_DIR):
    subdir = os.path.join(TMPOUT_DIR, subdir_name)
    #only sub-directories are searched for fast5 files
    if not os.path.isdir(subdir):
        continue
    count += sum(
        1
        for fn in os.listdir(subdir)
        if fn.endswith('.fast5') and fn.replace('.fast5', '') in fastq_reads
    )

In [None]:
#every single fast5 file has to have a matching fastq id
#NOTE(review): single_fast5_count is not assigned in any cell visible here - confirm where it comes from
count == single_fast5_count