### Notebook to make the fastq files line up with the mapped bam files and the single fast5 files

In [11]:
import os
from joblib import Parallel, delayed

In [12]:
### Input and output locations for this run (the paths below point at germinated spores, replicate 1)
## Directory with the unfiltered single fast5 files; the cells below scan it one subdirectory deep
FAST5singleIN_DIR = '../../analyses/single_fast5s/germinated_spores/germinated_spores_1_fast5_single_fast5'
## Destination the fast5 files of mapped reads are moved into.
## One OUT_DIR per treatment: one for germinated spores and one for infected leaves
OUT_DIR = '../../analyses/single_fast5s/germinated_spores/mapped_fast5s'
## Directory the sorted BAM file name is built in (see Section 2); this path points at germinated_spores/rep1
BAM_DIR = '../../analyses/mapping/germinated_spores/rep1'
## Fastq file with all reads of this sample; Section 1 checks it holds one entry per single fast5 file
fastq_all_fn = '../../data/genomic_data/germinated_spores/all_fastq/germinated_spores_1.all.fastq'

In [13]:
# Reference fasta referred to as {minimap_index} by the (commented-out) minimap2 command in Section 2
minimap_index = '../../data/genomic_resources/chr_A_B_unassigned.fasta'
# NOTE(review): n_threads is not referenced by any cell visible in this notebook; the
# commented-out mapping command hard-codes 15 threads -- confirm which value is intended
n_threads = 20

In [14]:
# Walk FAST5singleIN_DIR one level deep and tally what is in its subdirectories.
#   single_fast5_count -- total number of *.fast5 files found
#   fastqs             -- full paths of all *.fastq files found
#   dirs               -- full paths of the subdirectories (reused by the Section 3 cells)
single_fast5_count = 0
fastqs = []
dirs = []
for entry in os.listdir(FAST5singleIN_DIR):
    directory = os.path.join(FAST5singleIN_DIR, entry)
    # plain files sitting directly in FAST5singleIN_DIR are not counted
    if not os.path.isdir(directory):
        continue
    dirs.append(directory)
    # list the subdirectory once and classify fast5s and fastqs from that single listing
    filenames = os.listdir(directory)
    fast5s = [os.path.join(directory, fn) for fn in filenames if fn.endswith('.fast5')]
    single_fast5_count += len(fast5s)
    fastqs.extend(os.path.join(directory, fn) for fn in filenames if fn.endswith('.fastq'))
print('This is the number of fast5s: %s' % single_fast5_count)

This is the number of fast5s: 377791


# Section 1 checking the input

In [15]:
# Count the lines of the combined fastq that contain 'sampleid'; the result is captured as a list of output lines.
# Assumes every fastq header line (and only header lines) contains 'sampleid' -- TODO confirm
fastq_entries = !cat {fastq_all_fn} | grep 'sampleid' | wc -l

In [16]:
### Sanity check: the number of fastq entries counted above must equal the number of single fast5 files
int(fastq_entries[0]) == single_fast5_count
### This has to evaluate to True before moving on

True

In [17]:
### Now check whether the ids match up.
### Write the ids to a text file next to the fastq, one per line: keep the lines containing
### 'sampleid', take the first space-separated field and remove every '@' character from it
fastqids_fn = fastq_all_fn.replace('.fastq', '.fastqids.txt')
!cat {fastq_all_fn} | grep 'sampleid'|  cut -d ' ' -f 1 | sed 's/@//g' > {fastqids_fn}

In [18]:
### Load the ids from fastqids_fn into a set (one id per line, newline characters stripped)
with open(fastqids_fn) as ids_fh:
    fastq_ids = {id_line.strip('\n') for id_line in ids_fh}

In [19]:
# Count the fast5 files, one directory level below FAST5singleIN_DIR, whose file name
# without '.fast5' is one of the ids in fastq_ids
match_count = 0
for entry in os.listdir(FAST5singleIN_DIR):
    subdir = os.path.join(FAST5singleIN_DIR, entry)
    # anything that is not a directory is ignored
    if not os.path.isdir(subdir):
        continue
    match_count += sum(
        1
        for name in os.listdir(subdir)
        if name.endswith('.fast5') and name.replace('.fast5', '') in fastq_ids
    )

In [20]:
#### All three numbers must agree: fast5 files with a matching fastq id,
#### fastq entries, and single fast5 files
match_count == int(fastq_entries[0]) == single_fast5_count
#### This needs to be True

True

### If the check above is False, go to Section 3 and execute it before moving on

# Section 2 mapping the reads and pulling out the mapped fast5s

In [21]:
# Sorted BAM path: the fastq's base name with '.fastq' replaced by '.sorted.bam', placed in BAM_DIR
bam_fn = os.path.join(BAM_DIR, os.path.basename(fastq_all_fn).replace('.fastq', '.sorted.bam'))

In [22]:
#!minimap2 -t 15 -ax map-ont {minimap_index} {fastq_all_fn} | samtools sort -@ 15 -o {bam_fn} -

In [23]:
#this is only here because the mapping was done on the command line and not in here
#if mapping is done in here don't execute this cell
# NOTE(review): this hard-coded name ('germinated_spores_1.sorted.bam') differs from the name
# derived from fastq_all_fn in the cell above ('germinated_spores_1.all.sorted.bam'), so
# running this cell points bam_fn at a different file
bam_fn = '../../analyses/mapping/germinated_spores/rep1/germinated_spores_1.sorted.bam'

In [24]:
## Generate the list of mapped read ids, written next to the BAM file.
## samtools view -F 4 skips records with SAM flag 0x4 (read unmapped) set; column 1 is the read
## name, and sort | uniq keeps each read name once even if it has several alignment records
mappedids_fn = bam_fn.replace('.bam', '.mappedids.txt')
!samtools  view -F 4  {bam_fn} | cut -f 1 | sort | uniq > {mappedids_fn}

In [25]:
# Load the mapped read ids into a set (one id per line, trailing whitespace removed)
with open(mappedids_fn) as ids_fh:
    mapped_reads = {id_line.rstrip() for id_line in ids_fh}

In [26]:
# Number of distinct mapped read ids
len(mapped_reads)

159573

In [27]:
# Move the fast5 file of every mapped read from the input tree into OUT_DIR.
# NOTE: this cell is not idempotent -- the files are moved, so on a second run the
# mapped fast5s are no longer found in the input tree and match_count stays at 0.
match_count = 0
# os.replace does not create missing directories, so make sure the target exists first
os.makedirs(OUT_DIR, exist_ok=True)
for directory in os.listdir(FAST5singleIN_DIR):
    directory = os.path.join(FAST5singleIN_DIR, directory)
    # only subdirectories are searched; plain files in FAST5singleIN_DIR are skipped
    if not os.path.isdir(directory):
        continue
    # get all fast5 files of this subdirectory
    fast5s = [fn for fn in os.listdir(directory) if fn.endswith('.fast5')]
    for fast5_file in fast5s:
        # the read id is the file name with '.fast5' removed
        if fast5_file.replace('.fast5', '') in mapped_reads:
            match_count = match_count + 1
            # move the file by renaming its path; an existing file of the same name in OUT_DIR is overwritten
            old_fn = os.path.join(directory, fast5_file)
            new_fn = os.path.join(OUT_DIR, fast5_file)
            os.replace(old_fn, new_fn)
        

In [28]:
## Every mapped read id must have had a fast5 file that was moved; this should be True
len(mapped_reads) == match_count

True

### Below are useful code snippets we leave for now but won't execute

# Section 3 Regenerating fastqs if they don't add up

In [None]:
#Run only if the tests above do fail
# -i executes the script inside this notebook's namespace instead of an empty one.
# NOTE(review): the script name refers to infected_leaves_2 while the paths configured in this
# notebook point at germinated_spores_1 -- confirm this is the intended script
%run -i infected_leaves_2_fast5_to_fastq.py

In [None]:
# Concatenate the per-directory fastq files into a single fastq that is written into
# FAST5singleIN_DIR and named after that directory
all_fastq_fn = os.path.join(FAST5singleIN_DIR,  '%s.fastq' % os.path.basename(FAST5singleIN_DIR))
with open(all_fastq_fn, mode='w') as combined_fh:
    for subdir in dirs:
        # the fastq read from each subdirectory carries the subdirectory's own name
        part_fn = os.path.join(subdir, os.path.basename(subdir) + '.fastq')
        with open(part_fn, mode = 'r') as part_fh:
            # strip trailing whitespace and terminate every line with exactly one newline
            combined_fh.writelines(text.rstrip() + '\n' for text in part_fh)

In [None]:
# Count the lines of the regenerated combined fastq that contain 'sampleid'.
# Assumes every fastq header line (and only header lines) contains 'sampleid' -- TODO confirm
fastq_entries = !cat {all_fastq_fn} | grep 'sampleid' | wc -l

In [None]:
# The regenerated fastq must hold one entry per single fast5 file; expected to be True
int(fastq_entries[0]) == single_fast5_count

In [None]:
# all_fastq_fn is rebuilt with the same expression as in the combining cell, so this cell
# does not depend on that cell having been run
all_fastq_fn = os.path.join(FAST5singleIN_DIR,  '%s.fastq' % os.path.basename(FAST5singleIN_DIR))
# Write the ids to a text file next to the fastq, one per line: keep the lines containing
# 'sampleid', take the first space-separated field and remove every '@' character from it
fastqids_fn = all_fastq_fn.replace('.fastq', '.fastqids.txt')
!cat {all_fastq_fn} | grep 'sampleid'|  cut -d ' ' -f 1 | sed 's/@//g' > {fastqids_fn}

In [None]:
# Load the regenerated fastq ids into a set (one id per line, newline characters stripped)
with open(fastqids_fn) as ids_fh:
    fastq_reads = {id_line.strip('\n') for id_line in ids_fh}

In [None]:
# The number of distinct fastq ids must equal the number of single fast5 files; expected to be True
len(fastq_reads) == single_fast5_count

In [None]:
# Re-count the fast5 files, one directory level below TMPOUT_DIR, whose file name without
# '.fast5' is one of the ids in fastq_reads. Nothing is moved here; files are only counted.
count = 0
TMPOUT_DIR = FAST5singleIN_DIR
for entry in os.listdir(TMPOUT_DIR):
    subdir = os.path.join(TMPOUT_DIR, entry)
    # plain files directly under TMPOUT_DIR are ignored
    if not os.path.isdir(subdir):
        continue
    for name in os.listdir(subdir):
        if name.endswith('.fast5') and name.replace('.fast5', '') in fastq_reads:
            count += 1

In [None]:
# Every single fast5 file must have a matching id in the regenerated fastq; expected to be True
count == single_fast5_count