### Notebook to do methylation calling with nanopolish

In [1]:
import os
from Bio import SeqIO

In [2]:
notebook_path = os.path.abspath(".")

In [3]:
#####One OUT_DIR per treatment. This should be one for germinated spores and one for infected leaves
IN_FAST5 = os.path.abspath('../../analyses/single_fast5s/infected_leaves/mapped_fast5s')
in_fastq_fn = os.path.abspath('../../data/genomic_data/infected_leaves/all_fastq/infected_leaves_all.all.fastq')
seq_sum_fn = os.path.abspath('../../data/genomic_data/infected_leaves/sequencing_summary/infected_leaves_all.sequencing_summary.txt')
#####One OUT_DIR per treatment. This should be one for germinated spores and one for infected leaves
OUT_DIR = os.path.abspath('../../analyses/pycometh/infected_leaves')
meth_freq_script = '/home/jamila/jamila_Storage/scripts/nanopolish_scripts/calculate_methylation_frequency.py'
meth_compare_script = '/home/jamila/jamila_Storage/scripts/nanopolish_scripts/co'

In [5]:
ref_genome = os.path.abspath('../../analyses/pycometh/chr_A_B_unassigned.fasta')
n_threads = 20

In [6]:
###file names for rerunning 
in_fastq_clean_fn = os.path.join(OUT_DIR, os.path.basename(in_fastq_fn).replace('all.fastq','clean_all.fastq'))
bam_fn = os.path.basename(ref_genome).replace('.fa', '') \
+'.'+ os.path.basename(in_fastq_clean_fn).replace('.fastq', '.bam')
bam_fn = os.path.join(OUT_DIR, bam_fn)
meth_call_fn = bam_fn.replace('.bam', '.meth_call.tsv')
meth_freq_fn = meth_call_fn.replace( '.meth_call.tsv', '.meth_freq.tsv')

# Section 1 checking the input

In [None]:
#filer the fastq files to line up with the fast5 files
fast5s_names = set([x.replace('.fast5','') for x in os.listdir(IN_FAST5)])

In [None]:
fastqs = []
for seq in SeqIO.parse(in_fastq_fn, 'fastq'):
    if seq.id in fast5s_names:
        fastqs.append(seq)

In [None]:
in_fastq_clean_fn = os.path.join(OUT_DIR, os.path.basename(in_fastq_fn).replace('all.fastq','clean_all.fastq'))

In [None]:
if len(fastqs) == len(fast5s_names):
    in_fastq_clean_fn = os.path.join(OUT_DIR, os.path.basename(in_fastq_fn).replace('all.fastq','clean_all.fastq'))
    with open(in_fastq_clean_fn,'w') as fh:
        SeqIO.write(fastqs, fh, 'fastq')
    print('fastq subsetted')
    !head {in_fastq_clean_fn}

# Section 2 nanopolish methylation calling

In [None]:
#change directory
os.chdir(OUT_DIR)
basename = os.path.basename(OUT_DIR)

In [None]:
%%capture cap_out_index
%time
!nanopolish index -d {IN_FAST5} {in_fastq_clean_fn}

In [None]:
###print stdout
print(cap_out_index.stdout)

In [None]:
###print stderr
print(cap_out_index.stderr)

In [None]:
bam_fn = os.path.basename(ref_genome).replace('.fa', '') \
+'.'+ os.path.basename(in_fastq_clean_fn).replace('.fastq', '.bam')
bam_fn = os.path.join(OUT_DIR, bam_fn)
print(bam_fn)

In [None]:
#%%capture cap_out_map
%time
!minimap2 -a -t 20 -x map-ont {ref_genome} {in_fastq_clean_fn} | samtools sort -@20 -T tmp -o {bam_fn}
!samtools index -@20 {bam_fn}

In [None]:
%%capture cap_out_methcall
%time
!nanopolish call-methylation -t 20 -r {in_fastq_clean_fn} -b {bam_fn} -g {ref_genome} > {meth_call_fn}

In [None]:
print(cap_out_methcall.stdout)

In [None]:
!python3 {meth_freq_script} {meth_call_fn} > {meth_freq_fn}

In [None]:
!head -200 {meth_freq_fn}

## Comparative methylation calling with pycometh

In [7]:
os.chdir(notebook_path)

In [8]:
PYCO_OUT_DIR = os.path.abspath('../../pycometh/comparative/')

In [9]:
from pycoMeth.CpG_Aggregate import CpG_Aggregate
from pycoMeth.Interval_Aggregate import Interval_Aggregate
from pycoMeth.CGI_Finder import CGI_Finder
from pycoMeth.Meth_Comp import Meth_Comp
# Optionally inport jupyter helper functions
from pycoMeth.common import head, jhelp
import sys

In [10]:
##setup
min_depth = 3
sample_id = 'infected_leaves'

In [11]:
cpg_agg_bed_fn = meth_call_fn.replace('.meth_call', '.CpG_agg').replace('.tsv','.bed')
cpg_agg_tsv_fn = meth_call_fn.replace('.meth_call', '.CpG_agg').replace('.tsv','.tsv.gz')

In [12]:
ff = CpG_Aggregate(nanopolish_fn= meth_call_fn,
    ref_fasta_fn=ref_genome,
    output_bed_fn=cpg_agg_bed_fn,
    output_tsv_fn=cpg_agg_tsv_fn,
    sample_id=sample_id,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing methylation_calls file ##[0m
[32m	Starting to parse file Nanopolish methylation call file[0m
	Progress: 100%|██████████| 3.14G/3.14G [03:50<00:00, 13.7M bytes/s]
[32m	Parsing summary[0m
[32m		Lines Parsed: 30,825,533[0m
[32m		Line successfully parsed: 30,825,533[0m
[32m		Input files: 1[0m
[32m	Filtering out low coverage sites[0m
[32m	Sorting each chromosome by coordinates[0m
[32m	Sites summary[0m
[32m		Total Valid Lines: 30,825,533[0m
[32m		Initial Sites: 3,270,496[0m
[32m		Low Count Sites: 1,828,922[0m
[32m		Valid Sites Found: 1,441,574[0m
[01;34m## Processing valid sites found and write to file ##[0m
	Progress: 100%|██████████| 1.44M/1.44M [02:59<00:00, 8.01k sites/s]
[32m	Results summary[0m
[32m		Total Sites Writen: 1,441,574[0m
[32m		Unmethylated sites: 657,050[0m
[32m		Ambiguous sites: 435,359[0m
[32m		Methylated sites: 349,165[0m


In [13]:
for fn in [500, 1000, 5000]:
    interval = fn
    int_agg_bed_fn = meth_call_fn.replace('.meth_call', '.interval_%s_agg' % interval).replace('.tsv','.bed')
    int_agg_tsv_fn = meth_call_fn.replace('.meth_call', '.interval_%s_agg' % interval).replace('.tsv','.tsv.gz')
    #print(int_agg_bed_fn, '\n', int_agg_tsv_fn)
    fg = Interval_Aggregate(
    cpg_aggregate_fn=cpg_agg_tsv_fn,
    ref_fasta_fn=ref_genome,
    output_bed_fn=int_agg_bed_fn,
    output_tsv_fn=int_agg_tsv_fn,
    interval_size=interval,
    min_cpg_per_interval=1,
    sample_id=sample_id,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 172M bytes [00:31, 5.45M bytes/s]                        
[32m	Results summary[0m
[32m		Lines parsed: 1,441,574[0m
[32m		Total number of intervals: 353,009[0m
[32m	Writter summary[0m
[32m		Empty intervals skipped: 194,000[0m
[32m		Valid intervals written: 159,009[0m
[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 172M bytes [00:26, 6.45M bytes/s]                        
[32m	Results summary[0m
[32m		Lines parsed: 1,441,574[0m
[32m		Total number of intervals: 176,557[0m
[32m	Writter summary[0m
[32m		Valid intervals written: 89,540[0m
[32m		Empty intervals skipped: 87,017[0m
[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 172M bytes [00:17, 9.78M bytes/s]                        
[32m	Results summary[0m
[32m		Lines parsed: 1,441,574[0m
[32

In [14]:
###CGI island finder -> error might fix it
fl = CGI_Finder (
    ref_fasta_fn=ref_genome,
    output_bed_fn=ref_genome.replace(".fasta", ".CGI.bed"),
    output_tsv_fn=ref_genome.replace(".fasta", ".CGI.tsv"),
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing reference fasta file ##[0m
[32m	Parsing Reference sequence: chr1A[0m
	Progress: 100%|██████████| 6.16M/6.16M [00:05<00:00, 1.07M bases/s]
[32m	Parsing Reference sequence: chr2A[0m
	Progress: 100%|██████████| 6.06M/6.06M [00:05<00:00, 1.09M bases/s]
[32m	Parsing Reference sequence: chr3A[0m
	Progress: 100%|██████████| 6.03M/6.03M [00:05<00:00, 1.04M bases/s]
[32m	Parsing Reference sequence: chr4A[0m
	Progress: 100%|██████████| 5.97M/5.97M [00:05<00:00, 1.05M bases/s]
[32m	Parsing Reference sequence: chr5A[0m
	Progress: 100%|██████████| 5.56M/5.56M [00:05<00:00, 1.03M bases/s]
[32m	Parsing Reference sequence: chr6A[0m
	Progress: 100%|██████████| 5.55M/5.55M [00:05<00:00, 1.08M bases/s]
[32m	Parsing Reference sequence: chr7A[0m
	Progress: 100%|██████████| 5.18M/5.18M [00:04<00:00, 1.08M bases/s]
[32m	Parsing Reference sequence: chr8A[0m
	Progress: 100%|██████████| 5.11M/5.11M [00:04<00:00, 1.07M bases/s

	Progress: 100%|██████████| 37.4k/37.4k [00:00<00:00, 1.13M bases/s]
[32m	Parsing Reference sequence: tig00000329[0m
	Progress: 100%|██████████| 103k/103k [00:00<00:00, 1.02M bases/s]
[32m	Parsing Reference sequence: tig00000331[0m
	Progress: 100%|██████████| 51.5k/51.5k [00:00<00:00, 1.01M bases/s]
[32m	Parsing Reference sequence: tig00000332[0m
	Progress: 100%|██████████| 105k/105k [00:00<00:00, 1.04M bases/s]
[32m	Parsing Reference sequence: tig00000333[0m
	Progress: 100%|██████████| 24.0k/24.0k [00:00<00:00, 986k bases/s]
[32m	Parsing Reference sequence: tig00000486[0m
	Progress: 100%|██████████| 29.0k/29.0k [00:00<00:00, 1.01M bases/s]
[32m	Parsing Reference sequence: tig00000487[0m
	Progress: 100%|██████████| 63.2k/63.2k [00:00<00:00, 1.02M bases/s]
[32m	Parsing Reference sequence: tig00000489[0m
	Progress: 100%|██████████| 57.2k/57.2k [00:00<00:00, 1.03M bases/s]
[32m	Parsing Reference sequence: tig00000491[0m
	Progress: 100%|██████████| 25.0k/25.0k [00:00<00:00,

	Progress: 100%|██████████| 32.4k/32.4k [00:00<00:00, 1.04M bases/s]
[32m	Parsing Reference sequence: tig00001174[0m
	Progress: 100%|██████████| 88.9k/88.9k [00:00<00:00, 937k bases/s]
[32m	Parsing Reference sequence: tig00001177[0m
	Progress: 100%|██████████| 46.2k/46.2k [00:00<00:00, 1.08M bases/s]
[32m	Parsing Reference sequence: tig00001179[0m
	Progress: 100%|██████████| 95.2k/95.2k [00:00<00:00, 968k bases/s]
[32m	Parsing Reference sequence: tig00001182[0m
	Progress: 100%|██████████| 21.7k/21.7k [00:00<00:00, 973k bases/s]
[32m	Parsing Reference sequence: tig00001189[0m
	Progress: 100%|██████████| 29.5k/29.5k [00:00<00:00, 1.13M bases/s]
[32m	Parsing Reference sequence: tig00001197[0m
	Progress: 100%|██████████| 28.8k/28.8k [00:00<00:00, 994k bases/s]
[32m	Parsing Reference sequence: tig00001199[0m
	Progress: 100%|██████████| 17.9k/17.9k [00:00<00:00, 1.04M bases/s]
[32m	Parsing Reference sequence: tig00001202[0m
	Progress: 100%|██████████| 33.3k/33.3k [00:00<00:00

[32m		Number of reference sequences: 207[0m


In [20]:
ref_genome.replace(".fasta", ".CGI.bed")

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.CGI.bed'

In [21]:
ref_genome.replace(".fasta", ".CGI.tsv")

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.CGI.tsv'

In [15]:
cpg_agg_il_tsv = cpg_agg_tsv_fn
cpg_agg_sp_tsv = '/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.germinated_spores_1.clean_all.CpG_agg.tsv.gz'

In [16]:
ref_genome

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.fasta'

In [17]:
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_CpG.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_CpG.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 0,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 322M bytes [00:36, 8.72M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 34.2k/34.2k [00:02<00:00, 15.4k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient samples: 1,551,785[0m
[32m		Sites with insufficient effect size: 553,811[0m
[32m		Valid sites: 34,242[0m
[32m		Sites with non-significant adjusted pvalue: 33,945[0m
[32m		Sites with non-significant pvalue: 31,623[0m
[32m		Sites with significant pvalue: 2,619[0m
[32m		Sites with significant adjusted pvalue: 297[0m


In [18]:
int_agg_il_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/infected_leaves/chr_A_B_unassignedsta.infected_leaves_all.clean_all.interval_1000_agg.tsv.gz"
int_agg_sp_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.germinated_spores_1.clean_all.interval_1000_agg.tsv.gz"
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w1000.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w1000.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 1,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 322M bytes [00:28, 11.2M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 6.17k/6.17k [00:00<00:00, 19.0k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient samples: 1,551,785[0m
[32m		Sites with insufficient effect size: 581,884[0m
[32m		Valid sites: 6,169[0m
[32m		Sites with non-significant adjusted pvalue: 6,169[0m
[32m		Sites with non-significant pvalue: 5,468[0m
[32m		Sites with significant pvalue: 701[0m


In [19]:
int_agg_il_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/infected_leaves/chr_A_B_unassignedsta.infected_leaves_all.clean_all.interval_500_agg.tsv.gz"
int_agg_sp_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.germinated_spores_1.clean_all.interval_500_agg.tsv.gz"
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w500.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w500.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 1,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 322M bytes [00:28, 11.5M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 6.17k/6.17k [00:00<00:00, 19.6k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient samples: 1,551,785[0m
[32m		Sites with insufficient effect size: 581,884[0m
[32m		Valid sites: 6,169[0m
[32m		Sites with non-significant adjusted pvalue: 6,169[0m
[32m		Sites with non-significant pvalue: 5,468[0m
[32m		Sites with significant pvalue: 701[0m
