### Notebook to do methylation calling with nanopolish

This is a re-run of the notebook for revisions. There are some adjustments based on adjusted input infrastructure. Rewind with git for older version.

In [1]:
import os
from Bio import SeqIO

In [2]:
notebook_path = os.path.abspath(".")

In [3]:
INITIAL_MAPPED_BASEDIR = os.path.abspath('../../analyses/mapping/infected_leaves')
#####One OUT_DIR per treatment. This should be one for germinated spores and one for infected leaves
IN_FAST5 = os.path.abspath('../../data/genomic_data/infected_leaves/workspace_fast5/basecalled/workspace')
in_fastq_fn = os.path.abspath('../../data/genomic_data/infected_leaves/workspace_fast5/basecalled/20210407.PgtInfectedLeaves.Mapped.fastq.gz')
seq_sum_fn = os.path.abspath('../../data/genomic_data/infected_leaves/workspace_fast5/basecalled/sequencing_summary.txt')
#####One OUT_DIR per treatment. This should be one for germinated spores and one for infected leaves
OUT_DIR = os.path.abspath('../../analyses/pycometh/infected_leaves')
meth_freq_script = '/home/jamila/jamila_Storage/scripts/nanopolish_scripts/calculate_methylation_frequency.py'

In [4]:
ref_genome = os.path.abspath('../../analyses/pycometh/chr_A_B_unassigned.fasta')
n_threads = 20

In [5]:
###file names for rerunning 
in_fastq_clean_fn = os.path.join(OUT_DIR, os.path.basename(in_fastq_fn).replace('all.fastq','clean_all.fastq'))
bam_fn = os.path.basename(ref_genome).replace('.fa', '') \
+'.'+ os.path.basename(in_fastq_clean_fn).replace('.fastq.gz', '.bam')
bam_fn = os.path.join(OUT_DIR, bam_fn)
SamtoolsStatsFn = bam_fn.replace('.bam', '.stats.txt')
meth_call_fn = bam_fn.replace('.bam', '.meth_call.tsv')
meth_freq_fn = meth_call_fn.replace( '.meth_call.tsv', '.meth_freq.tsv')

# Section 1 checking the input

# Section 2 nanopolish methylation calling

In [6]:
#change directory
os.chdir(OUT_DIR)
basename = os.path.basename(OUT_DIR)

In [None]:
%%capture cap_out_index
%time
!nanopolish index -d {IN_FAST5} {in_fastq_fn}

In [None]:
###print stdout
print(cap_out_index.stdout)

In [None]:
###print stderr
print(cap_out_index.stderr)

In [None]:
%%capture cap_out_map
%time
!minimap2 -a -t 20 -x map-ont {ref_genome} {in_fastq_fn} | samtools sort -@6 -T tmp -o {bam_fn}
!samtools index -@20 {bam_fn}
!samtools stats -@20 {bam_fn} > {SamtoolsStatsFn}

In [None]:
!head {SamtoolsStatsFn}

In [None]:
%%capture cap_out_methcall
%time
!nanopolish call-methylation -t 20 -r {in_fastq_fn} -b {bam_fn} -g {ref_genome} > {meth_call_fn}

In [21]:
print('hello')

hello


In [None]:
print(cap_out_methcall.stdout)

In [None]:
!python3 {meth_freq_script} {meth_call_fn} > {meth_freq_fn}

In [23]:
!head -200 {meth_freq_fn}

chromosome	start	end	num_motifs_in_group	called_sites	called_sites_methylated	methylated_frequency	group_sequence
chr10A	24486	24486	1	1	1	1.000	GACAGCGTCAG
chr10A	24745	24745	1	1	1	1.000	TATGCCGTATA
chr10A	25163	25163	1	1	1	1.000	AGTTGCGTTGG
chr10A	25564	25564	1	1	1	1.000	TGATCCGTATC
chr10A	25636	25636	1	1	1	1.000	CAAAGCGCACA
chr10A	25854	25854	1	1	1	1.000	CAAACCGTACA
chr10A	26003	26003	1	1	1	1.000	AACATCGTAGA
chr10A	26319	26324	2	2	2	1.000	AATCACGCCTCGTGCC
chr10A	26646	26646	1	1	0	0.000	ATTGGCGGGCT
chr10A	27878	27878	1	4	4	1.000	GAGGGCGCAGG
chr10A	27908	27908	1	3	3	1.000	TTGGACGGAAC
chr10A	27926	27926	1	3	3	1.000	TTGTTCGAAAG
chr10A	27953	27953	1	1	1	1.000	GGCACCGTGGG
chr10A	27977	27977	1	1	1	1.000	GGGGCCGGAGT
chr10A	27997	28000	2	6	6	1.000	TTGTGCGGCGGAGA
chr10A	28032	28032	1	2	2	1.000	GCAGGCGTGAA
chr10A	28056	28060	2	4	4	1.000	CAAGACGATCGACTG
chr10A	28086	28086	1	2	2	1.000	GAGCTCGTGGT
chr10A	28100	28100	1	1	1	1.000	GTTTCCGGATT
chr10A	28115	28115	1	2	2	1.000	AGGGTCGAAGC
chr10A	28160	2

## Comparative methylation calling with pycometh

In [24]:
os.chdir(notebook_path)

In [25]:
PYCO_OUT_DIR = os.path.abspath('../../pycometh/comparative/')

In [26]:
from pycoMeth.CpG_Aggregate import CpG_Aggregate
from pycoMeth.Interval_Aggregate import Interval_Aggregate
from pycoMeth.CGI_Finder import CGI_Finder
from pycoMeth.Meth_Comp import Meth_Comp
# Optionally inport jupyter helper functions
from pycoMeth.common import head, jhelp
import sys

In [27]:
##setup
min_depth = 5
sample_id = 'infected_leaves'

In [28]:
cpg_agg_bed_fn = meth_call_fn.replace('.meth_call', '.CpG_agg').replace('.tsv','.bed')
cpg_agg_tsv_fn = meth_call_fn.replace('.meth_call', '.CpG_agg').replace('.tsv','.tsv.gz')

In [29]:
ff = CpG_Aggregate(nanopolish_fn= meth_call_fn,
    ref_fasta_fn=ref_genome,
    output_bed_fn=cpg_agg_bed_fn,
    output_tsv_fn=cpg_agg_tsv_fn,
    sample_id=sample_id,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing methylation_calls file ##[0m
[32m	Starting to parse file Nanopolish methylation call file[0m
	Progress: 100%|██████████| 11.0G/11.0G [11:30<00:00, 15.9M bytes/s]
[32m	Parsing summary[0m
[32m		Lines Parsed: 107,408,372[0m
[32m		Line successfully parsed: 107,408,372[0m
[32m		Input files: 1[0m
[32m	Filtering out low coverage sites[0m
[32m	Sorting each chromosome by coordinates[0m
[32m	Sites summary[0m
[32m		Total Valid Lines: 107,408,372[0m
[32m		Initial Sites: 3,295,365[0m
[32m		Valid Sites Found: 3,200,119[0m
[32m		Low Count Sites: 95,246[0m
[01;34m## Processing valid sites found and write to file ##[0m
	Progress: 100%|██████████| 3.20M/3.20M [07:39<00:00, 6.96k sites/s]
[32m	Results summary[0m
[32m		Total Sites Writen: 3,200,119[0m
[32m		Unmethylated sites: 1,458,073[0m
[32m		Ambiguous sites: 890,445[0m
[32m		Methylated sites: 851,601[0m


In [30]:
for fn in [500, 1000, 5000]:
    interval = fn
    int_agg_bed_fn = meth_call_fn.replace('.meth_call', '.interval_%s_agg' % interval).replace('.tsv','.bed')
    int_agg_tsv_fn = meth_call_fn.replace('.meth_call', '.interval_%s_agg' % interval).replace('.tsv','.tsv.gz')
    #print(int_agg_bed_fn, '\n', int_agg_tsv_fn)
    fg = Interval_Aggregate(
    cpg_aggregate_fn=cpg_agg_tsv_fn,
    ref_fasta_fn=ref_genome,
    output_bed_fn=int_agg_bed_fn,
    output_tsv_fn=int_agg_tsv_fn,
    interval_size=interval,
    min_cpg_per_interval=1,
    sample_id=sample_id,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 754M bytes [01:06, 11.4M bytes/s]                       
[32m	Results summary[0m
[32m		Lines parsed: 3,200,119[0m
[32m		Total number of intervals: 353,405[0m
[32m	Writter summary[0m
[32m		Valid intervals written: 322,680[0m
[32m		Empty intervals skipped: 30,725[0m
[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 754M bytes [00:54, 13.8M bytes/s]                       
[32m	Results summary[0m
[32m		Lines parsed: 3,200,119[0m
[32m		Total number of intervals: 176,756[0m
[32m	Writter summary[0m
[32m		Valid intervals written: 167,301[0m
[32m		Empty intervals skipped: 9,455[0m
[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 754M bytes [00:41, 18.0M bytes/s]                       
[32m	Results summary[0m
[32m		Lines parsed: 3,200,119[0m
[32m		T

In [31]:
###CGI island finder -> error might fix it
fl = CGI_Finder (
    ref_fasta_fn=ref_genome,
    output_bed_fn=ref_genome.replace(".fasta", ".CGI.bed"),
    output_tsv_fn=ref_genome.replace(".fasta", ".CGI.tsv"),
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing reference fasta file ##[0m
[32m	Parsing Reference sequence: chr1A[0m
	Progress: 100%|██████████| 6.16M/6.16M [00:05<00:00, 1.04M bases/s]
[32m	Parsing Reference sequence: chr2A[0m
	Progress: 100%|██████████| 6.06M/6.06M [00:05<00:00, 1.05M bases/s]
[32m	Parsing Reference sequence: chr3A[0m
	Progress: 100%|██████████| 6.03M/6.03M [00:05<00:00, 1.06M bases/s]
[32m	Parsing Reference sequence: chr4A[0m
	Progress: 100%|██████████| 5.97M/5.97M [00:05<00:00, 1.05M bases/s]
[32m	Parsing Reference sequence: chr5A[0m
	Progress: 100%|██████████| 5.56M/5.56M [00:05<00:00, 1.06M bases/s]
[32m	Parsing Reference sequence: chr6A[0m
	Progress: 100%|██████████| 5.55M/5.55M [00:05<00:00, 1.06M bases/s]
[32m	Parsing Reference sequence: chr7A[0m
	Progress: 100%|██████████| 5.18M/5.18M [00:04<00:00, 1.06M bases/s]
[32m	Parsing Reference sequence: chr8A[0m
	Progress: 100%|██████████| 5.11M/5.11M [00:04<00:00, 1.08M bases/s

In [34]:
ref_genome.replace(".fasta", ".CGI.bed")

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.CGI.bed'

In [35]:
ref_genome.replace(".fasta", ".CGI.tsv")

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.CGI.tsv'

In [36]:
#germinated_spores_1.clean_all.CpG_agg.tsv.gz
cpg_agg_il_tsv = cpg_agg_tsv_fn
cpg_agg_sp_tsv = '/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.20210407.PgtGerminatedSpores.Mapped.CpG_agg.tsv.gz'

In [37]:
ref_genome

'/home/jamila/jamila_Storage/analyses/pycometh/chr_A_B_unassigned.fasta'

In [38]:
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_CpG.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_CpG.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 0,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 1.80G bytes [01:33, 19.3M bytes/s]                      
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 85.9k/85.9k [00:14<00:00, 5.96k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient effect size: 3,112,757[0m
[32m		Sites with insufficient samples: 109,991[0m
[32m		Valid sites: 85,920[0m
[32m		Sites with non-significant adjusted pvalue: 84,996[0m
[32m		Sites with non-significant pvalue: 77,957[0m
[32m		Sites with significant pvalue: 7,963[0m
[32m		Sites with significant adjusted pvalue: 924[0m


In [39]:
int_agg_il_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/infected_leaves/chr_A_B_unassignedsta.20210407.PgtInfectedLeaves.Mapped.interval_1000_agg.tsv.gz"
int_agg_sp_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.20210407.PgtGerminatedSpores.Mapped.interval_1000_agg.tsv.gz"
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w1000.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w1000.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 0,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 1.80G bytes [01:32, 19.4M bytes/s]                      
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 85.9k/85.9k [00:14<00:00, 6.04k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient effect size: 3,112,757[0m
[32m		Sites with insufficient samples: 109,991[0m
[32m		Valid sites: 85,920[0m
[32m		Sites with non-significant adjusted pvalue: 84,996[0m
[32m		Sites with non-significant pvalue: 77,957[0m
[32m		Sites with significant pvalue: 7,963[0m
[32m		Sites with significant adjusted pvalue: 924[0m


In [40]:
int_agg_il_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/infected_leaves/chr_A_B_unassignedsta.20210407.PgtInfectedLeaves.Mapped.interval_500_agg.tsv.gz"
int_agg_sp_tsv = "/home/jamila/jamila_Storage/analyses/pycometh/germinated_spores/chr_A_B_unassignedsta.20210407.PgtGerminatedSpores.Mapped.interval_500_agg.tsv.gz"
fg = Meth_Comp (
    aggregate_fn_list=[cpg_agg_il_tsv, cpg_agg_sp_tsv],
    ref_fasta_fn=ref_genome,
    output_bed_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w500.il_sp.bed'),
    output_tsv_fn=os.path.join(PYCO_OUT_DIR, 'Pgt21-0.comp_w500.il_sp.tsv.gz'),
    sample_id_list=['infected_leaves',"spores"],
    max_missing = 1,
    min_diff_llr = 0,
    progress=True)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 1.80G bytes [01:31, 19.6M bytes/s]                      
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 85.9k/85.9k [00:14<00:00, 6.01k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient effect size: 3,112,757[0m
[32m		Sites with insufficient samples: 109,991[0m
[32m		Valid sites: 85,920[0m
[32m		Sites with non-significant adjusted pvalue: 84,996[0m
[32m		Sites with non-significant pvalue: 77,957[0m
[32m		Sites with significant pvalue: 7,963[0m
[32m		Sites with significant adjusted pvalue: 924[0m


In [None]:
!ls {PYCO_OUT_DIR}

In [None]:
PYCO_OUT_DIR