This notebook was used to make Figure 1b, showing how modified data differs from canonical in raw space.

In [None]:
import bz2
import numpy as np
import matplotlib.pyplot as plt
import pysam

# Reference genome; the FastaFile handle is reused by the plotting cells below
# to fetch the sequence shown on the x-axis.
path_fasta = '../tamatoa/data/lambda_phage_unix.fa'
fasta = pysam.FastaFile(path_fasta)

# Active eventalign inputs (bz2-compressed, whitespace-separated tables):
# one canonical sample and one modified sample.
path_align_canon = './data/canon_eventalign.tsv.bz2'
path_align_mod = './data/mod_eventalign.tsv.bz2'

# Alternative modified-sample inputs; uncomment at most one to swap the sample.
# path_align_mod = './data/glu_pass_eventalign.tsv.bz2'
# path_align_mod = './data/glu_fail_eventalign.tsv.bz2'
# path_align_mod = './data/glu_AJG810_fail_c4241646_14_eventalign.tsv.bz2'
# path_align_mod = './data/gpc_pass_eventalign.tsv.bz2'
# path_align_mod = './data/gpc_fail_eventalign.tsv.bz2'

# Half-open reference window [region_start, region_end) that reads must cover.
# Only one window can be active. The 20-position alternative is kept as a
# comment because plot_aligned_events hard-codes tick-label indices up to 24
# and x-limits of region_start+11 / region_end-16, which only fit the
# 40-position window below.
# region_start, region_end = 12335, 12355
region_start, region_end = 20600, 20640

def load_event_align(path_align, reads=None, start=None, end=None, max_reads=20):
    """Collect per-read raw samples over a reference window from an eventalign file.

    The file is a bz2-compressed, whitespace-separated table with one header
    row. For every data row this function uses column 1 (reference position),
    columns 2 and 9 (compared for equality; per the original comment these are
    the reference k-mer and the model k-mer), column 3 (integer read id) and
    the last column (comma-separated float samples).

    Args:
        path_align: Path to the ``.bz2`` eventalign table.
        reads: Optional list to append to (it is mutated in place and
            returned). A fresh list is created when omitted.
        start: First reference position of the window (inclusive). Defaults to
            the module-level ``region_start``.
        end: End of the window (exclusive). Defaults to the module-level
            ``region_end``.
        max_reads: Stop once ``reads`` holds this many entries.

    Returns:
        ``reads``, where each appended read is a list with one entry per
        reference position: ``[position, sample, sample, ...]``. Only reads
        that have an entry for every position in ``[start, end)`` are kept.
    """
    # A mutable default would be shared between calls, so create it here.
    if reads is None:
        reads = []
    if start is None:
        start = region_start
    if end is None:
        end = region_end
    n_positions = end - start

    cur_read_id = None
    cur_read = []
    last_pos = None
    with bz2.open(path_align, mode='rb') as f_ea:
        f_ea.readline()  # skip the header row
        for line in f_ea:
            if len(reads) >= max_reads:
                break
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on fields[1].
                continue
            pos = int(fields[1])
            if not start <= pos < end:
                continue
            # Ignore reverse strand results (ref kmer must match model kmer)
            if fields[2] != fields[9]:
                continue
            read_id = int(fields[3])
            if read_id != cur_read_id:
                # Don't add the previous read if it didn't cover the full window.
                if len(cur_read) == n_positions:
                    reads.append(cur_read)
                cur_read_id = read_id
                cur_read = []
                # Reset so the new read's first row always opens a position
                # entry, even if it matches the previous read's last position.
                last_pos = None
            if pos != last_pos:
                cur_read.append([pos])
                last_pos = pos
            cur_read[-1].extend(float(x) for x in fields[-1].decode().split(','))

    # Flush the read still being assembled when the file ended; without this
    # the final fully-covering read would be silently dropped.
    if cur_read and len(reads) < max_reads and len(cur_read) == n_positions:
        reads.append(cur_read)
    return reads

# Canonical reads and the modified reads from the configured paths, each
# collected into its own fresh list.
reads_canon = load_event_align(path_align_canon, [])
print('canon', len(reads_canon))
reads_mod_CpG = load_event_align(path_align_mod, [])
print('mods', len(reads_mod_CpG))

# GpC data: pool the "pass" and "fail" files into a single list.
reads_mod_GpC = []
for gpc_path in ('./data/gpc_pass_eventalign.tsv.bz2',
                 './data/gpc_fail_eventalign.tsv.bz2'):
    reads_mod_GpC = load_event_align(gpc_path, reads_mod_GpC)
print('mods', len(reads_mod_GpC))


### Gluc stuff, as that needed more data to plot properly
# Many eventalign files are pooled because each batch of this sample has
# little coverage of the target window.
reads_mod_GluC = load_event_align('./data/glu_fail_eventalign.tsv.bz2', [])
glu_batches = ['4', '14', '34', '44', '54', '64', '74',
               '8', '18', '28', '38', '48', '58', '68', '78']
for i in glu_batches:
    batch_path = './data/glu_AJG810_fail_c4241646_' + i + '_eventalign.tsv.bz2'
    reads_mod_GluC = load_event_align(batch_path, reads=reads_mod_GluC)
    print(i, len(reads_mod_GluC))
print('mods', len(reads_mod_GluC))

# Earlier variants, kept for reference:
# reads_mod = load_event_align('./data/glu_fail_eventalign.tsv.bz2')
# for i in ['4','14']:
#     reads_mod = load_event_align('./data/glu_AJG810_fail_c4241646_'+i+'_eventalign.tsv.bz2',reads=reads_mod)
#     print(i,len(reads_mod))

# reads_mod = []
# for i in ['4','14','24','34','44','54','64','74','8','18','28','38','48','58','68','78']:
#     reads_mod = load_event_align('./data/glu_AJG810_fail_c4241646_'+i+'_eventalign_mcpg.tsv.bz2',reads=reads_mod)
#     print(i,len(reads_mod))
# print('mods',len(reads_mod))

In [None]:

def plot_event_align(reads, my_color='k', max_reads=15):
    """Overlay the raw samples of several reads on the current matplotlib axes.

    Args:
        reads: List of reads; each read is a list of per-position entries of
            the form ``[position, sample, sample, ...]``.
        my_color: Matplotlib colour used for every trace.
        max_reads: Only the first ``max_reads`` reads are drawn.

    Each read becomes one semi-transparent line. The ``n`` samples of a
    position ``p`` are placed at ``p + k/(n+1)`` for ``k = 1..n``, so they
    are spread evenly strictly inside the interval ``(p, p+1)``.
    """
    for read in reads[:max_reads]:
        x_values = []
        y_values = []
        for base in read:
            pos, samples = base[0], base[1:]
            n_slots = len(samples) + 1
            x_values.extend(pos + np.arange(1, n_slots) / n_slots)
            y_values.extend(samples)
        plt.plot(x_values, y_values, color=my_color, alpha=0.2, linewidth=.75)

# plot_type = 'glu'

def plot_aligned_events(reads_mod,plot_type):
    """Draw canonical vs. modified raw signal over the reference window and save it.

    Args:
        reads_mod: Modified-sample reads, in the format accepted by
            ``plot_event_align``.
        plot_type: ``'cpg'``, ``'gpc'`` or ``'glu'``. It selects which tick
            labels are rewritten to ``'C/M'``, the legend/title wording, and
            the output file name. Any other value uses the generic label
            ``'Modified'`` and rewrites no tick labels.

    Reads the module-level ``reads_canon``, ``fasta``, ``region_start`` and
    ``region_end``, and writes ``./data/event_align_<plot_type>.pdf``.
    """
    # Tick-label indices (relative to the start of the window) relabelled 'C/M'.
    relabel_idx = {'cpg': (17,), 'glu': (17,), 'gpc': (14, 19, 24)}
    legend_names = {
        'cpg': 'CpG methylated',
        'gpc': 'GpC methylated',
        'glu': 'CpG glucosylated',
    }

    plt.figure(figsize=(8, 4))
    positions = np.arange(region_start, region_end)
    # Dashed guide line at every reference position boundary.
    for boundary in positions:
        plt.axvline(boundary, linestyle='--', linewidth=.75, color='k', alpha=.5)

    # Canonical traces in blue, modified traces in red.
    plot_event_align(reads_canon, my_color='b')
    plot_event_align(reads_mod, my_color='r')

    # Sequence used as tick labels; it is taken 2 bases downstream of the
    # window (presumably to line the label up with the k-mer -- TODO confirm).
    seq_lo = region_start + 2
    seq_hi = region_end + 2
    tick_labels = list(fasta.fetch('J02459.1')[seq_lo:seq_hi])
    for idx in relabel_idx.get(plot_type, ()):
        tick_labels[idx] = 'C/M'
    plt.xticks(positions + .5, labels=tick_labels)

    plt.ylim(60, 135)
    # Show only part of the loaded window.
    plt.xlim(region_start + 11, region_end - 16)
    plt.xlabel('Reference sequence')
    plt.ylabel('Measured current')

    # Legend proxies: the traces themselves carry no label, so two extra
    # lines are drawn at y=0 (below the y-limits) purely to feed the legend.
    label = legend_names.get(plot_type, 'Modified')
    plt.plot([0, 0], color='b', alpha=0.5, linewidth=2, label='Canonical')
    plt.plot([0, 0], color='r', alpha=0.5, linewidth=2, label=label)

    plt.title('Comparison between canonical and ' + label + ' data')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig('./data/event_align_' + plot_type + '.pdf')

# plt.savefig('./data/event_align_glu_mcpg.pdf') # GluC with methylated ref
# plt.savefig('./data/event_align_all.pdf') 
# print('J02459.1',region_start+2,region_end+2,refseq)
    
# One figure per modification type, in the order CpG, GpC, glucosylated CpG.
for mod_reads, mod_kind in (
    (reads_mod_CpG, 'cpg'),
    (reads_mod_GpC, 'gpc'),
    (reads_mod_GluC, 'glu'),
):
    plot_aligned_events(mod_reads, mod_kind)

In [None]:

# Plot every raw sample of the first canonical read as one continuous trace
# and save it to ./data/raw_data.pdf.
plt.figure(figsize=(8,4))
# plot_event_align(reads_canon,my_color='b')
print(reads_canon[0])
# Drop the leading position from each per-position entry and join the samples.
plt.plot(np.concatenate([x[1:] for x in reads_canon[0]]))
refseq = fasta.fetch('J02459.1')[region_start+2:region_end+2]
# NOTE(review): the x-axis here is the running sample index of the
# concatenated trace, not a reference coordinate -- confirm the label below.
plt.xlabel('Reference sequence')
plt.ylabel('Measured current')
plt.title('Raw read data aligned to reference sequence')
# No plt.legend() here: the single trace has no label, so the call drew
# nothing and only triggered matplotlib's "no labelled artists" warning.
plt.tight_layout()
plt.savefig('./data/raw_data.pdf')
print('J02459.1',region_start+2,region_end+2,refseq)