This notebook visualizes models using scatterplots, showing the distribution of modified vs canonical versions of the same k-mers.

In [None]:
import sys
sys.path.append("..")

import requant as rq
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib as mpl
# Reset matplotlib to its built-in defaults, then set font type 42 for both
# the PDF and PS backends in a single update.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})
# Palette candidates that were tried:
# sns.color_palette("tab10")
# sns.color_palette("colorblind")

In [None]:

# Figure defaults used by every plot below.
plt.rcParams['pdf.fonttype'] = 42
mpl.rcParams['figure.figsize'] = (5, 5)

# Active modification context, read as globals by the functions below:
#   mod_motif    - canonical motif
#   mod_motif_M  - the same motif with the modified base written as 'M'
#   mod_alphabet - alphabet tag used inside the model file names
mod_motif = 'CG'
mod_motif_M = 'MG'
mod_alphabet = 'cpg'
# GpC alternative:
# mod_motif, mod_motif_M, mod_alphabet = 'GC', 'GM', 'gpc'

# Model files that were inspected at some point:
# np_model_file = 'data/gpc/nanopolish_train/r9.4_450bps.gpc.6mer.template.round4.model'
# np_model_file = 'data/output_CpG_meth/refcuts_CG_test_2/p50_0/requant/p50_0.replaced.model'
# np_model_file = '/Users/rstraver/Workspace/requant/data/gpc/requant/p50_0.added.model'
# np_model_file = '/Users/rstraver/Workspace/requant/data/gpc/requant/p50_0.replaced.model'
# np_model_file = '../nanopolish/OUD6725/r9.4_450bps.gpc.6mer.template.round7.model'
# np_model_file = '../nanopolish/nanopolish/etc/r9-models/r9.4_450bps.'+mod_alphabet+'.6mer.template.model'

def load_np_model(np_model_file, motif=None, motif_mod=None):
    """Load a nanopolish model file and pair modified k-mers with their canonical twins.

    The file is read as tab-separated text without a header row; lines
    starting with '#' are skipped. Only the first three columns are used:
    the k-mer (becomes the index), ``value`` and ``stddev``.

    Two groups of k-mers are selected and concatenated:

    * single-motif k-mers: contain the modified motif, have exactly one 'M',
      and contain the canonical motif exactly once after replacing 'M' by 'C';
    * multi-motif k-mers: contain the modified motif more than once and have
      no 'M' outside of a motif. These get ``mod_pos == -1`` and the label
      ``'Multiple'``.

    Parameters
    ----------
    np_model_file : str or file-like
        Path (or open text buffer) handed to ``pandas.read_csv``.
    motif : str, optional
        Canonical motif. Defaults to the module-level ``mod_motif``.
    motif_mod : str, optional
        Modified motif. Defaults to the module-level ``mod_motif_M``.

    Returns
    -------
    pandas.DataFrame
        Indexed by modified k-mer with columns ``value``, ``stddev``,
        ``kmer_canon``, ``value_canon``, ``mod_pos``, ``Mod index`` and
        ``Delta`` (``value - value_canon``).

    Raises
    ------
    KeyError
        If a selected modified k-mer has no canonical counterpart in the file.
    """
    # Fall back to the notebook-level motif settings when none are passed in.
    motif = mod_motif if motif is None else motif
    motif_mod = mod_motif_M if motif_mod is None else motif_mod

    # Load nanopolish model file
    df = pd.read_csv(np_model_file, delimiter='\t', comment='#',
                     usecols=[0, 1, 2], index_col=0, header=None)
    df.columns = ['value', 'stddev']
    df.index.name = 'kmer'

    # --- k-mers carrying exactly one modified motif -------------------------
    df_mod = df.filter(like=motif_mod, axis=0)
    # Drop kmers with more than one modified base
    df_mod = df_mod[df_mod.index.str.count('M') == 1]
    # Drop kmers holding several motifs, regardless of mod/canonical notation.
    # .copy() makes the selection an independent frame before columns are added.
    single_motif = df_mod.index.str.replace('M', 'C', regex=False).str.count(motif) == 1
    df_mod = df_mod[single_motif].copy()

    # Canonical equivalents, looked up positionally so the rows stay aligned
    canon_kmers = df_mod.index.str.replace('M', 'C', regex=False)
    df_mod['kmer_canon'] = canon_kmers
    df_mod['value_canon'] = df.loc[canon_kmers, 'value'].to_numpy()

    # Position of the motif inside the k-mer, used for plot colouring
    df_mod['mod_pos'] = df_mod.index.str.index(motif_mod)
    # Readable label such as 'NNMGNN'; the flank length follows from the k-mer
    # itself instead of assuming a 6-mer with a 2-base motif.
    df_mod['mod_label'] = [
        'N' * int(pos) + motif_mod + 'N' * (len(kmer) - len(motif_mod) - int(pos))
        for kmer, pos in zip(df_mod.index, df_mod['mod_pos'])
    ]

    # --- k-mers carrying several modified motifs ----------------------------
    df_mult = df[df.index.str.count(motif_mod) > 1]
    # Keep only kmers where every 'M' is part of a motif
    only_motif_mods = (
        df_mult.index.str.replace(motif_mod, motif, regex=False).str.count('M') == 0
    )
    df_mult = df_mult[only_motif_mods].copy()

    mult_canon = df_mult.index.str.replace('M', 'C', regex=False)
    df_mult['kmer_canon'] = mult_canon
    df_mult['value_canon'] = df.loc[mult_canon, 'value'].to_numpy()
    df_mult['mod_pos'] = -1
    df_mult['mod_label'] = 'Multiple'

    # Merge single and multi mod data
    df_plot = pd.concat([df_mod, df_mult])
    df_plot = df_plot.rename(columns={'mod_label': 'Mod index'})
    df_plot['Delta'] = df_plot['value'] - df_plot['value_canon']

    return df_plot


# sns.set(rc={'figure.figsize':(12,12)})

def plot_finish(label):
    """Finish the current value-vs-value figure, save it as a PDF and show it.

    Draws a dashed black diagonal, fixes both axes to 55..125 and writes the
    figure to ``./data/6mer/6mer_plot_s_<label>.pdf``.
    """
    diagonal = [55, 135]
    plt.plot(diagonal, diagonal, linestyle='--', c='k', linewidth=1)

    lower, upper = 55, 125
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)

    plt.tight_layout()
    out_path = './data/6mer/6mer_plot_s_' + label + '.pdf'
    plt.savefig(out_path)
    plt.show()

def plot_finish_delta(label):
    """Finish the current delta figure, save it as a PDF and show it.

    Fixes the x axis to 55..125 and the y axis to -11..11, then writes the
    figure to ``./data/6mer/6mer_plot_s_<label>.pdf``. No diagonal is drawn.
    """
    x_range = (55, 125)
    y_range = (-11, 11)
    plt.xlim(*x_range)
    plt.ylim(*y_range)

    plt.tight_layout()
    out_path = './data/6mer/6mer_plot_s_' + label + '.pdf'
    plt.savefig(out_path)
    plt.show()

def _scatter_by_mod_index(data, x, y, xlabel, ylabel, zero_line=False):
    """Scatter column *x* against *y* of *data*, coloured by 'Mod index', and label the axes."""
    sns.scatterplot(data=data, x=x, y=y, hue='Mod index', linewidth=0, s=10)
    if zero_line:
        plt.axhline(0, linestyle='--', c='k', linewidth=1)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def plot_models(model_id,prefix,suffix=''):
    """Plot the trained, 'added' and 'replaced' models of one run as scatter PDFs.

    Three model files are loaded through ``load_np_model``:

    * ``<prefix><model_id>/nanopolish_train/r9.4_450bps.<mod_alphabet>.6mer.template.round4.model``
    * ``<prefix><model_id>/requant/<model_id>.added.model``
    * ``<prefix><model_id>/requant/<model_id>.replaced.model``

    Every figure is finished by ``plot_finish`` / ``plot_finish_delta`` with
    a label of the form ``<kind>_<model_id><suffix>``.

    Parameters
    ----------
    model_id : str
        Run identifier; part of the input paths and of every output label.
    prefix : str
        Directory prefix placed in front of ``model_id``.
    suffix : str, optional
        Extra text appended to every output label.

    Notes
    -----
    The module-level ``df_plot`` is modified: the columns ``'replaced'`` and
    ``'added'`` are written into it (aligned on the k-mer index), as the
    original code did.
    """
    tag = '_' + model_id + suffix
    delta_label = r'$\Delta C$'

    # Stay with the original trained set of k-mers.
    # load_np_model already provides the 'Delta' column, so it is not recomputed.
    df_train = load_np_model(prefix+model_id+'/nanopolish_train/r9.4_450bps.'+mod_alphabet+'.6mer.template.round4.model')
    _scatter_by_mod_index(df_train, 'value_canon', 'value',
                          'Canonical value', 'Modified value (direct)')
    plot_finish('directmodel' + tag)

    # Rows whose modified value differs from the canonical value
    is_trained = df_train['value_canon'] != df_train['value']
    df_trained = df_train[is_trained]
    _scatter_by_mod_index(df_trained, 'value_canon', 'value',
                          'Canonical value', 'Modified value (trained)')
    plot_finish('trainedmodel' + tag)
    # Delta version
    _scatter_by_mod_index(df_trained, 'value_canon', 'Delta',
                          'Canonical value', delta_label, zero_line=True)
    plot_finish_delta('traineddelta' + tag)

    # Adding requant imputation stuff
    df_add = load_np_model(prefix+model_id+'/requant/'+model_id+'.added.model')
    _scatter_by_mod_index(df_add, 'value_canon', 'value',
                          'Canonical value', 'Modified value (add)')
    plot_finish('addedmodel' + tag)
    # Delta version (from here on df_add holds only rows that differ from canonical)
    df_add = df_add[df_add['value_canon'] != df_add['value']]
    _scatter_by_mod_index(df_add, 'value_canon', 'Delta',
                          'Canonical value', delta_label, zero_line=True)
    plot_finish_delta('addeddelta' + tag)

    df_rep = load_np_model(prefix+model_id+'/requant/'+model_id+'.replaced.model')
    _scatter_by_mod_index(df_rep, 'value_canon', 'value',
                          'Canonical value', 'Modified value (replace)')
    plot_finish('replacedmodel' + tag)
    # Delta version (from here on df_rep holds only rows that differ from canonical)
    df_rep = df_rep[df_rep['value_canon'] != df_rep['value']]
    _scatter_by_mod_index(df_rep, 'value_canon', 'Delta',
                          'Canonical value', delta_label, zero_line=True)
    plot_finish_delta('replaceddelta' + tag)

    # Added vs replaced, on top of the global base-model frame.
    # Assignment aligns on the k-mer index; rows filtered out above become NaN.
    df_plot['replaced'] = df_rep['value']
    df_plot['added'] = df_add['value']
    _scatter_by_mod_index(df_plot, 'added', 'replaced',
                          'Modified value (add)', 'Modified value (replace)')
    plot_finish('addvsreplaced' + tag)

    # Added-model rows restricted to the k-mers that were trained directly.
    # The mask was built on df_train, so align it to df_add explicitly; k-mers
    # missing from df_train count as "not trained" instead of raising.
    trained_in_add = is_trained.reindex(df_add.index, fill_value=False)
    df_wtf = df_add[trained_in_add]
    _scatter_by_mod_index(df_wtf, 'value_canon', 'value',
                          'Canonical value', 'Modified value (add)')
    plot_finish('wtf' + tag)

    # This makes little sense, the canonical values are different between the
    # base nanopolish model and the trained ones.
    # df_plot['replaced'] was already filled above and nothing changed since.
    _scatter_by_mod_index(df_plot, 'value', 'replaced',
                          'Modified value (known)', 'Modified value (replace)')
    plot_finish('replaceddiff' + tag)


# Palette for the base-model plots: six "husl" colours. Other palettes that
# were tried are kept commented out.
# sns.set_palette("colorblind")
sns.set_palette("husl",6)
# sns.set_palette("rocket",6)
# sns.set_palette("Set2")
# Base nanopolish model for the active alphabet. df_plot stays a module-level
# frame that later cells and plot_models() read and extend.
df_plot = load_np_model('../../nanopolish/nanopolish/etc/r9-models/r9.4_450bps.'+mod_alphabet+'.6mer.template.model')
# sns.scatterplot(data=df_plot,x='value_canon',y='value',hue='Mod index',linewidth=0,s=10)

# Joint plot: modified value against canonical value, coloured by motif position.
jg = sns.jointplot(data=df_plot,x='value_canon',y='value',hue='Mod index',linewidth=.25,s=10)
# These go through pyplot's current axes; the joint axes are labelled
# explicitly again further down.
plt.xlabel('Canonical value')
plt.ylabel('Modified value')
# Dashed black diagonal on the joint axes
jg.ax_joint.plot([55,135],[55,135],linestyle='--',c='k',linewidth=1)
# plt.plot([55,135],[55,135],linestyle='--',c='k',linewidth=1)
jg.ax_joint.set_xlim(55,125)
jg.ax_joint.set_ylim(55,125)
# Remove fill in under curves:
# NOTE(review): assumes the first six children of each marginal axis are the
# filled curves (presumably one per 'Mod index' level) - confirm, since this
# depends on the order in which seaborn adds artists.
for child in jg.ax_marg_x.get_children()[:6]:
    child.set_facecolor('#00000000')
for child in jg.ax_marg_y.get_children()[:6]:
    child.set_facecolor('#00000000')
    
jg.ax_joint.set_xlabel('Canonical value')
jg.ax_joint.set_ylabel('Modified value')

# jg.ax_marg_y.get_children()[0].set_facecolor('k') 
plt.tight_layout()
# Same output directory and file prefix as plot_finish(), written inline here.
plt.savefig('./data/6mer/6mer_plot_s_'+'fullmodel_'+mod_alphabet+'.pdf')
plt.show()
# plot_finish('fullmodel_'+mod_alphabet)

# Disabled Excel export of df_plot:
# # create a excel writer object
# with pd.ExcelWriter("./data/np_to_excel_"+mod_alphabet+".xlsx") as writer:
   
#     # use to_excel function and specify the sheet_name and index
#     # to store the dataframe in specified sheet
#     df_plot.to_excel(writer, sheet_name="Raw model")
#     df_plot.to_excel(writer, sheet_name="Matched kmers")

# Run whose trained / added / replaced models are plotted by plot_models().
model_id = 'p25_2'
# Alternative data sets (only one prefix is active at a time):
# # prefix = 'data/refcuts_CG_test_2/'
# prefix = 'data/output_CpG_gluc/refcuts_CG_test_2/'
# prefix = 'data/output_GpC_meth/refcuts_GC_test_2/'
prefix = 'data/output_CpG_meth/refcuts_CG_test_2/'
plot_models(model_id,prefix,suffix='_'+mod_alphabet+'_meth')




In [None]:


# Load model in the pre-pandas requant way:
# NOTE(review): the three return values are presumably canonical values,
# modified values and standard deviations keyed by k-mer - confirm in requant.

dict_canon, dict_modif, dict_main_sd = rq.load_polishmodel('../../nanopolish/nanopolish/etc/r9-models/r9.4_450bps.'+mod_alphabet+'.6mer.template.model')

# eff_dict is the only input rq.impute_mod() receives besides the k-mer.
eff_dict = rq.my_shifter(dict_canon,dict_modif)
# imputes = rq.impute_table(dict_canon,dict_modif,eff_dict)

# Spot check on a single k-mer. NOTE(review): the result is neither assigned
# nor printed.
df_plot.loc['AAMGTT']['value']-rq.impute_mod(eff_dict,'AAMGTT')

# diff_og:      modified minus canonical value from the model file
# diff_replace: negated rq.impute_mod() result per k-mer
#               (NOTE(review): sign convention of impute_mod - confirm)
# diff_repdiff: diff_replace minus diff_og
df_plot['diff_og'] = df_plot['value']-df_plot['value_canon']
df_plot['diff_replace'] = [-rq.impute_mod(eff_dict,x) for x in df_plot.index]
df_plot['diff_repdiff'] = df_plot['diff_replace']-df_plot['diff_og']
# df_plot.loc['AAMGTT']

def joint_plot_diff(y_col,y_label):
    """Joint plot of ``df_plot[y_col]`` against the canonical value, saved as a PDF.

    Reads the module-level ``df_plot`` and colours points by 'Mod index'.
    The figure is written to
    ``./data/6mer/6mer_plot_s_fullmodel_<y_col>_delta.pdf`` and then shown.

    Parameters
    ----------
    y_col : str
        Column of ``df_plot`` plotted on the y axis; also part of the file name.
    y_label : str
        Accepted but not used: the y axis is always labelled with Delta C.
    """
    grid = sns.jointplot(data=df_plot, x='value_canon', y=y_col,
                         hue='Mod index', linewidth=.25, s=10)
    joint_ax = grid.ax_joint
    sns.move_legend(joint_ax, "upper right")

    joint_ax.set_xlim(55, 125)
    joint_ax.set_ylim(-12.5, 15.5)

    # Remove fill in under curves: make the first six artists of each
    # marginal axis fully transparent (x marginal first, then y).
    transparent = '#00000000'
    for marginal_ax in (grid.ax_marg_x, grid.ax_marg_y):
        for artist in marginal_ax.get_children()[:6]:
            artist.set_facecolor(transparent)

    # Dashed zero line for reference
    joint_ax.axhline(0, linestyle='--', c='k', linewidth=1)

    joint_ax.set_xlabel('Canonical value')
    joint_ax.set_ylabel(r'$\Delta C$')

    plt.tight_layout()
    out_path = './data/6mer/6mer_plot_s_' + 'fullmodel_' + y_col + '_delta.pdf'
    plt.savefig(out_path)
    plt.show()

# One joint plot per delta column of df_plot. The second argument describes
# the column; joint_plot_diff() does not use it for the axis label.
joint_plot_diff('diff_og','Known modified delta')
joint_plot_diff('diff_replace','Imputed modified delta')
joint_plot_diff('diff_repdiff','Imputed - known modified delta')

In [None]:
# Another plot showing the deltas directly by subtracting the canonical values from the mod values
# (the 'Delta' column is already provided by load_np_model()).
# df_plot['Delta'] = df_plot['value']-df_plot['value_canon']
# df_plot
jg = sns.jointplot(data=df_plot,x='value_canon',y='Delta',hue='Mod index',linewidth=.25,s=10)

# Axis limits are left automatic here; the fixed limits are disabled.
# jg.ax_joint.plot([55,135],[55,135],linestyle='--',c='k',linewidth=1)
# plt.plot([55,135],[55,135],linestyle='--',c='k',linewidth=1)
# jg.ax_joint.set_xlim(55,125)
# jg.ax_joint.set_ylim(55,125)
# Remove fill in under curves:
# NOTE(review): assumes the first six children of each marginal axis are the
# filled curves - confirm against the seaborn version in use.
for child in jg.ax_marg_x.get_children()[:6]:
    child.set_facecolor('#00000000')
for child in jg.ax_marg_y.get_children()[:6]:
    child.set_facecolor('#00000000')
# Dashed zero line for reference
jg.ax_joint.axhline(0,linestyle='--',c='k',linewidth=1)

jg.ax_joint.set_xlabel('Canonical value')
jg.ax_joint.set_ylabel(r'$\Delta C$')
    
# jg.ax_marg_y.get_children()[0].set_facecolor('k') 
plt.tight_layout()
plt.savefig('./data/6mer/6mer_plot_s_'+'fullmodel_'+mod_alphabet+'_delta.pdf')
plt.show()

In [None]:

sns.set_palette("tab10")
# Colour per base for the per-position scatter plots, given as matplotlib
# colour-cycle names ('C0'..'C7'), which resolve against the palette set above.
base_colors = {
    'A':'C2',
    'C':'C0',
    'G':'C1',
    'T':'C3',
    'M':'C7'}

# Alternative with explicit colour names:
# base_colors = {
#     'A':'limegreen',
#     'C':'royalblue',
#     'G':'gold',
#     'T':'red',
#     'M':'C7'}
# sns.set(rc={'figure.figsize':(5,5)})
# sns.set_style('white')

mpl.rcParams['figure.figsize'] = 5,5
# Use the same alphabet tag as the other cells. The previously hard-coded
# 'CpG' differs in case from mod_alphabet ('cpg'), which only resolves to the
# same file on a case-insensitive filesystem, and it could drift from the
# motif globals that load_np_model() reads.
df_mod = load_np_model('../../nanopolish/nanopolish/etc/r9-models/r9.4_450bps.'+mod_alphabet+'.6mer.template.model')
# Lower-case 'delta' duplicates the 'Delta' column; the shift computation
# further down reads this lower-case column.
df_mod['delta'] = df_mod['value']-df_mod['value_canon']

def scatter_base_at(mod_index,base_index,base_palette=base_colors,style='value'):
    """Scatter k-mers with the motif at *mod_index*, coloured by the base at *base_index*.

    Reads the module-level ``df_mod``. K-mers whose motif sits at another
    position are drawn as small light-gray background points. The figure is
    saved to ``./data/6mer/6mer_plot_s_<style><mod_index>_<base_index>.pdf``
    and then shown.

    Parameters
    ----------
    mod_index : int
        Motif position (``mod_pos``) of the k-mers to highlight.
    base_index : int
        0-based position in the k-mer whose base defines the point colour.
    base_palette : dict, optional
        Maps base letters to colours; passed to seaborn as ``palette``.
    style : str, optional
        ``'delta'`` plots the ``Delta`` column and ``'delta_shift'`` the
        ``delta_shift`` column, both with a zero line. Any other value plots
        ``value`` with a diagonal and fixed 70..125 limits.

    Notes
    -----
    Changes compared to the earlier version:

    * the temporary hue column lives in a local frame, so ``df_mod`` is left
      untouched (the earlier ``df_mod.drop(...)`` discarded its result);
    * legend entries are renamed from their own text rather than by assuming
      the order A, C, G, T;
    * the y axis is labelled 'Modified value' when absolute values are shown;
    * a leftover debug ``print`` was removed.
    """
    plt.figure()

    hue_col = 'Base at pos: '+str(base_index+1)

    # Local frame with the base at base_index as an extra column
    frame = df_mod.assign(**{hue_col: list(df_mod.index.str[base_index])})
    at_pos = frame[frame['mod_pos']==mod_index]
    elsewhere = frame[frame['mod_pos']!=mod_index]

    # Both delta styles share one code path; everything else plots raw values.
    y_col = {'delta': 'Delta', 'delta_shift': 'delta_shift'}.get(style, 'value')
    shows_delta = y_col != 'value'

    if shows_delta:
        plt.scatter(elsewhere['value_canon'],elsewhere[y_col],c='lightgray',s=1)
        plt.axhline(0,linestyle='--',c='k',linewidth=1)
    else:
        plt.plot([55,135],[55,135],linestyle='--',c='k',linewidth=1)
        plt.scatter(elsewhere['value_canon'],elsewhere['value'],c='lightgray',s=1)

    g = sns.scatterplot(data=at_pos,x='value_canon',y=y_col,hue=hue_col,palette=base_palette, s=15)

    if not shows_delta:
        plt.xlim(70,125)
        plt.ylim(70,125)

    plt.xlabel('Canonical value')
    plt.ylabel(r'$\Delta C$' if shows_delta else 'Modified value')

    def get_bmer(base):
        # k-mer pattern such as 'NNAMGN': motif at mod_index, *base* at
        # base_index, 'N' everywhere else (6-mer models).
        kmer = mod_index*'N' + mod_motif_M + (6-mod_index-len(mod_motif_M))*'N'
        return kmer[:base_index] + base + kmer[base_index+1:]

    # Rename single-base legend entries to their k-mer pattern. Any other
    # text in the legend (e.g. a title row) is left alone.
    leg = g.axes.get_legend()
    if leg is not None:
        for entry in leg.texts:
            base = entry.get_text()
            if len(base) == 1:
                entry.set_text(get_bmer(base))

    plt.tight_layout()
    plt.savefig('./data/6mer/6mer_plot_s_'+style+str(mod_index)+'_'+str(base_index)+'.pdf')
    plt.show()


# Single colour for the (currently disabled) variant that paints every base alike.
same_base_color = 'C1'
# scatter_base_at(3,5,base_palette={
#     'A':same_base_color,
#     'C':same_base_color,
#     'G':same_base_color,
#     'T':same_base_color,
#     'M':same_base_color})
# scatter_base_at(3,5)
# Motif at position 3, coloured by the base at index 2 and at index 1:
# first as raw values, then as deltas.
scatter_base_at(3,2)
scatter_base_at(3,1)
scatter_base_at(3,2,style='delta')
scatter_base_at(3,1,style='delta')
# scatter_base_at(3,0)

In [None]:
# print(df_mod)

# Median 'delta' per motif position 0..4, used below to centre the deltas.
# Position -1 (k-mers with multiple motifs) gets a shift of 0.
shifter = {-1: 0}
for i in range(5):
    shifter[i] = np.median(df_mod.loc[df_mod['mod_pos'] == i, 'delta'])

# def func(num, condition, shifter):
#     return num * shifter[condition]

# df_mod['delta_shift'] = [func(df_mod.loc[idx, 'delta'], df_mod.loc[idx, 'mod_pos'], shifter) for idx in range(len(df_mod))]


def func(number,condition):
    """Return *number* minus the shift stored in the global ``shifter`` for *condition*."""
    offset = shifter[condition]
    return number - offset

# Centre every delta on the median of its motif position: each row's 'delta'
# minus shifter[mod_pos].
df_mod['delta_shift'] = df_mod.apply(lambda x: func(x['delta'], x['mod_pos']), axis=1)

# NOTE(review): bare expression in the middle of the cell; its value is not
# assigned or printed.
df_mod

# Same per-base plots as before, now on the centred deltas.
scatter_base_at(3,2,style='delta_shift')
scatter_base_at(3,1,style='delta_shift')