This notebook visualizes the call statistics produced by `callstat_2.py` as a series of plots.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Write PDF text with font type 42 (TrueType, per the matplotlib docs) — presumably so labels stay editable; confirm
plt.rcParams['pdf.fonttype'] = 42
# NOTE(review): seaborn documents that set_style's `rc` only overrides style-definition
# parameters, so 'figure.figsize' is probably ignored here — confirm
sns.set_style(style='white',rc={'figure.figsize':(16,8)})
# Default (motif, modification) combinations used as `run_sets` by the plotting functions
all_sets = [('CpG','meth'),('GpC','meth'),('CpG','gluc')]



# Read the CSV file
def read_csv(filepath,motif='N/A',mod='N/A'):
    """Load one space-delimited, header-less call-statistics file.

    The eleven input columns are named rep_id, contexts, kmers, modstate,
    impute, sites, llr, posperc, poscalls, negcalls and sitecount. Derived
    columns are added on top:

    * ``kmer_perc`` / ``perc_repeat``: the two integers encoded in ``rep_id``
      (e.g. ``'p25_2'`` -> 25 and 2); a ``kmer_perc`` of 90 is relabelled 85.
    * ``pos_freq``: poscalls / (poscalls + negcalls).
    * ``drop_freq``: fraction of ``sitecount`` that is neither a positive nor
      a negative call.
    * ``motif`` / ``mod``: constant columns holding the given arguments.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the file.
        motif: Value stored in the ``motif`` column of every row.
        mod: Value stored in the ``mod`` column of every row.

    Returns:
        A new DataFrame sorted by imputation type (not-imputed,
        missing-imputed, fully-imputed) that only keeps rows with
        ``sitecount > 0``.
    """
    df = pd.read_csv(filepath,delimiter=' ', header=None)
    df.columns = ['rep_id','contexts','kmers','modstate','impute','sites','llr',
                  'posperc','poscalls','negcalls','sitecount']

    # Split callset identifiers once and reuse the pieces for both columns
    id_parts = [rep_id.split('_') for rep_id in df['rep_id']]
    df['kmer_perc'] = [int(parts[0][1:]) for parts in id_parts]
    df['perc_repeat'] = [int(parts[1]) for parts in id_parts]

    # Recalculate call frequency
    called = df['poscalls']+df['negcalls']
    df['pos_freq'] = df['poscalls']/called
    df['drop_freq'] = (df['sitecount']-called)/df['sitecount']

    # Rename model imputation types (column-wise, no in-place mutation)
    df['impute'] = df['impute'].replace(
        {'direct':'not-imputed','added':'missing-imputed','replaced':'fully-imputed'})

    # Sort using custom order by imputation type
    df['impute'] = pd.Categorical(
        df['impute'],categories=['not-imputed', 'missing-imputed', 'fully-imputed'])
    df = df.sort_values('impute')

    # Target was 90 but reality got 0.85078125 max
    df['kmer_perc'] = df['kmer_perc'].replace({90:85})

    # Take an explicit copy: assigning columns to a boolean-filtered slice
    # triggers pandas' SettingWithCopyWarning.
    df = df[df['sitecount']>0].copy()

    df['motif'] = motif
    df['mod'] = mod

    return df

# Grouping data to see if everything makes sense
def test_read_csv(df):
    """Print and plot a quick sanity check of a call-statistics frame.

    Prints the head of ``df``, draws a box plot of ``pos_freq`` for every
    (kmer_perc, modstate, impute) group and prints each group's ``pos_freq``
    values.

    Args:
        df: DataFrame with ``kmer_perc``, ``modstate``, ``impute`` and
            ``pos_freq`` columns.
    """
    print(df.head())
    grouped = df.groupby(['kmer_perc','modstate','impute'])
    # A former `grouped.mean('pos_freq')` call was dropped here: it passed the
    # column name as the `numeric_only` flag and threw the result away.
    grouped['pos_freq'].plot(kind='box', title='title_of_plot')
    for name, group in grouped:
        print(name,'\n',group['pos_freq'])


# motif, mod = 'GpC','meth'
# data_dir = f'data/output_{motif}_{mod}/'
# df = read_csv(f'{data_dir}/{motif}_{mod}_callstat_2.csv',motif,mod)
# df = df[df['sites']=='test']
# df = df[df['llr']==2.0]
# df[df['kmer_perc']<=100]
# df[df['modstate']=='canon']

def load_all_data(run_sets=None):
    """Read the call statistics of several (motif, mod) sets into one frame.

    For every pair the file
    ``data/output_{motif}_{mod}/{motif}_{mod}_callstat_2a.csv`` is loaded with
    ``read_csv`` and tagged with its motif and mod.

    Args:
        run_sets: Iterable of ``(motif, mod)`` pairs. Defaults to the three
            sets CpG/meth, GpC/meth and CpG/gluc.

    Returns:
        The concatenation of all loaded frames (original row indices kept).
    """
    if run_sets is None:
        run_sets = [('CpG','meth'),('GpC','meth'),('CpG','gluc')]
    # Build the path in one piece; the old version produced a doubled slash
    frames = [
        read_csv(f'data/output_{motif}_{mod}/{motif}_{mod}_callstat_2a.csv',motif,mod)
        for motif,mod in run_sets
    ]
    return pd.concat(frames)
# Load every callset once; later cells filter and plot this combined frame
df_all = load_all_data()

# Show the combined frame as the cell output
df_all

In [None]:

# Inspect one callset: CpG/meth, train sites, llr 2.0, modified reads, rep 'p25_2'
df = df_all[
    (df_all['motif']=='CpG')
    & (df_all['mod']=='meth')
    & (df_all['sites']=='train')
    & (df_all['llr']==2.0)
    & (df_all['kmer_perc']<=100)
    & (df_all['modstate']=='mod')
    & (df_all['rep_id']=='p25_2')
]
df

In [None]:
def fix_ticks(axes,show=True):
    """Show or hide the tick labels of every axes in an axes array.

    Args:
        axes: Array of matplotlib axes (anything offering ``flatten()``).
        show: True to make x/y tick labels visible, False to hide them.
    """
    flat_axes = list(axes.flatten())

    # First pass: toggle the tick label objects directly (y labels, then x labels)
    for axis in flat_axes:
        for tick_label in [*axis.get_yticklabels(), *axis.get_xticklabels()]:
            tick_label.set_visible(show)

    # Second pass as a fallback: set the same visibility through the tick parameters
    for axis in flat_axes:
        axis.xaxis.set_tick_params(labelbottom=show)
        axis.yaxis.set_tick_params(labelleft=show)

def add_labels(fig, axes, labels=('a.','b.','c.')):
    """Write a bold panel label at the top-left corner of each given axes.

    Axes and labels are paired with ``zip``, so surplus axes or labels are
    silently ignored.

    Args:
        fig: Figure that owns the axes; the text is added to it.
        axes: Iterable of axes to label.
        labels: Label strings, one per axes (immutable default).
    """
    # The renderer does not change between axes, so fetch it once
    renderer = fig.canvas.get_renderer()
    for ax, label in zip(axes, labels):
        bbox = ax.get_tightbbox(renderer)
        # transform=None: presumably places the text directly in the bbox's
        # coordinate system instead of figure fractions — confirm
        fig.text(bbox.x0, bbox.y1, label, fontsize=16, fontweight="bold", va="top", ha="left",
                transform=None)

def _boxplots_combined(df_a, value_col, ylabel, title_what, file_tag,
                       sites, extra, llr, max_kmer_perc, run_sets, fig_width, label):
    """Shared implementation of the combined call/keep statistic box plots.

    Draws a 2 x len(run_sets) grid: top row for the 'mod' modstate, bottom row
    for 'canon', one column per (motif, mod) set, with ``value_col`` on the y
    axis, ``kmer_perc`` on the x axis and ``impute`` as hue. The x-axis label
    is read from the module-level ``xlabel`` variable, which must be set
    before calling. The figure is saved as PDF and PNG under ``./data``.

    Returns:
        ``(fig, axs)`` exactly as produced by ``plt.subplots``.
    """
    sns.set(rc={'figure.figsize':(fig_width,12)})
    fig, axs = plt.subplots(ncols=len(run_sets),nrows=2,sharey='row',sharex='col')
    # plt.subplots squeezes a single column to a 1-D array; use a 2-D view so
    # indexing is uniform while the caller still receives the original `axs`.
    grid = axs.reshape(2, -1)

    for index,(motif,mod) in enumerate(run_sets):
        # Filter out irrelevant results
        keep = (
            (df_a['motif']==motif)
            & (df_a['mod']==mod)
            & (df_a['sites']==sites)
            & (df_a['llr']==llr)
            & (df_a['kmer_perc']<=max_kmer_perc)
        )
        df = df_a[keep]

        for row,(modstate,dataset) in enumerate((('mod','modified'),('canon','canonical'))):
            my_ax = grid[row,index]
            sns.boxplot(data=df[df['modstate']==modstate], x="kmer_perc", y=value_col,hue='impute',ax=my_ax)
            my_ax.set_title(f'{motif} {mod}: {dataset} dataset')
            my_ax.set_xlabel(xlabel)
            my_ax.set_ylabel(ylabel)
            my_ax.grid(True)

    # Once is enough: the call touches every axes of the grid
    fix_ticks(axs)
    plt.suptitle(sites+' sites: '+title_what)
    plt.tight_layout()
    if label:
        # Label the top row, whatever the number of columns (the previous
        # hard-coded axs[0,0..2] raised IndexError unless there were 3 sets)
        add_labels(fig,list(grid[0]))
    out_base = './data/boxplot_callstat_2_'+extra+sites+file_tag
    plt.savefig(out_base+'.pdf')
    plt.savefig(out_base+'.png')

    return fig, axs


def boxplots_callstat_combined(df_a,sites='test',extra='',llr=2.0,max_kmer_perc=100,run_sets=all_sets,fig_width=18,label=True):
    """Box plots of the positive call frequency (``pos_freq``) per set.

    Args:
        df_a: Combined call-statistics frame (as built from ``read_csv``).
        sites: Value of the ``sites`` column to keep.
        extra: Prefix inserted into the output file names.
        llr: Value of the ``llr`` column to keep.
        max_kmer_perc: Largest ``kmer_perc`` to include.
        run_sets: (motif, mod) pairs, one subplot column each.
        fig_width: Figure width passed to seaborn; the height is fixed at 12.
        label: Add 'a.', 'b.', ... panel labels to the top row.

    Returns:
        ``(fig, axs)``; files ``./data/boxplot_callstat_2_{extra}{sites}-calls``
        with ``.pdf`` and ``.png`` extensions are written as a side effect.
    """
    return _boxplots_combined(
        df_a, 'pos_freq', 'Positive call frequency', 'positive call frequencies', '-calls',
        sites, extra, llr, max_kmer_perc, run_sets, fig_width, label)


def boxplots_keepstat_combined(df_a,sites='test',extra='',llr=2.0,max_kmer_perc=100,run_sets=all_sets,fig_width=18,label=True):
    """Box plots of the discarded call frequency (``drop_freq``) per set.

    Takes the same arguments as ``boxplots_callstat_combined``.

    Returns:
        ``(fig, axs)``; files ``./data/boxplot_callstat_2_{extra}{sites}-drops``
        with ``.pdf`` and ``.png`` extensions are written as a side effect.
    """
    return _boxplots_combined(
        df_a, 'drop_freq', 'Discarded call frequency', 'discarded call frequencies', '-drops',
        sites, extra, llr, max_kmer_perc, run_sets, fig_width, label)
# for sites in ['test','train','all','zero']:
#     boxplots_callstat_combined(sites=sites)
#     boxplots_keepstat_combined(sites=sites)
# boxplots_callstat_combined()

# x-axis label; the boxplot functions read this module-level variable
xlabel = 'Percentage of modified k-mers in training set'
# Positive call frequency figures, one per site selection
boxplots_callstat_combined(df_all,sites='test')
boxplots_callstat_combined(df_all,sites='train')
boxplots_callstat_combined(df_all,sites='zero',max_kmer_perc=50)

# Discarded call frequency figures for the same site selections
boxplots_keepstat_combined(df_all,sites='test')
boxplots_keepstat_combined(df_all,sites='train',label=False)
boxplots_keepstat_combined(df_all,sites='zero',label=False, max_kmer_perc=50)

In [None]:
import matplotlib.ticker as mticker
# Kept for backwards compatibility; fix_log_ticks no longer shares this instance
locmin = mticker.LogLocator(base=10, subs=np.arange(0.1,1,0.1),numticks=10)
def fix_log_ticks(ax):
    """Give the y axis of ``ax`` unlabelled minor ticks at 0.1..0.9 of each decade.

    Args:
        ax: Axes whose y axis gets the minor locator and an empty formatter.
    """
    # matplotlib advises against using one Locator instance on several axes
    # (a locator keeps a reference to its axis), so build a fresh one per call.
    minor_locator = mticker.LogLocator(base=10, subs=np.arange(0.1,1,0.1),numticks=10)
    ax.yaxis.set_minor_locator(minor_locator)
    ax.yaxis.set_minor_formatter(mticker.NullFormatter())

def boxplots_colstat_combined(df_a,sites='test',llr=2.0,max_kmer_perc=100,plot_col='sitecount',run_sets=all_sets):
    """Draw line + scatter plots of one column against ``kmer_perc``.

    Nothing is created or returned here: the function draws onto the
    module-level ``axs`` grid and also reads the module-level ``xlabel`` and
    ``x_ticks``, so those must exist before calling. The top row shows the
    'mod' modstate, the bottom row 'canon'; each (motif, mod) set gets its own
    column. Every series is labelled with ``sites``.

    Args:
        df_a: Call-statistics frame to plot from.
        sites: Value of the ``sites`` column to keep (also the legend label).
        llr: Value of the ``llr`` column to keep.
        max_kmer_perc: Largest ``kmer_perc`` to include.
        plot_col: Column plotted on the y axis.
        run_sets: (motif, mod) pairs, one subplot column each.
    """
    ylabel = 'Number of calls retained'
    several_sets = len(run_sets)>1
    panels = (('mod','modified'),('canon','canonical'))

    for index,(motif,mod) in enumerate(run_sets):
        # Filter out irrelevant results
        selected = df_a[
            (df_a['motif']==motif)
            & (df_a['mod']==mod)
            & (df_a['sites']==sites)
            & (df_a['llr']==llr)
            & (df_a['kmer_perc']<=max_kmer_perc)
        ]

        for row,(modstate,dataset) in enumerate(panels):
            my_ax = axs[row,index] if several_sets else axs[row]
            subset = selected[selected['modstate']==modstate]
            sns.lineplot(data=subset, x="kmer_perc", y=plot_col, ax=my_ax, err_style='bars')
            sns.scatterplot(data=subset, x="kmer_perc", y=plot_col, ax=my_ax, label=sites)
            my_ax.set_title(f'{motif} {mod}: {dataset} dataset')
            my_ax.set_xlabel(xlabel)
            my_ax.set_ylabel(ylabel)
            my_ax.grid(True)

        # my_ax is the bottom ('canon') panel of this column at this point
        my_ax.set_xticks(x_ticks)
        fix_ticks(axs)

    # A log-scaled y axis (together with fix_log_ticks) was tried here and is
    # currently disabled.

# Globals read by boxplots_colstat_combined: xlabel, axs and x_ticks
xlabel = 'Percentage of modified k-mers in training set'
sns.set(rc={'figure.figsize':(18,12)})
fig, axs = plt.subplots(ncols=3,nrows=2,sharex='col')
x_ticks = np.sort(df_all['kmer_perc'].unique())
# NOTE(review): the name reads like "not imputed" but the filter keeps 'missing-imputed' rows — confirm intent
df_ni = df_all[df_all['impute']=='missing-imputed']
# Each call draws onto the same axs grid, overlaying the three site selections
boxplots_colstat_combined(df_ni,sites='test')
boxplots_colstat_combined(df_ni,sites='train')
boxplots_colstat_combined(df_ni,sites='zero',max_kmer_perc=50)
plt.suptitle('Number sites called in dataset')
plt.tight_layout()
# Panel labels for the three top-row axes
add_labels(fig,[axs[0,0],axs[0,1],axs[0,2]])
# Both paths resolve to ./data/boxplot_callstat_2_all-sitecount.{pdf,png}
plt.savefig('./data/boxplot_callstat_2_'+'all'+'-'+'sitecount'+'.pdf')
plt.savefig('./data/boxplot_callstat_2_'+'all'+'-'+'sitecount'+'.png')

In [None]:
# Inspect repeat 2 of the CpG/meth 'mod' test-site rows at llr 2.0, ordered by kmer_perc
df_ni[(df_ni['perc_repeat']==2) & (df_ni['modstate']=='mod') & (df_ni['sites']=='test') & (df_ni['llr']==2.0) & (df_ni['motif']=='CpG') & (df_ni['mod']=='meth')].sort_values('kmer_perc')

# df_ni = df_all[df_all['impute']=='missing-imputed']
# boxplots_colstat_combined(df_ni,sites='test',plot_col='contexts')
# boxplots_colstat_combined(df_ni,sites='train',plot_col='contexts')
# boxplots_colstat_combined(df_ni,sites='zero',plot_col='contexts',max_kmer_perc=50)

# df_ni.groupby(['motif','mod','modstate','sites'])['sitecount'].describe()[['min','50%']]


# 
# df_ni

In [None]:
def lineplots_callstat(df, title='', label_x='Percentage of modified k-mers kept'):
    """Plot ``pos_freq`` against ``kmer_perc`` as lines with scatter points.

    A new 8x8 figure is created. The 'mod' rows are drawn with solid lines and
    the 'canon' rows with dashed lines, both coloured by ``impute``. Only the
    first three legend entries are kept.

    Args:
        df: Call-statistics frame with ``kmer_perc``, ``pos_freq``,
            ``modstate`` and ``impute`` columns.
        title: Prefix for the plot title.
        label_x: x-axis label (previously accepted but ignored).
    """
    plt.figure(figsize=[8,8])
    # NOTE(review): seaborn documents that set_style's `rc` only overrides style
    # parameters, so the figsize entry is probably ignored; plt.figure above
    # already sets the size.
    sns.set_style(style='white',rc={'figure.figsize':(8,8)})
    ticks=sorted(df['kmer_perc'].unique())

    # Modified reads: solid lines. Canonical reads: dashed lines.
    for modstate, line_kwargs in (('mod', {}), ('canon', {'linestyle': '--'})):
        subset = df[df['modstate']==modstate]
        sns.lineplot(data=subset, x="kmer_perc", y="pos_freq",hue='impute', err_style='bars', **line_kwargs)
        g = sns.scatterplot(data=subset, x="kmer_perc", y="pos_freq",hue='impute')

    g.set_xticks(ticks)

    # Keep only the first three legend entries; the later ones are presumably
    # repeats of the same impute categories — confirm
    h, l = g.get_legend_handles_labels()
    g.legend(h[0:3], l[0:3])

    g.set_title(title+' Mod call frequency')
    # Honour the caller's label; the default equals the old hard-coded text
    g.set_xlabel(label_x)
    g.set_ylabel('Positive call frequency')



# Load a debug statistics file (motif/mod keep their 'N/A' defaults) and plot it
df = read_csv('data/debug_callstat2.txt')
lineplots_callstat(df,'CpG Meth: ')

In [None]:

# Load the "minimal" CpG/meth statistics; motif and mod are set after loading
# because read_csv is called with its 'N/A' defaults here
df_min_CpG_meth = read_csv("./data/minimal/CpG_meth_minimal_2_callstat_2a.csv")
df_min_CpG_meth['motif'] = 'CpG'
df_min_CpG_meth['mod'] = 'meth'

# df_min_GpC_meth = read_csv("./data/minimal/GpC_meth_minimal_callstat_2a.csv")
# df_min_GpC_meth['motif'] = 'GpC'
# df_min_GpC_meth['mod'] = 'meth'

# df_min_CpG_gluc = read_csv("./data/minimal/CpG_gluc_minimal_callstat_2a.csv")
# df_min_CpG_gluc['motif'] = 'CpG'
# df_min_CpG_gluc['mod'] = 'gluc'


# Only the CpG/meth frame is combined; the other two sets are commented out above
df_min = pd.concat([df_min_CpG_meth])#,df_min_GpC_meth,df_min_CpG_gluc])



def add_labels_2(fig, axes, labels=('a.','b.')):
    """Write a bold panel label at the top-left corner of each given axes.

    Two-label variant of ``add_labels``; it is not called in the notebook
    cells visible here. Axes and labels are paired with ``zip``, so surplus
    axes or labels are silently ignored.

    Args:
        fig: Figure that owns the axes; the text is added to it.
        axes: Iterable of axes to label.
        labels: Label strings, one per axes (immutable default).
    """
    # The renderer does not change between axes, so fetch it once
    renderer = fig.canvas.get_renderer()
    for ax, label in zip(axes, labels):
        bbox = ax.get_tightbbox(renderer)
        fig.text(bbox.x0, bbox.y1, label, fontsize=16, fontweight="bold", va="top", ha="left",
                transform=None)

# Rebind the global x-axis label that the boxplot functions read
xlabel = 'Base coverage in training set'
# Single-set figure of positive call frequencies; the panel label is added manually below
fig,axs = boxplots_callstat_combined(df_min,extra='minimal_',sites='test',run_sets=[('CpG','meth')],fig_width=12,label=False)
# zip inside add_labels stops after the single label, so only the first axes is labelled
add_labels(fig,axs,labels=['a.'])
plt.show()


# Same layout for the discarded call frequencies, labelled 'b.'
fig,axs = boxplots_keepstat_combined(df_min,extra='minimal_',sites='test',run_sets=[('CpG','meth')],fig_width=12,label=False)
add_labels(fig,axs,labels=['b.'])
plt.show()


# sns.set(rc={'figure.figsize':(12,12)})
# fig, axs = plt.subplots(ncols=1,nrows=2,sharex='col')
# x_ticks = np.sort(df_min['kmer_perc'].unique())
# df_nim = df_min[df_min['impute']=='missing-imputed']
# boxplots_colstat_combined(df_nim,sites='test',run_sets=[('CpG','meth')])
# boxplots_colstat_combined(df_nim,sites='train',run_sets=[('CpG','meth')])
# boxplots_colstat_combined(df_nim,sites='zero',run_sets=[('CpG','meth')],max_kmer_perc=50)
# plt.suptitle('Number sites called in dataset')
# plt.tight_layout()
