This notebook uses the output from `remora_callstat.py` to visualize how well Remora/Megalodon performed.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='white',rc={'figure.figsize':(16,8)})

color_r = 'tab:red'
color_b = 'tab:blue'
# plt.rcParams['figure.figsize'] = 5,5

def read_csv(filepath):
    """Parse a call-statistics text file into a tidy DataFrame.

    The file is scanned line by line and three kinds of lines matter:

    * lines starting with ``./`` are paths with exactly six ``/``-separated
      parts, ``./<kmers>/<region>/<ignored>/<state>/<ignored>``; they set the
      k-mer count, region and modification state applied to the data rows
      that follow;
    * lines starting with ``1`` are data rows whose 2nd to 4th
      comma-separated fields are the ``neg``, ``uns`` and ``pos`` counts;
    * every other line (including the ``Cutoff`` header) is skipped.

    Args:
        filepath: Path of the statistics file to read.

    Returns:
        A DataFrame with columns ``kmers``, ``region``, ``modstate``, ``neg``,
        ``uns``, ``pos`` plus the derived ``kmer_perc`` (``kmers`` as an
        integer percentage of 1280), ``pos_freq`` (``pos / (pos + neg)``) and
        ``drop_freq`` (``uns / (pos + neg + uns)``).

    Raises:
        ValueError: If a data row appears before any ``./`` path line, if a
            path line does not have six parts, or if a count is not an
            integer.
    """
    rows = []
    # (kmers, region, state) taken from the most recent path line.
    context = None
    with open(filepath) as file_stats:
        for line in file_stats:
            if line.startswith('./'):
                _, kmers, region, _, state, _ = line.split('/')
                # NOTE(review): substring replacement, so a state that is
                # already spelled 'canon' would become 'canonon' -- presumably
                # the directories are named 'can'/'mod'; confirm.
                context = (int(kmers), region, state.replace('can','canon'))
            elif line.startswith('1'):
                if context is None:
                    raise ValueError(
                        "data row found before any './' path line in "
                        + repr(filepath)
                    )
                neg, uns, pos = (int(x) for x in line.split(',')[1:4])
                rows.append([*context, neg, uns, pos])

    df = pd.DataFrame(rows,columns=['kmers','region','modstate','neg','uns','pos'])

    df['kmer_perc'] = df['kmers']*100//1280
    df['pos_freq'] = df['pos']/(df['pos']+df['neg'])
    df['drop_freq'] = df['uns']/(df['pos']+df['neg']+df['uns'])

    # The 90 % bin is relabelled as 85 %.
    # NOTE(review): reason not visible in this file -- confirm it is intended.
    df.replace({'kmer_perc': {90:85}},inplace=True)

    return df



def add_labels(fig, axes, labels=('a.','b.')):
    """Write a bold panel label at the top-left corner of each axes.

    Each label is anchored at the ``x0``/``y1`` corner of the tight bounding
    box of its axes and drawn with ``transform=None``.

    Args:
        fig: Figure that owns the axes; used for its renderer and ``text``.
        axes: Iterable of axes to label.
        labels: One label per axes, paired in order. Pairing stops at the
            shorter of ``axes`` and ``labels``. The default is an immutable
            tuple so it cannot be modified between calls.
    """
    for ax, label in zip(axes, labels):
        bbox = ax.get_tightbbox(fig.canvas.get_renderer())
        fig.text(bbox.x0, bbox.y1, label, fontsize=16, fontweight="bold", va="top", ha="left",
                transform=None)

# sns.set(rc={'figure.figsize':(16,8)})
# sns.set_style("whitegrid")

def boxplots_callstat(df,title='', label_x='Percentage of modified 6-mers in training'):
    """Draw side-by-side box plots of the positive call frequency.

    The left panel shows rows with ``modstate == 'mod'`` in red, the right
    panel rows with ``modstate == 'canon'`` in blue; both plot ``pos_freq``
    against ``kmer_perc``. The figure is 9 x 4.5 inches and gets the default
    panel labels from ``add_labels``.

    Args:
        df: DataFrame with ``modstate``, ``kmer_perc`` and ``pos_freq`` columns.
        title: Prefix prepended to each panel title.
        label_x: X-axis label used on both panels.
    """
    fig, axs = plt.subplots(ncols=2)
    fig.set_size_inches([9,4.5])

    # (modstate value, target axes, box colour), drawn left to right.
    panel_specs = (
        ('mod', axs[0], color_r),
        ('canon', axs[1], color_b),
    )
    for state, panel, box_color in panel_specs:
        subset = df[df['modstate']==state]
        sns.boxplot(data=subset, x="kmer_perc", y="pos_freq", color=box_color, ax=panel)
        if state == 'canon':
            dataset_name = 'Canonical'
        else:
            dataset_name = 'CpG methylated'
        panel.set_title(title+dataset_name+' dataset call frequencies')
        panel.set_ylabel('Positive call frequency')
        panel.set_xlabel(label_x)

    plt.tight_layout()
    add_labels(fig,[axs[0],axs[1]])

def boxplots_keepstat(df,title='', label_x='Percentage of modified 6-mers in training'):
    """Draw side-by-side box plots of the discarded call frequency.

    The left panel shows rows with ``modstate == 'mod'`` in red, the right
    panel rows with ``modstate == 'canon'`` in blue; both plot ``drop_freq``
    against ``kmer_perc``. The figure is 9 x 4.5 inches and gets the default
    panel labels from ``add_labels``.

    Args:
        df: DataFrame with ``modstate``, ``kmer_perc`` and ``drop_freq`` columns.
        title: Prefix prepended to each panel title.
        label_x: X-axis label used on both panels.
    """
    fig, axs = plt.subplots(ncols=2)
    fig.set_size_inches([9,4.5])

    pretty_names = {'canon': 'Canonical'}
    for column, (state, box_color) in enumerate([('mod', color_r), ('canon', color_b)]):
        panel = axs[column]
        sns.boxplot(data=df[df['modstate']==state], x="kmer_perc", y="drop_freq", color=box_color, ax=panel)
        # Anything that is not 'canon' is presented as the methylated dataset.
        dataset_name = pretty_names.get(state, 'CpG methylated')
        panel.set_title(title+dataset_name+' dataset dropped call frequencies')
        panel.set_ylabel('Discarded call frequency')
        panel.set_xlabel(label_x)

    plt.tight_layout()
    add_labels(fig,[axs[0],axs[1]])




# Load the call statistics once; later cells reuse `df`.
df = read_csv("data/remora_callstats.csv")
# Optional summary tables, kept for interactive inspection:
# print(df)
# df_grouped = df.groupby(['kmers','modstate'])
# df_grouped['pos_freq'].describe()
# df_grouped['drop_freq'].describe()

# Positive call frequency box plots, saved as PDF and PNG.
boxplots_callstat(df)
for target in ('./data/remora_callstats_boxplot_calls.pdf',
               './data/remora_callstats_boxplot_calls.png'):
    plt.savefig(target)

# Discarded call frequency box plots, saved as PDF and PNG.
boxplots_keepstat(df)
for target in ('./data/remora_callstats_boxplot_drops.pdf',
               './data/remora_callstats_boxplot_drops.png'):
    plt.savefig(target)

In [None]:
# Line plots (with error bars) overlaid with the individual data points, one
# figure per metric and one panel per dataset.
ticks=sorted(df['kmer_perc'].unique())
label_x = 'Percentage of modified 6-mers in training'

# One panel per dataset: (modstate value, legend label, colour, subplot index).
panels = [
    ('mod', 'CpG Methylated', color_r, 0),
    ('canon', 'Canonical', color_b, 1),
]

# One figure per metric: (column to plot, y-axis label, output files).
# The y-axis label is tied to the metric so that both panels of a figure
# always describe the quantity actually plotted.
figures = [
    ('pos_freq', 'Positive call frequency',
     ('./data/remora_callstats_lineplot_calls.pdf',
      './data/remora_callstats_lineplot_calls.png')),
    ('drop_freq', 'Discarded call frequency',
     ('./data/remora_callstats_lineplot_drops.pdf',
      './data/remora_callstats_lineplot_drops.png')),
]

for metric, label_y, out_files in figures:
    fig, axs = plt.subplots(ncols=2)
    fig.set_size_inches([9,4.5])
    for modstate, legend_label, panel_color, idx in panels:
        subset = df[df['modstate']==modstate]
        sns.lineplot(data=subset, x="kmer_perc", y=metric, err_style='bars',color=panel_color,label=legend_label,ax=axs[idx])
        g = sns.scatterplot(data=subset, x="kmer_perc", y=metric,color=panel_color,ax=axs[idx],linewidth=.25)
        # Show exactly the percentages present in the data on the x axis.
        g.set_xticks(ticks)
        g.set_ylabel(label_y)
        g.set_xlabel(label_x)
    plt.tight_layout()
    for out_file in out_files:
        plt.savefig(out_file)

In [None]:

def boxplots_perf(df, title='', label_x='Percentage of modified k-mers kept'):
    """Draw a 2 x 2 grid of box plots, one per performance metric.

    The panels show, in row-major order, ``accuracy``, ``precision``,
    ``recall`` and ``f_score`` against ``kmer_perc`` on a 12 x 12 inch figure
    with grid lines enabled.

    Args:
        df: DataFrame with a ``kmer_perc`` column and one column per metric.
        title: Prefix prepended to each panel title.
        label_x: X-axis label used on every panel.
    """
    fig, axs = plt.subplots(nrows=2,ncols=2)
    fig.set_size_inches([12,12])

    metrics = ['accuracy','precision','recall','f_score']
    for position, metric in enumerate(metrics):
        # Fill the grid row by row.
        row, col = divmod(position, 2)
        panel = axs[row, col]
        sns.boxplot(data=df, x="kmer_perc", y=metric,ax=panel)
        panel.set_title(title+metric)
        panel.set_ylabel(metric)
        panel.set_xlabel(label_x)
        panel.grid(True)

    plt.tight_layout()


def get_perf_df(df):
    """Compute per-replicate classification metrics from call counts.

    For every ``kmer_perc`` value and, within it, every ``region`` value (both
    in order of first appearance), the first ``canon`` row supplies the false
    positives (``pos``) and true negatives (``neg``), and the first ``mod``
    row supplies the true positives (``pos``) and false negatives (``neg``).

    Args:
        df: DataFrame with ``kmer_perc``, ``region``, ``modstate``, ``pos``
            and ``neg`` columns.

    Returns:
        A DataFrame with one row per (``kmer_perc``, ``region``) pair and the
        columns ``kmer_perc``, ``perc_repeat`` (the region), ``FN``, ``TN``,
        ``FP``, ``TP``, ``accuracy``, ``precision``, ``recall`` and
        ``f_score``.

    Raises:
        IndexError: If a pair has no ``canon`` row or no ``mod`` row.
    """
    out_columns = ['kmer_perc','perc_repeat','FN','TN','FP','TP','accuracy','precision','recall','f_score']
    records = []

    for perc in df['kmer_perc'].unique():
        at_perc = df[df['kmer_perc']==perc]

        for region in at_perc['region'].unique():
            replicate = at_perc[at_perc['region']==region]
            canon_rows = replicate[replicate['modstate']=='canon']
            mod_rows = replicate[replicate['modstate']=='mod']

            # Calls on the canonical dataset: a positive call is a false positive.
            FP = np.array(canon_rows['pos'])[0]
            TN = np.array(canon_rows['neg'])[0]
            # Calls on the modified dataset: a positive call is a true positive.
            TP = np.array(mod_rows['pos'])[0]
            FN = np.array(mod_rows['neg'])[0]

            total_calls = TP+TN+FP+FN
            accuracy = (TP+TN)/total_calls
            precision = TP/(TP+FP)
            recall = TP/(TP+FN)
            # Harmonic mean of precision and recall.
            f_score = (2*recall*precision)/(recall+precision)

            records.append([perc,region,FN,TN,FP,TP,accuracy,precision,recall,f_score])

    return pd.DataFrame(records,columns=out_columns)


# Derive the per-replicate metrics, plot them and save the figure as PDF.
perf_df = get_perf_df(df)
boxplots_perf(perf_df)
plt.savefig('./data/remora_callstats_boxplot_stats.pdf')

In [None]:
motif = 'CG'
mer_size = 6
# Bases of the k-mer that are not fixed by the motif.
n_free_bases = mer_size - len(motif)
# NOTE(review): the factor 5 presumably counts the positions the motif can
# take inside the k-mer and 0.05 a 5 % fraction -- confirm.
max_picks = int(np.power(4, n_free_bases) * 5 * 0.05)
max_picks