In [115]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from math import log
import matplotlib as mpl
# Render figures at a high resolution.
mpl.rc('figure', dpi=400)
# Let pandas show more dataframe rows before truncating the display.
pd.options.display.min_rows = 50
pd.options.display.max_rows = 100

def read(file):
    """Load a JSON results file from ``data/`` into a flat DataFrame.

    The JSON document is flattened with ``pd.json_normalize``, so nested
    objects end up as dotted column names (e.g. ``tp.k0``).

    Parameters
    ----------
    file : str
        File name, relative to the ``data/`` directory.

    Returns
    -------
    pandas.DataFrame
        The flattened records, where

        * ``tp.minimizer_type`` is renamed to ``Minimizer type`` and its raw
          values are replaced by the names shown in the figures,
        * only rows with ``k >= log(w) / log(sigma)`` are kept,
        * ``param`` holds ``tp.k0`` with missing values replaced by 0.
    """
    with open(f'data/{file}', 'r') as f:
        data = json.load(f)
    df = pd.json_normalize(data)

    # To restrict the data to specific window sizes, filter here, e.g.:
    # df = df[df.w.isin([8, 32])]

    # Rename column 'tp.minimizer_type' to 'Minimizer type'.
    df = df.rename(columns={'tp.minimizer_type': 'Minimizer type'})
    # Replace the raw type names by display names; the plain 'Minimizer'
    # becomes 'Random minimizer'.
    df['Minimizer type'] = df['Minimizer type'].replace({
        'Minimizer': 'Random minimizer',
        'LrMinimizer': 'LR-minimizer',
        'ModMinimizer': 'Mod-minimizer',
        'MiniceptionNew': 'Modified miniception',
    })

    # Keep only rows with k >= log_sigma(w). The explicit copy makes the
    # filtered frame independent, so that adding the 'param' column below
    # does not assign into a slice of the unfiltered frame
    # (SettingWithCopyWarning).
    df = df[df.k >= np.log(df.w) / np.log(df.sigma)].copy()
    df['param'] = df['tp.k0'].fillna(0)  # + df['tp.r'].fillna(0)
    return df


In [None]:
# Result files for the density plot; `read` looks them up under data/.
files = [
    'density_4.json',
    'density_256.json',
]

def plot(file, plot_density=True, plot_params=False):
    """Plot measured minimizer densities together with closed-form reference curves.

    Note that a later cell defines a second function that is also named
    ``plot``; whichever cell ran last determines what ``plot`` refers to.

    Parameters
    ----------
    file : str
        Name of the results file; loaded with ``read(file)``.
    plot_density : bool, default True
        Draw density versus k for every minimizer type and w, overlaid with
        the reference curves, and save it as ``fig/density_{sigma}.svg``.
    plot_params : bool, default False
        Draw the ``param`` column versus k and save it as
        ``fig/params_{sigma}.svg``.

    ``sigma`` is the first value found in the ``sigma`` column of the file.
    """
    df = read(file)
    s = df['sigma'].unique()[0]

    if plot_density:
        # The k range does not depend on w, so build it once. The upper bound
        # is inclusive so that the curves extend to the largest measured k.
        ks = range(df.k.min(), df.k.max() + 1)
        for w in df.w.unique():
            # Horizontal reference levels for this w.
            plt.axhline(y=(1.5+1/(2*w))/(w+1), color='black', linewidth=0.5)
            plt.axhline(y=(1.5)/(w+0.5), color='blue', linewidth=0.5)
            # Red: (1+y)/(1+yw) with k = (y-1)w+1 => y = (k-1)/w+1,
            # which expands to 1/w + 1/(w+k) - 1/(w*(w+k)).
            plt.plot(ks, [1/w + 1/(w+k) - 1/(w*(w+k)) for k in ks], color='red', linewidth=0.5)
            # Disabled alternative with k = (y-1)w => y = k/w+1:
            # (1+y)/(1+yw)
            # (2+k/w)/(1+k+w)
            # (2w+k)/(w(1+k+w))
            # plt.plot(ks, [(2*w+k)/(w*(w+k+2)) for k in ks], color='purple', linewidth=0.5)
            plt.plot(ks, [(1.5 + max(0, (k-w)//w) + 1/(2*w))/(w+k) for k in ks], color='green', linewidth=0.5)

            plt.plot(ks, [2/(w+k) for k in ks], color='orange', linewidth=0.5)
            plt.plot(ks, [2/(w+(k+1)/2) for k in ks], color='purple', linewidth=0.5)
            # NOTE: this curve shares its colour with the horizontal
            # (1.5+1/(2w))/(w+1) line and has no entry in the formula list below.
            plt.plot(ks, [(2+1/w)/(w+(k+2)/2) for k in ks], color='black', linewidth=0.5)
            # plt.plot(ks, [2.5/(w+k +1/2) for k in ks], color='blue', linewidth=0.5)

        # Second "legend": the formulas of the reference curves, written to
        # the right of the axes in the colour of the matching curve.
        ax = plt.gca()
        offset = 0.05
        plt.text(1.05, offset+0.15, '(1.5+1/(2w)) / (w+1)', color='black', transform=ax.transAxes)
        plt.text(1.05, offset+0.10, '1.5 / (w+0.5) (new)', color='blue', transform=ax.transAxes)
        plt.text(1.05, offset+0.05, '1/w + 1/(w+k) - 1/(w·(w+k)) (?)', color='red', transform=ax.transAxes)
        plt.text(1.05, offset+0.00, '(1.5+max(0,floor((k-w)/w))+1/2w) / (w+k)', color='green', transform=ax.transAxes)
        plt.text(1.05, offset-0.05, '2/(w+k) (new)', color='orange', transform=ax.transAxes)
        # The label states the formula that is actually drawn in purple above.
        plt.text(1.05, offset-0.10, '2/(w+(k+1)/2) (?)', color='purple', transform=ax.transAxes)

        # Measured densities: one line per minimizer type, line width by w.
        sns.lineplot(x='k', y='density', hue='Minimizer type', size='w', sizes=(1,2), data=df, legend='full', marker='.', dashes=False)
        plt.title(f'Minimizer density on random text (alphabet size σ={s}; length=5M)')
        plt.xlabel('Kmer length k')
        plt.ylabel('Density')
        plt.ylim(2**-4.7, 2**-1.95)
        plt.yscale('log', base=2)
        # Put the y ticks at 2/(w+1) and 1.5/(w+0.5) for every w, labelled
        # with their value rounded to three decimals.
        ws = df.w.unique()
        plt.yticks([2/(w+1) for w in ws]+[1.5/(w+0.5) for w in ws],[f'{2/(w+1):.3f}' for w in ws]+[f'{1.5/(w+0.5):.3f}' for w in ws])
        plt.xscale('log', base=2)
        plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')

        plt.savefig(f'fig/density_{s}.svg', bbox_inches='tight')
        # plt.savefig(f'fig/{file}.png', bbox_inches='tight', dpi = 400)
        plt.show()

    # Plot optimal parameter choice.
    if plot_params:
        sns.lineplot(x='k', y='param', hue='Minimizer type', size='w', data=df, legend='full')
        plt.title('Optimal parameter k0 or r')
        plt.xscale('log', base=2)
        plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
        plt.savefig(f'fig/params_{s}.svg', bbox_inches='tight')
        plt.show()

# Only the second entry of `files` is plotted; uncomment the next line to
# plot the first entry as well.
# plot(files[0]);
plot(files[1]);


In [None]:
# Result files for the distribution plots; `read` looks them up under data/.
files = [
    'stats_4.json',
    'stats_256.json',
]

def _facet_lines(ddf, x, y, hue_order, sharey, ylim, xlabel, title, stem):
    """Draw one line plot per (k, w) facet, save it as SVG and PNG, and show it."""
    g = sns.FacetGrid(ddf, row="k", col="w", sharex="col", sharey=sharey, margin_titles=True, height=2.5)
    # `hue_order` pins the minimizer-type -> colour assignment; without it
    # every facet would derive its own mapping from the types in its subset.
    g.map(sns.lineplot, x, y, "Minimizer type", hue_order=hue_order)
    if ylim is not None:
        # Set the y range of every facet of the FacetGrid.
        g.set(ylim=ylim)
    g.add_legend()
    g.set_axis_labels(xlabel, "Relative frequency")
    # Leave room above the facets for the figure title.
    g.fig.subplots_adjust(top=0.92)
    g.fig.suptitle(title)
    plt.savefig(f'fig/{stem}.svg', bbox_inches='tight')
    plt.savefig(f'fig/{stem}.png', bbox_inches='tight', dpi=400)
    plt.show()


def plot(file, plot_positions=True, plot_dists=True):
    """Plot the ``positions`` and ``dists`` distributions of a stats file.

    Both figures are grids with one facet per (k, w) combination and one line
    per minimizer type. Note that this definition replaces the ``plot``
    function defined in an earlier cell of the notebook.

    Parameters
    ----------
    file : str
        Name of the results file; loaded with ``read(file)``.
    plot_positions : bool, default True
        Draw the values of each row's ``positions`` list against their index
        in the list; saved as ``fig/positions_{sigma}.svg`` and ``.png``.
    plot_dists : bool, default True
        Draw the values of each row's ``dists`` list against their index in
        the list minus ``w``; saved as ``fig/dists_{sigma}.svg`` and ``.png``.

    ``sigma`` is the first value found in the ``sigma`` column of the file.
    """
    df = read(file)
    s = df['sigma'].unique()[0]
    # One hue order, in order of first appearance, shared by every facet of
    # both figures. Fall back to seaborn's default when there is no type.
    hue_order = list(df['Minimizer type'].dropna().unique()) or None

    # Plot position distribution.
    if plot_positions:
        # One row per entry of each 'positions' list; 'position' is the index
        # of the entry within its list.
        ddf = df.explode('positions').reset_index().rename(columns={'index' : 'position'})
        ddf['position'] = ddf.groupby('position').cumcount()
        _facet_lines(
            ddf, "position", "positions", hue_order,
            sharey=True, ylim=(0.7,1.3),
            xlabel="Selected position",
            title='Minimizer position distribution',
            stem=f'positions_{s}',
        )

    # Plot distance distribution.
    if plot_dists:
        # One row per entry of each 'dists' list; 'dist' is the index of the
        # entry within its list, shifted down by w.
        ddf = df.explode('dists').reset_index().rename(columns={'index' : 'dist'})
        ddf['dist'] = ddf.groupby('dist').cumcount()-ddf.w
        # Drop entries at negative distances unless their value is positive.
        ddf = ddf[(ddf.dists > 0) | (ddf.dist >= 0)]
        _facet_lines(
            ddf, "dist", "dists", hue_order,
            sharey="row", ylim=None,
            xlabel="Anchor distance",
            title='Minimizer distance distribution',
            stem=f'dists_{s}',
        )

# Only the second entry of `files` is plotted; uncomment the next line to
# plot the first entry as well.
# plot(files[0]);
plot(files[1]);