In [43]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import io
import base64
import os
from IPython.display import display, HTML

# Global matplotlib text settings: render text through LaTeX and request
# the "Times" font family for every figure created afterwards.
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "Times"

In [2]:
# Show the entries of dat/ whose file name contains 'ratio-ollama_'.
[fname for fname in os.listdir('dat/') if 'ratio-ollama_' in fname]
# Wider variant (anything containing 'ratio-'):
# [fname for fname in os.listdir('dat/') if 'ratio-' in fname]

['ratio-ollama_highlight_token_ratio-bart.csv',
 'ratio-ollama_cnn_highlight_words.csv',
 'ratio-ollama_highlight_token_ratio.csv',
 'ratio-ollama_highlight_words_ratio.csv',
 'ratio-ollama_highlight_words-bart.csv',
 'ratio-ollama_cnn_highlight_words_ratio.csv',
 'ratio-ollama_highlight_words.csv',
 'ratio-ollama_cnn_highlight_token_ratio.csv']

In [3]:
# Pairings to compare. `result` selects dat/ratio-ollama_<result>.csv and
# `target` selects dat/ratio-<target>.csv.
ratio_test_names = [
    dict(result='highlight_token_ratio', target='highlighted'),
    dict(result='highlight_words_ratio', target='highlighted'),
    dict(result='cnn_highlight_token_ratio', target='original'),
    dict(result='cnn_highlight_words_ratio', target='original'),
    # dict(result='ratio-ollama_cnn_highlight_words', target='ratio-original'),
    # dict(result='ratio-ollama_highlight_word', target='ratio-highlighted'),
]

# Load both CSVs of every pairing once. Each item of `dfs` is a dict holding
# the two names next to the two loaded frames.
dfs = []
for ratio_test_name in ratio_test_names:
    result_name, target_name = ratio_test_name['result'], ratio_test_name['target']
    df_result = pd.read_csv(f'dat/ratio-ollama_{result_name}.csv')
    df_target = pd.read_csv(f'dat/ratio-{target_name}.csv')
    dfs.append(dict(zip(
        ('result-name', 'target-name', 'result', 'target'),
        (result_name, target_name, df_result, df_target),
    )))

In [87]:
# Overlay, for every loaded pairing, the histogram of the result ratios on
# top of the histogram of the target ratios, then show all panels in a
# wrapping HTML grid (and optionally write each panel to result/ as a PDF).

clip = True   # clamp ratios above the x-axis maximum onto that maximum
save = True   # also write every figure to result/ratio_test(<title>).pdf
red  = [c/255 for c in [208,2,27]]     # RGB in 0..1, used for the result series
blue = [c/255 for c in [74,114,226]]   # RGB in 0..1, used for the target series

fig_config = {
    'figsize': (3,1),
    'dpi': 220,
}

hist_config = {
    'bins': 35,
    'density': False,
    'alpha': 0.4,
    'edgecolor': 'black',
    'linewidth': 0.5,
    'align': 'mid'
}

if save:
    # savefig does not create missing directories; make sure the output
    # folder exists so a fresh checkout does not fail on the first figure.
    os.makedirs('result', exist_ok=True)

html = ''
for df in dfs:
    df_result = df['result']
    df_target = df['target']

    is_cnn = df['target-name'] == 'original'
    is_from_token_ratio = 'token_ratio' in df['result-name']

    # Panel label: A/B distinguishes the 'original' target from the other
    # one, 1/2 distinguishes token-ratio from words-ratio results.
    title = f"{'A' if is_cnn else 'B'}-{'1' if is_from_token_ratio else '2'}"
    xmax = 0.51 if is_cnn else 1.01
    clip_max = xmax if clip else None   # clip(upper=None) leaves data untouched
    target_ratio = 'token_ratio' if is_from_token_ratio else 'words_ratio'

    values_result = df_result[target_ratio].clip(upper=clip_max)
    values_target = df_target[target_ratio].clip(upper=clip_max)

    # Both series must share the same bin edges: with density=False the bar
    # heights are raw counts, so independently computed bins (different
    # widths/offsets per series) would make the overlay incomparable.
    shared_bins = np.histogram_bin_edges(
        pd.concat([values_result, values_target]).dropna(),
        bins=hist_config['bins'],
    )
    panel_hist_config = {**hist_config, 'bins': shared_bins}

    fig, ax = plt.subplots(1,1,**fig_config)
    ax.hist(values_result, label='result', zorder=2, color=red , **panel_hist_config)
    ax.hist(values_target, label='target', zorder=1, color=blue, **panel_hist_config)
    ax.set_title(title)
    ax.set_xlim(0,xmax)
    ax.set_xlabel('ratio')
    ax.set_ylabel('number of data')
    ax.legend()

    # Render through the figure object itself rather than pyplot's implicit
    # "current figure", so the right figure is always the one written.
    svg_buffer = io.BytesIO()
    fig.savefig(svg_buffer, format='svg', bbox_inches='tight')
    if save:
        fig.savefig(f"result/ratio_test({title}).pdf", format='pdf', bbox_inches='tight')
    plt.close(fig)   # release the figure; many are created in this loop
    img_svg = svg_buffer.getvalue().decode("utf-8")
    html += f"""<div style="width:45%;display:inline-block;border-radius:10px;overflow:hidden;margin:5px;box-shadow:0 0 5px #bbb;padding:5px">
        {img_svg}
    </div>"""
display(HTML(f"<div style='display:flex;flex-wrap: wrap'>{html}</div>"))