In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Per-query performance plots for the runs in sections/

In [None]:
# Root folder of the per-section evaluation results. The loader below reads
# f"{data_dir}/{year}/{section}" from it, so the trailing slash here produces
# a doubled separator in the final path.
data_dir = "../data/processed/jbi/sections/"

In [None]:
# Measure keys used to filter the 'measure' column -> axis labels shown on the
# plots. Insertion order is the order in which the figures are generated.
measures_dict = dict(
    P_10="P@10",
    ndcg_cut_10="nDCG@10",
    ndcg_cut_5="nDCG@5",
    recip_rank="RR",
)

In [None]:
# Evaluation year: selects the sub-folder read under data_dir and is embedded
# in the names of the saved figures.
year = '2022'

In [None]:
# Result files to compare, one per section configuration. The second
# '-'-separated token of each name becomes the legend label.
sections = [
    'bm25p-inclusion-20230202-results',
    'bm25p-summary-20230202-results',
    'bm25p-summary_description_titles_conditions-20230202-results',
    # Currently left out of the comparison:
    # 'bm25p-summary_description_titles_conditions_inclusion-20230202-results',
]

In [None]:
# Read the evaluation file of every section configuration and stack them into
# one long frame with columns measure / query / scores / sections.
#
# The frames are collected in a list and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and it copied the whole
# accumulated frame on every iteration.
section_frames = []
for section in sections:
    df = pd.read_csv(f"{data_dir}/{year}/{section}", sep='\t', names=['measure', 'query', 'scores'])
    # Legend label: second '-'-separated token of the file name with '_'
    # shown as ' + ' (e.g. 'summary_description' -> 'summary + description').
    df['sections'] = section.split('-')[1].replace('_', " + ")
    section_frames.append(df)

# pd.concat raises on an empty list; fall back to the empty frame the original
# loop produced when no section is configured.
out_df = pd.concat(section_frames, ignore_index=True) if section_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined frame.
out_df.head()

In [None]:
# Remove surrounding whitespace from the two text columns so that the equality
# filters used later ('all', measure keys) match exactly.
for text_col in ('measure', 'query'):
    out_df[text_col] = out_df[text_col].str.strip()

In [None]:
# Keep only per-query rows: rows whose query is 'all' (presumably the
# aggregate over all queries -- confirm against the result files) are dropped,
# then the remaining query ids are cast to int.
#
# .copy() detaches the filtered rows from the original frame; without it the
# column assignment below writes into a slice and pandas emits
# SettingWithCopyWarning.
out_df = out_df[out_df['query'] != 'all'].copy()
out_df['query'] = out_df['query'].astype(int)

In [None]:
# A (measure, query, sections) triple must appear once; when a file repeats
# one, the last occurrence wins.
dedup_keys = ['measure', 'query', 'sections']
out_df = out_df.drop_duplicates(subset=dedup_keys, keep='last')

In [None]:
# Measure used by the ad-hoc inspection cells; one of the keys of measures_dict.
measure = 'recip_rank'

In [None]:
# Show every row of the selected measure.
out_df.loc[out_df['measure'] == measure]

In [None]:
# One grouped bar chart per evaluation measure: queries on the x axis, one bar
# per section configuration, saved as PDF.

# Theme and figure size do not change between iterations, so set them once.
sns.set(rc={'figure.figsize':(20,6)})

# savefig does not create missing folders; make sure the target exists.
sections_plot_dir = "../reports/performance_plot/sections"
os.makedirs(sections_plot_dir, exist_ok=True)

# NOTE: the loop rebinds the notebook-level `measure`; after it finishes the
# name holds the last key of measures_dict.
for measure, measure_name in measures_dict.items():
    # A fresh figure per measure instead of clearing and reusing the implicit
    # current axes; nothing from the previous chart can leak into the next.
    fig, ax = plt.subplots()

    sns.barplot(data=out_df[out_df['measure'] == measure], x='query', y='scores', hue='sections', ax=ax)
    ax.set(xlabel='',
           ylabel=measure_name,
           title='')
    ax.tick_params(labelsize=17)
    ax.legend(fontsize=17, loc='upper right', title='Sections', title_fontsize=17)
    ax.yaxis.label.set_size(23)

    fig.tight_layout(pad=2.5)
    fig.savefig(f"{sections_plot_dir}/sections_{year}_{measure}.pdf")
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [None]:
# Scores of every section configuration for query 1 on the current `measure`
# (after the plotting loop above, that is the last key of measures_dict).
out_df[(out_df['measure'] == measure) & (out_df['query'] == 1)]

In [None]:
# NOTE(review): identical to the previous cell -- looks like a leftover.
out_df[(out_df['measure'] == measure) & (out_df['query'] == 1)]

# analysis of extracted entities

In [None]:
# Re-points data_dir at the parent folder: the run paths defined below carry
# their own sub-folder (sections/, ie/, ie_filtered/).
data_dir = "../data/processed/jbi/"

In [None]:
# Evaluation year used to build the run paths and the output file names of
# this part of the notebook.
year = '2022'

In [None]:
# Run label (used as legend entry) -> evaluation result file, relative to data_dir.
runs = {
    "14": f'sections/{year}/bm25p-summary_description_titles_conditions_inclusion-20230202-results',
    "14d": f'ie/{year}/bm25p-an_cpf-20230207-results',
    "14d-AG": f'ie_filtered/{year}/bm25p-an_cpf-20230207_age_gender-results',
}

In [None]:
# Read the evaluation file of every run and stack them into one long frame
# with columns measure / query / scores / sections.
#
# The frames are collected in a list and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and it copied the whole
# accumulated frame on every iteration.
run_frames = []
for run_name, file in runs.items():
    df = pd.read_csv(f"{data_dir}/{file}", sep='\t', names=['measure', 'query', 'scores'])
    # The column keeps the name 'sections' so the plotting code can stay the
    # same, but here it holds the run label.
    df['sections'] = run_name
    run_frames.append(df)

# pd.concat raises on an empty list; fall back to the empty frame the original
# loop produced when no run is configured.
out_df = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()

In [None]:
# Remove surrounding whitespace from the two text columns so that the equality
# filters used later ('all', measure keys) match exactly.
for text_col in ('measure', 'query'):
    out_df[text_col] = out_df[text_col].str.strip()

In [None]:
# Keep only per-query rows: rows whose query is 'all' (presumably the
# aggregate over all queries -- confirm against the result files) are dropped,
# then the remaining query ids are cast to int.
#
# .copy() detaches the filtered rows from the original frame; without it the
# column assignment below writes into a slice and pandas emits
# SettingWithCopyWarning.
out_df = out_df[out_df['query'] != 'all'].copy()
out_df['query'] = out_df['query'].astype(int)

In [None]:
# A (measure, query, sections) triple must appear once; when a file repeats
# one, the last occurrence wins.
dedup_keys = ['measure', 'query', 'sections']
out_df = out_df.drop_duplicates(subset=dedup_keys, keep='last')

In [None]:
# One grouped bar chart per evaluation measure: queries on the x axis, one bar
# per run, saved as PDF.

# Theme and figure size do not change between iterations, so set them once.
sns.set(rc={'figure.figsize':(20,6)})

# savefig does not create missing folders; make sure the target exists.
all_plot_dir = "../reports/performance_plot/all"
os.makedirs(all_plot_dir, exist_ok=True)

# NOTE: the loop rebinds the notebook-level `measure`; after it finishes the
# name holds the last key of measures_dict.
for measure, measure_name in measures_dict.items():
    # A fresh figure per measure instead of clearing and reusing the implicit
    # current axes.
    fig, ax = plt.subplots()

    sns.barplot(data=out_df[out_df['measure'] == measure], x='query', y='scores', hue='sections', ax=ax)
    ax.set(xlabel='',
           ylabel=measure_name,
           title='')
    fig.savefig(f"{all_plot_dir}/{year}_{measure}.pdf")
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

# count number of relevant (2) trials retrieved by each run

In [None]:
# Evaluation year: selects the qrels file and the run files loaded below and
# is embedded in the names of the saved figures.
year = '2022'

In [None]:
# Relevance judgements for the selected year: space-separated rows of
# qid / skip / docid / qrel, with ids and grades forced to int.
qrels_file = f'../data/external/qrels{year}.txt'

qrel_columns = ['qid', 'skip', 'docid', 'qrel']
qrel_df = (
    pd.read_csv(qrels_file, sep=' ', names=qrel_columns)
    .astype({'qid': int, 'qrel': int})
)

In [None]:
# Run label (used as legend entry) -> ranked-list file, relative to data_dir.
# Same runs as in the evaluation section, without the '-results' suffix.
runs = {
    "14": f'sections/{year}/bm25p-summary_description_titles_conditions_inclusion-20230202',
    "14d": f'ie/{year}/bm25p-an_cpf-20230207',
    "14d-AG": f'ie_filtered/{year}/bm25p-an_cpf-20230207_age_gender',
}

In [None]:
# The TCRR run path is hard-coded to the 2022 folder, so it is only added
# when 2022 is being analysed.
if year == '2022':
    runs.update({'TCRR': "neural/2022/DoSSIER_5"})

In [None]:
# Read the ranked list of every run (space-separated rows of
# qid / skip / docid / rank / score / run_name) and stack them into one frame.
#
# The frames are collected in a list and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and it copied the whole
# accumulated frame on every iteration.
ranking_frames = []
for run_name, file in runs.items():
    df = pd.read_csv(f"{data_dir}/{file}", sep=' ', names=['qid', 'skip', 'docid', 'rank', 'score', 'run_name'])
    # Short label from the `runs` mapping; the 'run_name' column keeps
    # whatever tag is stored in the file itself.
    df['sections'] = run_name
    # Row count per run, printed as a quick sanity check.
    print(len(df))
    ranking_frames.append(df)

# pd.concat raises on an empty list; fall back to the empty frame the original
# loop produced when no run is configured.
out_df = pd.concat(ranking_frames, ignore_index=True) if ranking_frames else pd.DataFrame()

In [None]:
# Display the combined ranked lists of all runs.
out_df

In [None]:
# Attach the relevance grade to every retrieved (qid, docid) pair. Left join:
# retrieved documents without a judgement stay in the frame with qrel = NaN.
mdf = out_df.merge(qrel_df, on=['qid', 'docid'], how='left')

In [None]:
# Inspect the merged frame; 'qrel' is NaN for retrieved documents that have no
# matching (qid, docid) row in qrel_df.
mdf

In [None]:
# Per-query count of relevant trials (qrel == 2) each run places within the
# first top_n ranks; one grouped bar chart per cut-off, saved as PDF.

# Theme and figure size do not change between iterations, so set them once.
sns.set(rc={'figure.figsize': (20, 6)})

# savefig does not create missing folders; make sure the target exists.
relevant_plot_dir = "../reports/performance_plot/all"
os.makedirs(relevant_plot_dir, exist_ok=True)

for top_n in [5, 10, 15, 20, 30]:
    # NOTE(review): the cut-off here is `rank < top_n`, while the aggregated
    # cells further down use `rank <= top_n`. Whether ranks in the run files
    # start at 0 or 1 is not visible here, so one of the two is off by one --
    # confirm and align.
    # Queries with no relevant trial inside the cut-off produce no row and
    # therefore no bar.
    pdf = mdf[(mdf['rank'] < top_n) & (mdf['qrel'] == 2.0)].groupby(['sections', 'qid'])['qrel'].count().reset_index()
    pdf['run_name'] = pdf['sections']

    # A fresh figure per cut-off instead of clearing and reusing the implicit
    # current axes.
    fig, ax = plt.subplots()

    sns.barplot(data=pdf, x='qid', y='qrel', hue='run_name', ax=ax)
    ax.set(xlabel='',
           ylabel="",
           title=f"Number of relevant trials in the top {top_n}")
    ax.tick_params(labelsize=17)
    ax.legend(fontsize=17, loc='upper right', title='Run name', title_fontsize=17)
    ax.title.set_size(25)

    fig.tight_layout(pad=2.5)
    fig.savefig(f"{relevant_plot_dir}/relevant_{year}_top_{top_n}.pdf")
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

# aggregated plots per K

In [None]:
# Relevance grade -> readable label. The grade is the position in the tuple:
# 0 = irrelevant, 1 = excluded, 2 = relevant.
qrels = dict(enumerate(('irrelevant', "excluded", "relevant")))

In [None]:
# For every cut-off K (0..99) and every relevance grade, count per run how
# many retrieved trials with that grade have rank <= K, summed over all
# queries. Result is long-format: sections / qrel (label) / run name /
# qrel_count / top_n.
#
# The partial frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole accumulated frame on each of the 300 iterations.
count_frames = []

for top_n in range(0, 100):
    for qrel, qrel_text in qrels.items():
        pdf = mdf[(mdf['rank'] <= top_n) & (mdf['qrel'] == qrel)].groupby(['sections'])['qrel'].count().reset_index()
        pdf['run name'] = pdf['sections']
        # Save the count under its own name before 'qrel' is overwritten with
        # the readable label.
        pdf['qrel_count'] = pdf['qrel']
        pdf['top_n'] = top_n
        pdf['qrel'] = qrel_text
        count_frames.append(pdf)

grouped_by_k = pd.concat(count_frames, ignore_index=True)

In [None]:
# Inspect the long-format counts: at most one row per (run, relevance label,
# cut-off) combination.
grouped_by_k

In [None]:
# One line chart per relevance label: number of trials with that label among
# the first K retrieved (x axis = K), one line per run, saved as PDF.

# Theme and figure size do not change between iterations, so set them once.
sns.set(rc={'figure.figsize': (20, 7)})

# savefig does not create missing folders; make sure the target exists.
count_plot_dir = "../reports/performance_plot/all"
os.makedirs(count_plot_dir, exist_ok=True)

for qrel, qrel_text in qrels.items():
    # A fresh figure per label instead of clearing and reusing the implicit
    # current axes.
    fig, ax = plt.subplots()

    sns.lineplot(data=grouped_by_k[grouped_by_k['qrel']==qrel_text], x='top_n', y='qrel_count', hue='run name', ax=ax)
    ax.set(xlabel='',
           ylabel="",
           title=f"Number of {qrel_text} trials per K trials retrieved")
    fig.savefig(f"{count_plot_dir}/count_{year}_{qrel_text}.pdf")
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

# averaged for each query

In [None]:
# Ad-hoc look at the raw merged rows for one cut-off and one relevance grade.
#
# The filter values used to come from whatever the earlier loops left behind
# in `top_n` and `qrel`, so the result depended on which cells had been run.
# They are bound here to the values a top-to-bottom run produces (last cut-off
# of the aggregation loop, last grade of the qrels mapping).
top_n, qrel = 99, 2
sns.lineplot(data=mdf[(mdf['rank'] <= top_n) & (mdf['qrel'] == qrel)],
             x='rank', y='qrel', hue='sections')

In [None]:
# Same counts as the aggregated version, but kept per query: for every cut-off
# K (0..50) and every relevance grade, count per (run, qid) how many retrieved
# trials with that grade have rank <= K. Result is long-format:
# qid / qrel (label) / run name / qrel_count / top_n.
#
# The partial frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole accumulated frame on every iteration.
per_query_frames = []

for top_n in range(0, 51):
    for qrel, qrel_text in qrels.items():
        pdf = mdf[(mdf['rank'] <= top_n) & (mdf['qrel'] == qrel)].groupby(['sections', 'qid'], dropna=False)['qrel'].count().reset_index()

        pdf['run name'] = pdf['sections']
        # Non-inplace drop: same result, no mutation of the intermediate frame.
        pdf = pdf.drop(columns=['sections'])
        # Save the count under its own name before 'qrel' is overwritten with
        # the readable label.
        pdf['qrel_count'] = pdf['qrel']
        pdf['top_n'] = top_n
        pdf['qrel'] = qrel_text
        per_query_frames.append(pdf)

grouped_by_k = pd.concat(per_query_frames, ignore_index=True)


In [None]:
# Quick look at the per-query counts before zero-filling.
# The counts live in 'qrel_count'; 'qrel' holds the text label at this point,
# so it is used to distinguish the line styles rather than as the y value.
sns.lineplot(data=grouped_by_k,
             x='top_n', y='qrel_count', hue='run name', style='qrel')

In [None]:
# Zero-fill missing relevance labels: pivot the label into columns (gaps
# become 0), then fold it back into long form. Every (run, qid, top_n)
# combination already present ends up with one row per label, so the averages
# plotted next include the zero counts. Combinations with no row at all are
# not created.
grouped_by_k = (
    grouped_by_k
    .set_index(['run name', 'qid', 'top_n', 'qrel'])
    .unstack(fill_value=0)
    .stack()
    .reset_index()
)

In [None]:
# One line chart per relevance label: count of trials with that label among
# the first K retrieved, one line per run, saved as PDF. The frame holds one
# row per query for each (run, K), and seaborn's lineplot aggregates repeated
# x values with the mean by default -- hence "Average count" in the title.

# Theme and figure size do not change between iterations, so set them once.
sns.set(rc={'figure.figsize': (20, 7)})

# savefig does not create missing folders; make sure the target exists.
averaged_plot_dir = "../reports/performance_plot/all"
os.makedirs(averaged_plot_dir, exist_ok=True)

for qrel, qrel_text in qrels.items():
    # A fresh figure per label instead of clearing and reusing the implicit
    # current axes.
    fig, ax = plt.subplots()

    sns.lineplot(data=grouped_by_k[grouped_by_k['qrel']==qrel_text], x='top_n', y='qrel_count', hue='run name', ax=ax)
    ax.set(xlabel='',
           ylabel="",
           title=f"Average count of {qrel_text} trials per K trials retrieved")
    ax.tick_params(labelsize=20)
    ax.legend(fontsize=20, loc='upper left', title='Run name', title_fontsize=21)
    ax.title.set_size(26)

    fig.tight_layout()
    fig.savefig(f"{averaged_plot_dir}/averaged_count_{year}_{qrel_text}.pdf")
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)