# QPP Evaluation

## Using [qpp-risk-evaluator](https://github.com/RicardoMarcal/qpp-risk-evaluator)

In [2]:
# transferring computed QPP scores into csv files

import glob
import pandas as pd
import json
import os

def evaluate_QPP(target_metric,dataset_name, QPP_type):

    rel_type_list = ["topical_relevance", "readability_relevance", "eduval_relevance", "extended_relevance"]
    retrieval = "porter-lucene-BM25"

    assert QPP_type == "pre" or QPP_type == "post"

    for rel_type in rel_type_list:
        pattern = f"./output/{QPP_type}-retrieval/{dataset_name}/{rel_type}/{dataset_name}*"
        
        if dataset_name == "kid-friend-en":
            ap_path = f"./actual_performance/{dataset_name}/{rel_type}/{dataset_name}.actual-performance-run-OG-Q-bm25-1000.json"
        elif dataset_name == "requik":
            if rel_type == "extended_relevance":
                ap_path = f"./actual_performance/{dataset_name}-qpp-data/{rel_type}/{dataset_name}-qpp-data.actual-performance-run-OG-Q-bm25-1000.json"
            else:
                continue
        ap={}
        with open(ap_path, 'r') as r:
            ap_bank = json.loads(r.read())

        for qid in ap_bank.keys():
            ap[qid]=float(ap_bank[qid][target_metric])

        for pp_path in sorted(glob.glob(pattern)):
            # print(pp_path)
            if QPP_type == "pre":
                predictor = pp_path.split("/")[-1].split(".")[1].removeprefix("queries-")
            elif QPP_type == "post":
                predictor = ".".join(pp_path.split("/")[-1].split(".")[2:]).removeprefix("queries-")
            # print(predictor)
            pp={}
            with open(pp_path, 'r') as r:
                for line in r:
                    qid, pp_value = line.rstrip().split()
                    pp[qid]=float(pp_value)
            # print(pp_path)
            # assert ap.keys() == pp.keys() 
            
            score_data = []
            for qid in ap.keys():
                if qid in pp.keys():
                    score_data.append([qid, pp[qid], ap[qid]])
                else:
                    score_data.append([qid, 0.0, ap[qid]])

            for qid in pp.keys():
                if qid not in ap.keys():
                    print(qid)
                    score_data.append([qid, pp[qid], 0.0])

            df = pd.DataFrame(score_data, columns=["tid", "scores", "value"])

            if not os.path.exists(f"./QPP_scores_new/data/{dataset_name}/{rel_type}/"):
                os.makedirs(f"./QPP_scores_new/data/{dataset_name}/{rel_type}/")

            df.to_csv(f"./QPP_scores_new/data/{dataset_name}/{rel_type}/{dataset_name}_{retrieval}_{predictor}.csv", index=False)


In [3]:
evaluate_QPP(target_metric="ndcg@10", dataset_name="kid-friend-en", QPP_type="pre")
evaluate_QPP(target_metric="ndcg@10", dataset_name="kid-friend-en", QPP_type="post")

evaluate_QPP(target_metric="ndcg@10", dataset_name="requik", QPP_type="pre")
evaluate_QPP(target_metric="ndcg@10", dataset_name="requik", QPP_type="post")

In [4]:
!python3 -u qpp-risk-evaluator/main.py

  0%|          | 0/1 [00:00<?, ?it/s]
Starting GeoRisk bootstrapping (alpha=5)... 
Starting pearson correlation bootstrapping... 
  corr = [correlation(corrType)(scoresDF[predictor].to_numpy()[sample], ndcgDF[predictor].to_numpy()[sample]).statistic for predictor in scoresDF.columns]
Starting spearman correlation bootstrapping... 
Starting kendall correlation bootstrapping... 
Starting sMARE bootstrapping... 
  0%|          | 0/1 [00:00<?, ?it/s]
Starting GeoRisk bootstrapping (alpha=5)... 
Starting pearson correlation bootstrapping... 
  corr = [correlation(corrType)(scoresDF[predictor].to_numpy()[sample], ndcgDF[predictor].to_numpy()[sample]).statistic for predictor in scoresDF.columns]
Starting spearman correlation bootstrapping... 
Starting kendall correlation bootstrapping... 
Starting sMARE bootstrapping... 
  0%|          | 0/1 [00:00<?, ?it/s]
Starting GeoRisk bootstrapping (alpha=5)... 
Starting pearson correlation bootstrapping... 
  corr = [correlation(corrType)(scoresDF[pre

In [16]:
import pandas as pd
import numpy as np

results = []

metric_names = ["corrPearson", "corrKendall", 
                "corrSpearman", "smare",
                "snGeoRiskInvA5"]

dataset_names = ["kid-friend-en", "requik"]
rel_types = ["topical_relevance", "readability_relevance", "eduval_relevance", 
            "extended_relevance"]

for dataset in dataset_names:
    for rel_type in rel_types:
        if dataset == "requik" and rel_type != "extended_relevance":
            continue
        for metric in metric_names:
            path_name = f"./QPP_scores_new/output/{dataset}/{rel_type}/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
            df = pd.read_csv(path_name)
            for _, row in df.iterrows():
                predictor = row['predictor']
                bootstrap_avg = round(np.mean(row.drop(['predictor']).to_numpy()),3)
                results.append([dataset, rel_type, metric, predictor, bootstrap_avg])
                # print(predictor, bootstrap_avg)

results_df = pd.DataFrame(results, columns=["dataset", "rel_type", "metric", "QPP", "bootstrap_avg"])

In [19]:
results_df.to_csv("./QPP_scores_new/output/bootstrap_averages.csv", index=False)
results_df

Unnamed: 0,dataset,rel_type,metric,QPP,bootstrap_avg
0,kid-friend-en,topical_relevance,corrPearson,avgICTF,-0.054
1,kid-friend-en,topical_relevance,corrPearson,IDF-avg,-0.121
2,kid-friend-en,topical_relevance,corrPearson,IDF-max,0.001
3,kid-friend-en,topical_relevance,corrPearson,IDF-std,0.179
4,kid-friend-en,topical_relevance,corrPearson,IDF-sum,0.123
...,...,...,...,...,...
970,requik,extended_relevance,snGeoRiskInvA5,QPPGenRE-re-trained-n10-ndcg@10,0.416
971,requik,extended_relevance,snGeoRiskInvA5,QPPGenRE-re-trained-n100-ndcg@10,0.435
972,requik,extended_relevance,snGeoRiskInvA5,QPPGenRE-re-trained-n1000-ndcg@10,0.435
973,requik,extended_relevance,snGeoRiskInvA5,QPPGenRE-re-trained-n200-ndcg@10,0.435


In [None]:
import pandas as pd

QPP_reported = ["avgICTF", "IDF-avg", "PMI-avg", "ql", "QS", "SCQ-avg", "SCS-1", "SCS-2", "VAR-std-avg", "clarity-score-k100", "nqc-norm-k100", "sigma_max", "sigma_x0.5", "smv-norm-k100", "wig-norm-k5","BERTQPP-re-trained-ndcg@10-10", "qppBERTPL-re-trained-ndcg@10-4", "NQAQPP-re-trained-ndcg@10-4", "QPPGenRE-re-trained-n1000-ndcg@10"]

QPP_reported_names = {
    "avgICTF":"ICTFavg", 
    "IDF-avg":"IDFavg", 
    "PMI-avg":"PMIavg", 
    "ql":"QL", 
    "QS":"QS", 
    "SCQ-avg": "SCQavg", 
    "SCS-1": "SCS-1", 
    "SCS-2": "SCS-2", 
    "VAR-std-avg": "VARavg", 
    "clarity-score-k100": "Clarity", 
    "nqc-norm-k100": "NQC", 
    "sigma_max": "sigma_max", 
    "sigma_x0.5": "sigma_x", 
    "smv-norm-k100": "SMV", 
    "wig-norm-k5": "WIG",
    "BERTQPP-re-trained-ndcg@10-10": "BERTQPP", 
    "qppBERTPL-re-trained-ndcg@10-4": "qppBERT-PL", 
    "NQAQPP-re-trained-ndcg@10-4": "NQA-QPP", 
    "QPPGenRE-re-trained-n1000-ndcg@10" : "QPPGenRE"
}

rel_type_names = {
    "topical_relevance": "TR",
    "readability_relevance": "READ+TR",
    "eduval_relevance": "EDU+TR",
    "extended_relevance": "ER"
}

df = pd.read_csv("./QPP_scores_new/output/bootstrap_averages.csv")
df = df.fillna(0)
df_reported = df.loc[df["QPP"].isin(QPP_reported)]
df_reported["QPP"] = [QPP_reported_names[qpp] for qpp in df_reported["QPP"]]
df_reported["rel_type"] = [rel_type_names[rel_type] for rel_type in df_reported["rel_type"]]
df_reported.to_csv("./QPP_scores_new/output/reported_values.csv", index=False)
df_reported.QPP.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reported["QPP"] = [QPP_reported_names[qpp] for qpp in df_reported["QPP"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reported["rel_type"] = [rel_type_names[rel_type] for rel_type in df_reported["rel_type"]]


array(['ICTFavg', 'IDFavg', 'PMIavg', 'QL', 'QS', 'SCQavg', 'SCS-1',
       'SCS-2', 'VARavg', 'Clarity', 'NQC', 'sigma_max', 'sigma_x', 'SMV',
       'WIG', 'BERTQPP', 'qppBERT-PL', 'NQA-QPP', 'QPPGenRE'],
      dtype=object)

In [130]:
## significance tests: change across relevance definitions

import pandas as pd
import numpy as np
from scipy.stats import ttest_rel

results = []

metric_names = ["corrPearson", "corrKendall", 
                "corrSpearman", "smare",
                "snGeoRiskInvA5"]

dataset_names = ["kid-friend-en"]
rel_types = ["topical_relevance", "readability_relevance", "eduval_relevance", 
            "extended_relevance"]

QPP_reported = ["avgICTF", "IDF-avg", "PMI-avg", "ql", "QS", "SCQ-avg", "SCS-1", "SCS-2", "VAR-std-avg", "clarity-score-k100", "nqc-norm-k100", "sigma_max", "sigma_x0.5", "smv-norm-k100", "wig-norm-k5","BERTQPP-re-trained-ndcg@10-10", "qppBERTPL-re-trained-ndcg@10-4", "NQAQPP-re-trained-ndcg@10-4", "QPPGenRE-re-trained-n1000-ndcg@10"]

for dataset in dataset_names:
    for metric in metric_names:
        TR_path_name = f"./QPP_scores_new/output/{dataset}/topical_relevance/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
        READ_path_name = f"./QPP_scores_new/output/{dataset}/readability_relevance/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
        EDU_path_name = f"./QPP_scores_new/output/{dataset}/eduval_relevance/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
        ER_path_name = f"./QPP_scores_new/output/{dataset}/extended_relevance/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
        
        TR_df = pd.read_csv(TR_path_name)
        READ_df = pd.read_csv(READ_path_name)
        EDU_df = pd.read_csv(EDU_path_name)
        ER_df = pd.read_csv(ER_path_name)
        
        for predictor in QPP_reported:
            TR_row = np.ravel(TR_df.loc[TR_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())
            READ_row = np.ravel(READ_df.loc[READ_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())
            EDU_row = np.ravel(EDU_df.loc[EDU_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())
            ER_row = np.ravel(ER_df.loc[ER_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())

            TR_READ_pval = ttest_rel(TR_row, READ_row).pvalue
            TR_EDU_pval = ttest_rel(TR_row, EDU_row).pvalue
            TR_ER_pval = ttest_rel(TR_row, ER_row).pvalue
            READ_EDU_pval = ttest_rel(READ_row, EDU_row).pvalue
            READ_ER_pval = ttest_rel(READ_row, ER_row).pvalue
            EDU_ER_pval = ttest_rel(EDU_row, ER_row).pvalue

            pval_list = [TR_READ_pval, TR_EDU_pval, TR_ER_pval, READ_EDU_pval, READ_ER_pval, EDU_ER_pval]

            adjusted_significance = [(pval < 0.05/len(pval_list)) and (pval!=np.nan) for pval in pval_list]

            results.append([dataset, metric, predictor]+adjusted_significance)

results_df = pd.DataFrame(results, columns=["dataset", "metric", "QPP", "TR_READ", "TR_EDU", "TR_ER", "READ_EDU", "READ_ER", "EDU_ER"])
results_df
results_df.to_csv("./QPP_scores_new/output/sig_test_rel_def.csv", index=False)
results_df

Unnamed: 0,dataset,metric,QPP,TR_READ,TR_EDU,TR_ER,READ_EDU,READ_ER,EDU_ER
0,kid-friend-en,corrPearson,avgICTF,True,True,True,True,True,True
1,kid-friend-en,corrPearson,IDF-avg,True,True,True,True,True,True
2,kid-friend-en,corrPearson,PMI-avg,True,True,True,True,True,True
3,kid-friend-en,corrPearson,ql,True,True,True,True,True,True
4,kid-friend-en,corrPearson,QS,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
90,kid-friend-en,snGeoRiskInvA5,wig-norm-k5,True,True,True,True,True,True
91,kid-friend-en,snGeoRiskInvA5,BERTQPP-re-trained-ndcg@10-10,True,True,True,True,True,True
92,kid-friend-en,snGeoRiskInvA5,qppBERTPL-re-trained-ndcg@10-4,True,True,True,True,True,True
93,kid-friend-en,snGeoRiskInvA5,NQAQPP-re-trained-ndcg@10-4,True,True,True,True,True,False


In [1]:
## significance tests: difference across QPP ANOVA

import pandas as pd
from scipy.stats import f_oneway, tukey_hsd
import numpy as np

results = []

metric_names = ["corrPearson", "corrKendall", 
                "corrSpearman", "smare",
                "snGeoRiskInvA5"]

dataset_names = ["kid-friend-en", "requik"]
rel_types = ["topical_relevance", "readability_relevance", "eduval_relevance", 
            "extended_relevance"]

QPP_reported = ["avgICTF", "IDF-avg", "PMI-avg", "ql", "QS", "SCQ-avg", "SCS-1", "SCS-2", "VAR-std-avg", "clarity-score-k100", "nqc-norm-k100", "sigma_max", "sigma_x0.5", "smv-norm-k100", "wig-norm-k5","BERTQPP-re-trained-ndcg@10-10", "qppBERTPL-re-trained-ndcg@10-4", "NQAQPP-re-trained-ndcg@10-4", "QPPGenRE-re-trained-n1000-ndcg@10"]

QPP_reported_dict = dict(enumerate(QPP_reported))

QPP_reported_names = {
    "avgICTF":"ICTFavg", 
    "IDF-avg":"IDFavg", 
    "PMI-avg":"PMIavg", 
    "ql":"QL", 
    "QS":"QS", 
    "SCQ-avg": "SCQavg", 
    "SCS-1": "SCS-1", 
    "SCS-2": "SCS-2", 
    "VAR-std-avg": "VARavg", 
    "clarity-score-k100": "Clarity", 
    "nqc-norm-k100": "NQC", 
    "sigma_max": "sigma_max", 
    "sigma_x0.5": "sigma_x", 
    "smv-norm-k100": "SMV", 
    "wig-norm-k5": "WIG",
    "BERTQPP-re-trained-ndcg@10-10": "BERTQPP", 
    "qppBERTPL-re-trained-ndcg@10-4": "qppBERT-PL", 
    "NQAQPP-re-trained-ndcg@10-4": "NQA-QPP", 
    "QPPGenRE-re-trained-n1000-ndcg@10" : "QPPGenRE"
}

for dataset in dataset_names:
    for rel_type in rel_types:
        if dataset == "requik" and rel_type!="extended_relevance":
            continue
        for metric in metric_names:
            path_name = f"./QPP_scores_new/output/{dataset}/{rel_type}/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
            df = pd.read_csv(path_name)
            df_reported = df.loc[df["predictor"].isin(QPP_reported)].fillna(0)
            
            ictf = np.ravel(df_reported.loc[df_reported["predictor"]=="avgICTF"].drop(["predictor"], axis=1).to_numpy())
            idf = np.ravel(df_reported.loc[df_reported["predictor"]=="IDF-avg"].drop(["predictor"], axis=1).to_numpy())
            pmi = np.ravel(df_reported.loc[df_reported["predictor"]=="PMI-avg"].drop(["predictor"], axis=1).to_numpy())
            ql = np.ravel(df_reported.loc[df_reported["predictor"]=="ql"].drop(["predictor"], axis=1).to_numpy())
            qs = np.ravel(df_reported.loc[df_reported["predictor"]=="QS"].drop(["predictor"], axis=1).to_numpy())
            scq = np.ravel(df_reported.loc[df_reported["predictor"]=="SCQ-avg"].drop(["predictor"], axis=1).to_numpy())
            scs1 = np.ravel(df_reported.loc[df_reported["predictor"]=="SCS-1"].drop(["predictor"], axis=1).to_numpy())
            scs2 = np.ravel(df_reported.loc[df_reported["predictor"]=="SCS-2"].drop(["predictor"], axis=1).to_numpy())
            var = np.ravel(df_reported.loc[df_reported["predictor"]=="VAR-std-avg"].drop(["predictor"], axis=1).to_numpy())
            clarity = np.ravel(df_reported.loc[df_reported["predictor"]=="clarity-score-k100"].drop(["predictor"], axis=1).to_numpy())
            nqc = np.ravel(df_reported.loc[df_reported["predictor"]=="nqc-norm-k100"].drop(["predictor"], axis=1).to_numpy())
            sigma_max = np.ravel(df_reported.loc[df_reported["predictor"]=="sigma_max"].drop(["predictor"], axis=1).to_numpy())
            sigma_x = np.ravel(df_reported.loc[df_reported["predictor"]=="sigma_x0.5"].drop(["predictor"], axis=1).to_numpy())
            smv = np.ravel(df_reported.loc[df_reported["predictor"]=="smv-norm-k100"].drop(["predictor"], axis=1).to_numpy())
            wig = np.ravel(df_reported.loc[df_reported["predictor"]=="wig-norm-k5"].drop(["predictor"], axis=1).to_numpy())
            bertqpp = np.ravel(df_reported.loc[df_reported["predictor"]=="BERTQPP-re-trained-ndcg@10-10"].drop(["predictor"], axis=1).to_numpy())
            qppbertpl = np.ravel(df_reported.loc[df_reported["predictor"]=="qppBERTPL-re-trained-ndcg@10-4"].drop(["predictor"], axis=1).to_numpy())
            nqaqpp = np.ravel(df_reported.loc[df_reported["predictor"]=="NQAQPP-re-trained-ndcg@10-4"].drop(["predictor"], axis=1).to_numpy())
            qppgenre = np.ravel(df_reported.loc[df_reported["predictor"]=="QPPGenRE-re-trained-n1000-ndcg@10"].drop(["predictor"], axis=1).to_numpy())
            
            result = f_oneway(ictf, idf, pmi, ql, qs, scq, scs1, scs2, var, clarity, nqc, sigma_max, sigma_x, smv, wig, bertqpp, qppbertpl, nqaqpp, qppgenre)

            results.append([dataset, rel_type, metric, result.pvalue, result.pvalue<0.05])

results_df = pd.DataFrame(results, columns=["dataset", "rel_type", "metric", "pval", "significant"])
results_df.to_csv("./QPP_scores_new/output/sig_test_ANOVA_QPP.csv", index=False)
results_df

Unnamed: 0,dataset,rel_type,metric,pval,significant
0,kid-friend-en,topical_relevance,corrPearson,0.0,True
1,kid-friend-en,topical_relevance,corrKendall,0.0,True
2,kid-friend-en,topical_relevance,corrSpearman,0.0,True
3,kid-friend-en,topical_relevance,smare,0.0,True
4,kid-friend-en,topical_relevance,snGeoRiskInvA5,0.0,True
5,kid-friend-en,readability_relevance,corrPearson,0.0,True
6,kid-friend-en,readability_relevance,corrKendall,0.0,True
7,kid-friend-en,readability_relevance,corrSpearman,0.0,True
8,kid-friend-en,readability_relevance,smare,0.0,True
9,kid-friend-en,readability_relevance,snGeoRiskInvA5,0.0,True


In [2]:
## significance tests: difference across QPP posthoc

import pandas as pd
from scipy.stats import f_oneway, tukey_hsd
import numpy as np

results = []

metric_names = ["corrPearson", "corrKendall", 
                "corrSpearman", "smare",
                "snGeoRiskInvA5"]

dataset_names = ["kid-friend-en", "requik"]
rel_types = ["topical_relevance", "readability_relevance", "eduval_relevance", 
            "extended_relevance"]

QPP_reported = ["avgICTF", "IDF-avg", "PMI-avg", "ql", "QS", "SCQ-avg", "SCS-1", "SCS-2", "VAR-std-avg", "clarity-score-k100", "nqc-norm-k100", "sigma_max", "sigma_x0.5", "smv-norm-k100", "wig-norm-k5","BERTQPP-re-trained-ndcg@10-10", "qppBERTPL-re-trained-ndcg@10-4", "NQAQPP-re-trained-ndcg@10-4", "QPPGenRE-re-trained-n1000-ndcg@10"]

QPP_reported_dict = dict(enumerate(QPP_reported))

QPP_reported_names = {
    "avgICTF":"ICTFavg", 
    "IDF-avg":"IDFavg", 
    "PMI-avg":"PMIavg", 
    "ql":"QL", 
    "QS":"QS", 
    "SCQ-avg": "SCQavg", 
    "SCS-1": "SCS-1", 
    "SCS-2": "SCS-2", 
    "VAR-std-avg": "VARavg", 
    "clarity-score-k100": "Clarity", 
    "nqc-norm-k100": "NQC", 
    "sigma_max": "sigma_max", 
    "sigma_x0.5": "sigma_x", 
    "smv-norm-k100": "SMV", 
    "wig-norm-k5": "WIG",
    "BERTQPP-re-trained-ndcg@10-10": "BERTQPP", 
    "qppBERTPL-re-trained-ndcg@10-4": "qppBERT-PL", 
    "NQAQPP-re-trained-ndcg@10-4": "NQA-QPP", 
    "QPPGenRE-re-trained-n1000-ndcg@10" : "QPPGenRE"
}

for dataset in dataset_names:
    for rel_type in rel_types:
        if dataset == "requik" and rel_type != "extended_relevance":
            continue
        for metric in metric_names:
            path_name = f"./QPP_scores_new/output/{dataset}/{rel_type}/bootstrap/{dataset}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
            df = pd.read_csv(path_name)
            df_reported = df.loc[df["predictor"].isin(QPP_reported)].fillna(0)
            
            ictf = np.ravel(df_reported.loc[df_reported["predictor"]=="avgICTF"].drop(["predictor"], axis=1).to_numpy())
            idf = np.ravel(df_reported.loc[df_reported["predictor"]=="IDF-avg"].drop(["predictor"], axis=1).to_numpy())
            pmi = np.ravel(df_reported.loc[df_reported["predictor"]=="PMI-avg"].drop(["predictor"], axis=1).to_numpy())
            ql = np.ravel(df_reported.loc[df_reported["predictor"]=="ql"].drop(["predictor"], axis=1).to_numpy())
            qs = np.ravel(df_reported.loc[df_reported["predictor"]=="QS"].drop(["predictor"], axis=1).to_numpy())
            scq = np.ravel(df_reported.loc[df_reported["predictor"]=="SCQ-avg"].drop(["predictor"], axis=1).to_numpy())
            scs1 = np.ravel(df_reported.loc[df_reported["predictor"]=="SCS-1"].drop(["predictor"], axis=1).to_numpy())
            scs2 = np.ravel(df_reported.loc[df_reported["predictor"]=="SCS-2"].drop(["predictor"], axis=1).to_numpy())
            var = np.ravel(df_reported.loc[df_reported["predictor"]=="VAR-std-avg"].drop(["predictor"], axis=1).to_numpy())
            clarity = np.ravel(df_reported.loc[df_reported["predictor"]=="clarity-score-k100"].drop(["predictor"], axis=1).to_numpy())
            nqc = np.ravel(df_reported.loc[df_reported["predictor"]=="nqc-norm-k100"].drop(["predictor"], axis=1).to_numpy())
            sigma_max = np.ravel(df_reported.loc[df_reported["predictor"]=="sigma_max"].drop(["predictor"], axis=1).to_numpy())
            sigma_x = np.ravel(df_reported.loc[df_reported["predictor"]=="sigma_x0.5"].drop(["predictor"], axis=1).to_numpy())
            smv = np.ravel(df_reported.loc[df_reported["predictor"]=="smv-norm-k100"].drop(["predictor"], axis=1).to_numpy())
            wig = np.ravel(df_reported.loc[df_reported["predictor"]=="wig-norm-k5"].drop(["predictor"], axis=1).to_numpy())
            bertqpp = np.ravel(df_reported.loc[df_reported["predictor"]=="BERTQPP-re-trained-ndcg@10-10"].drop(["predictor"], axis=1).to_numpy())
            qppbertpl = np.ravel(df_reported.loc[df_reported["predictor"]=="qppBERTPL-re-trained-ndcg@10-4"].drop(["predictor"], axis=1).to_numpy())
            nqaqpp = np.ravel(df_reported.loc[df_reported["predictor"]=="NQAQPP-re-trained-ndcg@10-4"].drop(["predictor"], axis=1).to_numpy())
            qppgenre = np.ravel(df_reported.loc[df_reported["predictor"]=="QPPGenRE-re-trained-n1000-ndcg@10"].drop(["predictor"], axis=1).to_numpy())
            
            result = tukey_hsd(ictf, idf, pmi, ql, qs, scq, scs1, scs2, var, clarity, nqc, sigma_max, sigma_x, smv, wig, bertqpp, qppbertpl, nqaqpp, qppgenre)

            for i in range(len(QPP_reported)):
                for j in range(len(QPP_reported)):
                    if j>i:
                        results.append([dataset, rel_type, metric, QPP_reported_names[QPP_reported_dict[i]], QPP_reported_names[QPP_reported_dict[j]], result.pvalue[i][j], result.pvalue[i][j]<0.05])

results_df = pd.DataFrame(results, columns=["dataset", "rel_type", "metric", "QPP1", "QPP2", "pval", "significant"])
results_df.to_csv("./QPP_scores_new/output/sig_test_QPP.csv", index=False)
results_df

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


Unnamed: 0,dataset,rel_type,metric,QPP1,QPP2,pval,significant
0,kid-friend-en,topical_relevance,corrPearson,ICTFavg,IDFavg,0.0,True
1,kid-friend-en,topical_relevance,corrPearson,ICTFavg,PMIavg,0.0,True
2,kid-friend-en,topical_relevance,corrPearson,ICTFavg,QL,0.0,True
3,kid-friend-en,topical_relevance,corrPearson,ICTFavg,QS,0.0,True
4,kid-friend-en,topical_relevance,corrPearson,ICTFavg,SCQavg,0.0,True
...,...,...,...,...,...,...,...
4270,requik,extended_relevance,snGeoRiskInvA5,BERTQPP,NQA-QPP,0.0,True
4271,requik,extended_relevance,snGeoRiskInvA5,BERTQPP,QPPGenRE,0.0,True
4272,requik,extended_relevance,snGeoRiskInvA5,qppBERT-PL,NQA-QPP,0.0,True
4273,requik,extended_relevance,snGeoRiskInvA5,qppBERT-PL,QPPGenRE,0.0,True


In [7]:
## significance tests: difference across datasets

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

results = []

metric_names = ["corrPearson", "corrKendall", 
                "corrSpearman", "smare",
                "snGeoRiskInvA5"]

dataset1 = "kid-friend-en"
dataset2 = "requik"

QPP_reported = ["avgICTF", "IDF-avg", "PMI-avg", "ql", "QS", "SCQ-avg", "SCS-1", "SCS-2", "VAR-std-avg", "clarity-score-k100", "nqc-norm-k100", "sigma_max", "sigma_x0.5", "smv-norm-k100", "wig-norm-k5","BERTQPP-re-trained-ndcg@10-10", "qppBERTPL-re-trained-ndcg@10-4", "NQAQPP-re-trained-ndcg@10-4", "QPPGenRE-re-trained-n1000-ndcg@10"]


for metric in metric_names:
    kid_friend_path_name = f"./QPP_scores_new/output/{dataset1}/extended_relevance/bootstrap/{dataset1}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
    requik_path_name = f"./QPP_scores_new/output/{dataset2}/extended_relevance/bootstrap/{dataset2}_predictors-porter-lucene-BM25_{metric}_bootstrap.csv"
    
    kid_friend_df = pd.read_csv(kid_friend_path_name)
    requik_df = pd.read_csv(requik_path_name)
    
    for predictor in QPP_reported:
        kid_friend_row = np.ravel(kid_friend_df.loc[kid_friend_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())
        requik_row = np.ravel(requik_df.loc[requik_df["predictor"]==predictor].drop(['predictor'], axis=1).to_numpy())

        pval = ttest_ind(kid_friend_row, requik_row).pvalue

        significance = (pval < 0.05) and (pval!=np.nan)

        results.append([metric, predictor, pval, significance])

results_df = pd.DataFrame(results, columns=["metric", "QPP", "pval", "signficant"])
results_df.to_csv("./QPP_scores_new/output/sig_test_datasets.csv", index=False)
results_df

Unnamed: 0,metric,QPP,pval,signficant
0,corrPearson,avgICTF,2.854401e-63,True
1,corrPearson,IDF-avg,2.839915e-211,True
2,corrPearson,PMI-avg,2.663201e-26,True
3,corrPearson,ql,6.012750e-35,True
4,corrPearson,QS,3.564084e-32,True
...,...,...,...,...
90,snGeoRiskInvA5,wig-norm-k5,9.224365e-89,True
91,snGeoRiskInvA5,BERTQPP-re-trained-ndcg@10-10,1.031597e-19,True
92,snGeoRiskInvA5,qppBERTPL-re-trained-ndcg@10-4,0.000000e+00,True
93,snGeoRiskInvA5,NQAQPP-re-trained-ndcg@10-4,0.000000e+00,True
