In [11]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau

In [12]:
# Load the search results; empty cells are replaced by the literal string "none".
df = pd.read_csv("hp_search_results.csv", index_col=0).fillna("none")

# Columns named after a metric family are scores, every other column is a hyper-parameter.
metric_prefixes = ("rouge", "meteor")
metrics = [col for col in df.columns if col.startswith(metric_prefixes)]
params = [col for col in df.columns if not col.startswith(metric_prefixes)]

# List the distinct values explored for each hyper-parameter.
for param in params:
    print(f"{param}\t{df[param].unique()}")
df.head(3)

summary_method	['none' 'chat-gpt' 'lex-rank']
summary_percentage	['none' 15.0 30.0]
ranking	['page\\_rank' 'word2vec' 'none']
ranking_how	['all' 'none']
ranking_perc_threshold	[15.0 30.0 'none']
entity	['dbpedia\\_spotlight' 'nps']
relation	['corenlp' 'rebel\\_hf' 'rebel\\_ft']


Unnamed: 0,summary_method,summary_percentage,ranking,ranking_how,ranking_perc_threshold,entity,relation,meteor_pr,meteor_re,meteor_f1,rouge-2_pr,rouge-2_re,rouge-2_f1
15,none,none,page\_rank,all,15.0,dbpedia\_spotlight,corenlp,37.0,21.2,26.2,1.8,23.6,3.0
116,none,none,page\_rank,all,15.0,dbpedia\_spotlight,rebel\_hf,35.3,20.4,25.4,2.1,23.7,3.8
129,none,none,page\_rank,all,15.0,dbpedia\_spotlight,rebel\_ft,33.0,21.4,25.4,3.6,21.6,6.0


In [15]:
# Dispatch table: correlation name -> scipy.stats function yielding (statistic, p-value).
method_to_func = dict(
    pearson=pearsonr,
    kendall=kendalltau,
    spearman=spearmanr,
)

def build_cols(metrics):
    """ Column names of the correlation table: `<metric>_corr` then `<metric>_pval`, per metric """
    return [f"{name}{suffix}" for name in metrics for suffix in ("_corr", "_pval")]

def get_correlations(df, method, params, metrics):
    """ Retrieve correlations between params and metrics

    Parameters
    ----------
    df : pandas.DataFrame holding one column per entry of `params` and `metrics`.
    method : key of `method_to_func` ("pearson", "kendall" or "spearman").
    params : names of the hyper-parameter columns (rows of the result).
    metrics : names of the metric columns (each yields a `_corr` and a `_pval` column).

    Returns
    -------
    (correlations, mappings)
        correlations : DataFrame indexed by `params` with the columns of `build_cols(metrics)`.
        mappings : {param: {value: integer code}} for every param that had to be label-encoded.

    Raises
    ------
    KeyError if `method` is not a key of `method_to_func`.
    """
    import numbers  # local import: only needed for the numeric check below

    corr_func = method_to_func[method]
    nan_pair = [float("nan"), float("nan")]

    data, mappings = [], {}
    for p in params:
        # The encoding of a parameter does not depend on the metric: compute it once.
        values = list(df[p].values)
        # Accept any real number (Python or numpy, int or float) as numeric so that
        # integer-valued parameters keep their actual values instead of being label-encoded.
        is_numeric = all(
            isinstance(val, numbers.Real) and not isinstance(val, bool) for val in values
        )
        if is_numeric:
            x = values
        else:
            # Label-encode by order of first appearance in `df[p]`.
            # NOTE: the codes therefore depend on the rows passed in, and for parameters
            # with more than two values the sign/size of the correlation depends on that
            # arbitrary order -- read it together with the returned mapping.
            curr_mapping = {val: index for index, val in enumerate(df[p].unique())}
            mappings[p] = curr_mapping
            x = [curr_mapping[elt] for elt in values]

        # A correlation is undefined when the parameter takes fewer than two distinct
        # values (constant column, or fewer than two rows): report NaN directly rather
        # than letting scipy warn on constant input or raise on too-short input.
        is_constant = len(set(x)) < 2

        curr_data = []
        for m in metrics:
            if is_constant:
                curr_data += nan_pair
            else:
                curr_data += list(corr_func(x, list(df[m].values)))
        data.append(curr_data)
    return pd.DataFrame(data, columns=build_cols(metrics), index=params), mappings

In [18]:
# Runs that used both a summarisation step and a ranking step.
corrs, mappings = get_correlations(
    df.loc[df["summary_method"].ne("none") & df["ranking"].ne("none")],
    'pearson',
    params,
    metrics,
)
# Show how the non-numeric parameters were encoded before reading the table.
for param_name, encoding in mappings.items():
    print(f"{param_name}:\t{encoding}")
corrs

summary_method:	{'chat-gpt': 0, 'lex-rank': 1}
ranking:	{'page\\_rank': 0, 'word2vec': 1}
ranking_how:	{'all': 0}
entity:	{'dbpedia\\_spotlight': 0, 'nps': 1}
relation:	{'rebel\\_ft': 0, 'corenlp': 1, 'rebel\\_hf': 2}




Unnamed: 0,meteor_pr_corr,meteor_pr_pval,meteor_re_corr,meteor_re_pval,meteor_f1_corr,meteor_f1_pval,rouge-2_pr_corr,rouge-2_pr_pval,rouge-2_re_corr,rouge-2_re_pval,rouge-2_f1_corr,rouge-2_f1_pval
summary_method,0.270497,0.007687618,-0.370778,0.0002002158,-0.006495,0.9499242,-0.380048,0.000134,0.309673,0.002138682,-0.214963,0.035443
summary_percentage,0.314392,0.001811288,-0.000757,0.994158,0.207331,0.04267358,-0.227694,0.025674,0.339991,0.0007011391,-0.072934,0.480074
ranking,-0.00128,0.9901229,-0.139073,0.1765798,-0.071941,0.4860856,-0.097942,0.342445,-0.012993,0.9000127,-0.094046,0.362078
ranking_how,,,,,,,,,,,,
ranking_perc_threshold,0.493992,3.142141e-07,-0.021454,0.8356374,0.327733,0.001115824,-0.322287,0.001363,0.479452,7.723788e-07,-0.00096,0.992596
entity,-0.629332,6.538059e-12,-0.620655,1.529271e-11,-0.742395,4.883483e-18,-0.128915,0.210647,-0.570838,1.259399e-09,-0.414571,2.7e-05
relation,0.240124,0.01844839,0.025348,0.8063411,0.169795,0.09815115,-0.3127,0.001923,0.279812,0.005761575,-0.096965,0.347304


In [19]:
# Average of every metric for each relation extractor.
df.groupby("relation")[metrics].mean()

Unnamed: 0_level_0,meteor_pr,meteor_re,meteor_f1,rouge-2_pr,rouge-2_re,rouge-2_f1
relation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
corenlp,28.019565,20.958696,22.382609,2.745652,15.345652,3.223913
rebel\_ft,26.025,22.452083,22.877083,6.202083,14.36875,6.745833
rebel\_hf,28.816667,22.158333,23.785417,4.46875,17.26875,5.735417


## When both summarisation + ranking
- `summary_method`: conflicting results for precision and recall. For meteor, lex-rank correlates with higher precision and chat-gpt with higher recall; for rouge-2 it is the opposite (chat-gpt higher precision, lex-rank higher recall). f1 correlation for meteor null, chat-gpt better for rouge-2 f1.
- `summary_percentage`: 30 better for meteor precision+f1 and rouge recall, 15 better for rouge precision
- `ranking`: no real difference for meteor nor rouge
- `ranking_how`: not applicable, only `all` for now
- `ranking_perc_threshold`: 30 better for meteor precision+f1, rouge recall, 15 better for rouge precision
- `entity`: dbpedia better for meteor precision+recall+f1, rouge recall+f1
- `relation`: rebel_hf better for meteor recall+f1, rebel_ft better for rouge precision 

In [21]:
# Runs without a summarisation step (ranking only).
corrs, mappings = get_correlations(
    df.loc[df["summary_method"].eq("none")],
    'pearson',
    params,
    metrics,
)
# Show how the non-numeric parameters were encoded before reading the table.
for param_name, encoding in mappings.items():
    print(f"{param_name}:\t{encoding}")
corrs

summary_method:	{'none': 0}
summary_percentage:	{'none': 0}
ranking:	{'page\\_rank': 0, 'word2vec': 1}
ranking_how:	{'all': 0}
entity:	{'dbpedia\\_spotlight': 0, 'nps': 1}
relation:	{'corenlp': 0, 'rebel\\_hf': 1, 'rebel\\_ft': 2}




Unnamed: 0,meteor_pr_corr,meteor_pr_pval,meteor_re_corr,meteor_re_pval,meteor_f1_corr,meteor_f1_pval,rouge-2_pr_corr,rouge-2_pr_pval,rouge-2_re_corr,rouge-2_re_pval,rouge-2_f1_corr,rouge-2_f1_pval
summary_method,,,,,,,,,,,,
summary_percentage,,,,,,,,,,,,
ranking,0.076438,0.735293,-0.536633,0.010028,-0.50262,0.017124,-0.238535,0.285048,0.016149,0.943136,-0.257574,0.2471632
ranking_how,,,,,,,,,,,,
ranking_perc_threshold,0.731257,0.00011,-0.124569,0.580721,0.220339,0.32446,-0.358692,0.101153,0.82814,2e-06,-0.325387,0.1394957
entity,-0.172542,0.442582,-0.366054,0.093845,-0.453173,0.03417,-0.063375,0.779334,-0.072551,0.748322,-0.081152,0.7195875
relation,-0.451431,0.034952,0.514555,0.014278,0.34692,0.113695,0.813643,4e-06,-0.315785,0.152246,0.846022,7.047395e-07


## When only ranking
- `ranking`: page_rank best for meteor recall+f1, rouge f1
- `ranking_how`: not applicable, only `all` for now
- `ranking_perc_threshold`: 30 best for meteor precision and rouge recall, 15 best for rouge f1
- `entity`: dbpedia better for meteor precision+recall+f1
- `relation`: corenlp better for meteor precision, rebel_ft better for meteor recall, rouge precision+f1

In [22]:
# Runs without a ranking step (summarisation only).
corrs, mappings = get_correlations(
    df.loc[df["ranking"].eq("none")],
    'pearson',
    params,
    metrics,
)
# Show how the non-numeric parameters were encoded before reading the table.
for param_name, encoding in mappings.items():
    print(f"{param_name}:\t{encoding}")
corrs

summary_method:	{'chat-gpt': 0, 'lex-rank': 1}
ranking:	{'none': 0}
ranking_how:	{'none': 0}
ranking_perc_threshold:	{'none': 0}
entity:	{'dbpedia\\_spotlight': 0, 'nps': 1}
relation:	{'rebel\\_hf': 0, 'rebel\\_ft': 1, 'corenlp': 2}




Unnamed: 0,meteor_pr_corr,meteor_pr_pval,meteor_re_corr,meteor_re_pval,meteor_f1_corr,meteor_f1_pval,rouge-2_pr_corr,rouge-2_pr_pval,rouge-2_re_corr,rouge-2_re_pval,rouge-2_f1_corr,rouge-2_f1_pval
summary_method,0.553891,0.004982,-0.439453,0.031664,-0.154276,0.471665,-0.441781,0.030669,0.557192,0.004677,-0.434902,0.033683
summary_percentage,0.547782,0.00559,-0.052385,0.807928,0.17801,0.405301,-0.334682,0.109917,0.555473,0.004834,-0.30864,0.142265
ranking,,,,,,,,,,,,
ranking_how,,,,,,,,,,,,
ranking_perc_threshold,,,,,,,,,,,,
entity,-0.346182,0.0975,-0.60534,0.001723,-0.717127,8e-05,-0.040162,0.852191,-0.237323,0.264157,-0.074822,0.728238
relation,0.078562,0.715192,-0.087327,0.684926,-0.062291,0.772466,-0.250039,0.238646,-0.031593,0.883488,-0.312141,0.137562


## When only summarising
- `summary_method`: lex-rank better for meteor precision and rouge recall, chat-gpt better for meteor recall and rouge precision+f1
- `summary_percentage`: 30 better for meteor precision, rouge recall
- `entity`: dbpedia better for meteor recall+f1
- `relation`: no strong results

In [23]:
# Mean of every metric, grouped in turn by each hyper-parameter.
mean_of_each_metric = {metric: "mean" for metric in metrics}
for param in params:
    print(df.groupby(param).agg(mean_of_each_metric))

                meteor_pr  meteor_re  meteor_f1  rouge-2_pr  rouge-2_re  \
summary_method                                                            
chat-gpt        24.428333  23.983333  23.001667    5.841667   12.251667   
lex-rank        27.800000  21.031667  22.806667    4.108333   15.715000   
none            35.800000  18.386364  23.677273    1.886364   24.840909   

                rouge-2_f1  
summary_method              
chat-gpt          6.150000  
lex-rank          5.093333  
none              3.309091  
                    meteor_pr  meteor_re  meteor_f1  rouge-2_pr  rouge-2_re  \
summary_percentage                                                            
15.0                24.233333  22.540000  22.125000    5.511667   12.136667   
30.0                27.995000  22.475000  23.683333    4.438333   15.830000   
none                35.800000  18.386364  23.677273    1.886364   24.840909   

                    rouge-2_f1  
summary_percentage              
15.0             