In [245]:
import pandas as pd
from scipy import stats
import itertools

In [246]:
P_ROUND = 3
C_ROUND = 3

In [247]:
all_sheets = pd.read_excel("./data/gemma-3-12b-single-sun.xlsx", sheet_name=None)
# List to hold dataframes with prefixed columns
dfs = []
for sheet_name, df in all_sheets.items():
    # Prefix columns with sheet name
    df_prefixed = df.add_prefix(f"{sheet_name}_")
    dfs.append(df_prefixed)
# Concatenate all dataframes column-wise
gemma = pd.concat(dfs, axis=1)
del dfs  # Free up memory

In [248]:
gemma = gemma.set_index("letters_new_names")
gemma.index.rename("names")
# Extract scores from Gemma spreadsheet
gemma = gemma.loc[:, gemma.filter(regex="score").columns]

gemma.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, Oliviu Barbu to Loredana Diaconu
Data columns (total 90 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   letters_formal_and_structural_score_1                   562 non-null    int64  
 1   letters_formal_and_structural_score_2                   562 non-null    int64  
 2   letters_formal_and_structural_score_3                   562 non-null    int64  
 3   letters_formal_and_structural_score_4                   562 non-null    int64  
 4   letters_formal_and_structural_score_5                   562 non-null    int64  
 5   letters_formal_and_structural_final_score               562 non-null    object 
 6   letters_relevance_and_thematic_fit_score_1              562 non-null    int64  
 7   letters_relevance_and_thematic_fit_score_2              562 non-null    int64  
 8   letters_relevance_and

In [249]:
gemma_scores = pd.DataFrame()
# Recalculate mean scores
for score in gemma.filter(regex="final_score").columns:
    label = score.replace("_final_score", "")
    cols = gemma.filter(regex=f"{label}_score").columns
    gemma_scores[label] = gemma[cols].mean(axis=1)

gemma_scores.fillna(0, inplace=True)
gemma_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, Oliviu Barbu to Loredana Diaconu
Data columns (total 15 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   letters_formal_and_structural               562 non-null    float64
 1   letters_relevance_and_thematic_fit          562 non-null    float64
 2   letters_professionalism_and_research        562 non-null    float64
 3   letters_uniqueness_and_individuality        562 non-null    float64
 4   letters_potential_and_developmental         562 non-null    float64
 5   presentations_formal_and_structural         562 non-null    float64
 6   presentations_relevance_and_thematic_fit    562 non-null    float64
 7   presentations_professionalism_and_research  562 non-null    float64
 8   presentations_uniqueness_and_individuality  562 non-null    float64
 9   presentations_potential_and_developmental   562 non-null    float64


In [250]:
human_scores = pd.read_parquet("./data/dataset.parquet")
human_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 577 entries, Adam Mazilescu to Ștefania Albu
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   cv                          567 non-null    object 
 1   letter                      568 non-null    object 
 2   presentation                564 non-null    object 
 3   cv_phd_1                    567 non-null    float64
 4   cv_phd_2                    567 non-null    float64
 5   letter_phd_1                567 non-null    float64
 6   letter_phd_2                567 non-null    float64
 7   pres_phd_1                  567 non-null    float64
 8   pres_phd_2                  567 non-null    float64
 9   pres_class                  567 non-null    float64
 10  video_phd_1                 383 non-null    float64
 11  video_phd_2                 409 non-null    float64
 12  all_phd_1                   567 non-null    float64
 13  all_phd_2        

In [251]:
# Compute mean scores
human_scores["phd_score"] = human_scores[["all_phd_1", "all_phd_2"]].mean(axis=1)
human_scores["cv_score"] = human_scores[["cv_phd_1", "cv_phd_2"]].mean(axis=1)
human_scores["pres_score"] = human_scores[["pres_phd_1", "pres_phd_2"]].mean(axis=1)
human_scores["letter_score"] = human_scores[["letter_phd_1", "letter_phd_2"]].mean(
    axis=1
)

human_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 577 entries, Adam Mazilescu to Ștefania Albu
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   cv                          567 non-null    object 
 1   letter                      568 non-null    object 
 2   presentation                564 non-null    object 
 3   cv_phd_1                    567 non-null    float64
 4   cv_phd_2                    567 non-null    float64
 5   letter_phd_1                567 non-null    float64
 6   letter_phd_2                567 non-null    float64
 7   pres_phd_1                  567 non-null    float64
 8   pres_phd_2                  567 non-null    float64
 9   pres_class                  567 non-null    float64
 10  video_phd_1                 383 non-null    float64
 11  video_phd_2                 409 non-null    float64
 12  all_phd_1                   567 non-null    float64
 13  all_phd_2        

### LLM scores vs PhD and Professors scores correlation

In [252]:
SELECTED_COLS = ["phd_score", "prof_score"]

In [253]:
def compute_correlations(
    output_table: pd.DataFrame,
    input_table: pd.DataFrame,
    type: str = "pearson",
    decimals: int = 3,
) -> pd.DataFrame:
    CORRELATIONS = {
        "pearson": stats.pearsonr,
        "spearman": stats.spearmanr,
        "kendall": stats.kendalltau,
    }
    assert type in CORRELATIONS.keys(), (
        f"The type of correlation must be one of them: {list(CORRELATIONS.keys())}. ",
        f"'{type}' given instead",
    )
    assert decimals > 0, "The number of decimals must be positive!"

    for row, col in itertools.product(output_table.index, output_table.columns):
        # Calculate Pearson correlation coefficient and p-value
        selection = input_table[[row, col]].copy().dropna(axis=0)
        coef, pvalue = stats.pearsonr(selection[row], selection[col])
        output_table.loc[row, col] = f"{coef:.{C_ROUND}f} ({pvalue:.{P_ROUND}f})"

    return output_table

In [254]:
overview_table = pd.merge(
    human_scores, gemma_scores, left_index=True, right_index=True, how="inner"
)

In [255]:
pearson = pd.DataFrame(index=gemma_scores.columns, columns=SELECTED_COLS)
pearson = compute_correlations(pearson, overview_table, "pearson")

In [256]:
spearman = pd.DataFrame(index=gemma_scores.columns, columns=SELECTED_COLS)
spearman = compute_correlations(spearman, overview_table, "spearman")

In [257]:
kendall = pd.DataFrame(index=gemma_scores.columns, columns=SELECTED_COLS)
kendall = compute_correlations(kendall, overview_table, "kendall")

In [258]:
pd.concat(
    [spearman, pearson, kendall], axis=1, keys=["spearman", "pearson", "kendalltau"]
)

Unnamed: 0_level_0,spearman,spearman,pearson,pearson,kendalltau,kendalltau
Unnamed: 0_level_1,phd_score,prof_score,phd_score,prof_score,phd_score,prof_score
letters_formal_and_structural,0.231 (0.000),-0.106 (0.298),0.231 (0.000),-0.106 (0.298),0.231 (0.000),-0.106 (0.298)
letters_relevance_and_thematic_fit,0.380 (0.000),0.100 (0.329),0.380 (0.000),0.100 (0.329),0.380 (0.000),0.100 (0.329)
letters_professionalism_and_research,0.317 (0.000),0.043 (0.677),0.317 (0.000),0.043 (0.677),0.317 (0.000),0.043 (0.677)
letters_uniqueness_and_individuality,0.371 (0.000),0.046 (0.652),0.371 (0.000),0.046 (0.652),0.371 (0.000),0.046 (0.652)
letters_potential_and_developmental,0.376 (0.000),0.017 (0.865),0.376 (0.000),0.017 (0.865),0.376 (0.000),0.017 (0.865)
presentations_formal_and_structural,0.035 (0.408),0.070 (0.495),0.035 (0.408),0.070 (0.495),0.035 (0.408),0.070 (0.495)
presentations_relevance_and_thematic_fit,0.030 (0.470),0.080 (0.432),0.030 (0.470),0.080 (0.432),0.030 (0.470),0.080 (0.432)
presentations_professionalism_and_research,0.036 (0.389),0.057 (0.575),0.036 (0.389),0.057 (0.575),0.036 (0.389),0.057 (0.575)
presentations_uniqueness_and_individuality,0.052 (0.220),0.098 (0.337),0.052 (0.220),0.098 (0.337),0.052 (0.220),0.098 (0.337)
presentations_potential_and_developmental,0.042 (0.319),0.062 (0.546),0.042 (0.319),0.062 (0.546),0.042 (0.319),0.062 (0.546)


### CV, Presenation and letters scores vs Phd and Professors scores Kendall tau

In [259]:
kendall_interdata = pd.DataFrame(
    index=["cv_score", "letter_score", "pres_score"],
    columns=["phd_score", "prof_score"],
)
for row, col in itertools.product(kendall_interdata.index, kendall_interdata.columns):
    # Calculate Kendall tau correlation coefficient and p-value
    selection = overview_table[[row, col]].copy().dropna(axis=0)
    coef, pvalue = stats.kendalltau(selection[row], selection[col])
    kendall_interdata.loc[row, col] = f"{coef:.{C_ROUND}f} ({pvalue:.{P_ROUND}f})"

kendall_interdata

Unnamed: 0,phd_score,prof_score
cv_score,0.486 (0.000),0.168 (0.024)
letter_score,0.497 (0.000),0.094 (0.222)
pres_score,0.677 (0.000),0.053 (0.480)


### Predictive strength of LLM scores

The predictive strength is defined as the correlation between LLM scores and results of online and offline tests.

In [260]:
spearman_test = pd.DataFrame(
    index=list(gemma_scores.columns) + ["phd_score", "prof_score"],
    columns=["online_test_score", "offline_test_total"],
)

spearman_test = compute_correlations(spearman_test, overview_table, "spearman")

In [261]:
kendall_test = pd.DataFrame(
    index=list(gemma_scores.columns) + ["phd_score", "prof_score"],
    columns=["online_test_score", "offline_test_total"],
)

kendall_test = compute_correlations(kendall_test, overview_table, "spearman")

In [262]:
pd.concat([spearman_test, kendall_test], axis=1, keys=["spearman", "kendalltau"] )

Unnamed: 0_level_0,spearman,spearman,kendalltau,kendalltau
Unnamed: 0_level_1,online_test_score,offline_test_total,online_test_score,offline_test_total
letters_formal_and_structural,0.051 (0.681),0.018 (0.902),0.051 (0.681),0.018 (0.902)
letters_relevance_and_thematic_fit,0.187 (0.127),0.003 (0.984),0.187 (0.127),0.003 (0.984)
letters_professionalism_and_research,0.124 (0.313),-0.056 (0.706),0.124 (0.313),-0.056 (0.706)
letters_uniqueness_and_individuality,0.089 (0.469),-0.077 (0.601),0.089 (0.469),-0.077 (0.601)
letters_potential_and_developmental,0.157 (0.200),-0.185 (0.208),0.157 (0.200),-0.185 (0.208)
presentations_formal_and_structural,-0.002 (0.985),-0.126 (0.395),-0.002 (0.985),-0.126 (0.395)
presentations_relevance_and_thematic_fit,-0.049 (0.691),-0.196 (0.183),-0.049 (0.691),-0.196 (0.183)
presentations_professionalism_and_research,-0.037 (0.767),-0.054 (0.716),-0.037 (0.767),-0.054 (0.716)
presentations_uniqueness_and_individuality,-0.094 (0.448),-0.181 (0.219),-0.094 (0.448),-0.181 (0.219)
presentations_potential_and_developmental,-0.055 (0.654),-0.184 (0.210),-0.055 (0.654),-0.184 (0.210)
