In [2]:
import sys 
import os 

# Extend the import search path two directories up; presumably this is what
# lets the `utils` package imported below resolve — TODO confirm repo layout.
sys.path.append('../..')

# Publish the data location through the environment *before* the `utils`
# imports run. NOTE(review): looks like `utils.data` reads DATA_DIR from the
# environment (possibly at import time) — confirm before reordering this cell.
DATA_DIR = '../../data'
os.environ['DATA_DIR'] = DATA_DIR

from utils.things import calc_correlation, get_all_vectorized
from utils.correlation import calc_semi_partial_correlation
from utils.data import load_gpt, load_cslb, load_mcrae, load_behav
import matplotlib.pyplot as plt 

# Load data 

In [3]:
# Loader settings shared by the feature-norm loaders below.
duplicates = True
group_to_one_concept = True
min_amount_runs_feature_occured_within_concept = 1
min_amount_runs_feature_occured = 4

# Build the name -> feature-norm mapping one entry at a time; the insertion
# order (McRae, CSLB, GPT-McRae) and the order of the loader calls are kept.
feature_norms = {}
feature_norms['McRae'] = load_mcrae(group_to_one_concept, duplicates)
feature_norms['CSLB'] = load_cslb(group_to_one_concept)
feature_norms['GPT-McRae'] = load_gpt(
    min_amount_runs_feature_occured,
    group_to_one_concept,
    min_amount_runs_feature_occured_within_concept,
    duplicates,
)

# Behavioural similarity data, then the vectorised feature norms together with
# the behavioural similarities matched to them ('count' vectorisation mode).
behav_sim = load_behav()
(feature_norms_vec, behav_sim_matched) = get_all_vectorized(
    feature_norms, behav_sim, 'count'
)


../../data


  warn(msg)


317 concepts are present in all feature norms




# Predicting human similarity judgements

## THINGS

In [4]:
# Correlation table between THINGS and every feature norm; `corr` is reused by
# the variance partitioning cells further down.
corr = calc_correlation(feature_norms_vec, behav_sim_matched)

# Display the table with a diverging colour map as the cell output.
corr_styled = corr.style.background_gradient(cmap='coolwarm')
corr_styled

McRae
(317, 2524)
50086
CSLB
(317, 5929)
50086
GPT-McRae
(317, 11720)
50086


Unnamed: 0,THINGS,McRae,CSLB,GPT-McRae
THINGS,1.0,0.559284,0.733578,0.621724
McRae,0.559284,1.0,0.707363,0.78917
CSLB,0.733578,0.707363,1.0,0.796077
GPT-McRae,0.621724,0.78917,0.796077,1.0


## Model performance with more runs

In [None]:
# Re-load the GPT norms ungrouped so that individual runs can be selected via
# the `run_nr` column below.
min_amount_runs_feature_occured = 1
group_to_one_concept = False
min_amount_runs_feature_occured_within_concept = 1
duplicates = True

gpt_df = load_gpt(min_amount_runs_feature_occured, group_to_one_concept, min_amount_runs_feature_occured_within_concept, duplicates)
# Same call shape as in the load-data cell (grouped, with the `duplicates` flag).
mc_df = load_mcrae(True, duplicates)
clsb_df = load_cslb(True)

# r[i - 1] holds the GPT-vs-THINGS correlation when only runs 1..i are pooled.
r = []
for i in range(1, 31):
    runs = list(range(1, i + 1))
    print('using runs 1-{}'.format(i))

    # Keep the selected runs and merge each concept's features into one
    # ';'-separated string.
    # NOTE(review): assumes this matches the grouped format that
    # get_all_vectorized expects from load_gpt(group_to_one_concept=True) — confirm.
    gpt_df_temp = gpt_df[gpt_df['run_nr'].isin(runs)]
    gpt_df_temp = gpt_df_temp.groupby('concept_id', as_index=False).agg({'feature': lambda x: ';'.join(x)})

    # Use the same dict-based API as the main analysis cells. The previous
    # version of this cell passed positional frames and an undefined
    # `behv_sim`, so it could not run. Run-specific names are used so the
    # `corr` table needed by the variance partitioning cells is not overwritten.
    feature_norms_runs = {
        'McRae': mc_df,
        'CSLB': clsb_df,
        'GPT-McRae': gpt_df_temp,
    }
    feature_norms_runs_vec, behv_sim_matched = get_all_vectorized(feature_norms_runs, behav_sim, 'binary')
    corr_runs = calc_correlation(feature_norms_runs_vec, behv_sim_matched)

    r_gpt_behav = corr_runs['THINGS']['GPT-McRae']
    r.append(r_gpt_behav)

In [None]:
# Correlation with THINGS as a function of how many GPT runs were pooled.
fig, ax = plt.subplots()
ax.plot(range(1, len(r) + 1), r)
ax.set_xlabel('Number of GPT runs included')
ax.set_ylabel('Correlation with THINGS similarity')
ax.set_title('Model performance with more runs');  # trailing ';' hides the text repr

# Variance Partitioning Analyses

In [9]:
def vpa(corr_with_things_1, corr_with_things_2, corr_1_with_2, name_1='GPT', name_2='McRae'):
    """Print a variance partitioning of THINGS between two feature norms.

    Three lines are printed: the unique variance of each feature norm (the
    squared result of ``calc_semi_partial_correlation``) and the variance the
    two share.

    Args:
        corr_with_things_1: Correlation of feature norm 1 with THINGS.
        corr_with_things_2: Correlation of feature norm 2 with THINGS.
        corr_1_with_2: Correlation between the two feature norms.
        name_1: Label printed for feature norm 1. Defaults to 'GPT'.
        name_2: Label printed for feature norm 2. Defaults to 'McRae'. Pass
            the real name (e.g. 'CSLB') when comparing against another norm;
            the labels used to be hard-coded and mislabelled such comparisons.

    Returns:
        None. The results are only printed.
    """
    # NOTE(review): assumes calc_semi_partial_correlation(a, b, c) removes the
    # second predictor from the first — inferred from its name and call order.
    semi_partial_1 = calc_semi_partial_correlation(corr_with_things_1, corr_with_things_2, corr_1_with_2)
    unique_variance_1 = semi_partial_1 ** 2
    print('unique variance {} (partial out {}): {:.4f}'.format(name_1, name_2, unique_variance_1))

    semi_partial_2 = calc_semi_partial_correlation(corr_with_things_2, corr_with_things_1, corr_1_with_2)
    unique_variance_2 = semi_partial_2 ** 2
    print('unique variance {} (partial out {}): {:.4f}'.format(name_2, name_1, unique_variance_2))

    # Shared part = squared correlation of norm 2 with THINGS minus the part
    # that is unique to norm 2.
    shared_variance = (corr_with_things_2 ** 2) - unique_variance_2
    print('shared variance between {} and {}: {:.4f}'.format(name_1, name_2, shared_variance))

In [10]:
# Variance partitioning of THINGS between the GPT-McRae and McRae feature norms.
r_things_gpt = corr['THINGS']['GPT-McRae']
r_things_mcrae = corr['THINGS']['McRae']
r_gpt_mcrae = corr['GPT-McRae']['McRae']
vpa(r_things_gpt, r_things_mcrae, r_gpt_mcrae)

unique variance GPT (partial out McRae): 0.0862
unique variance McRae (partial out GPT): 0.0125
shared variance between GPT and McRae: 0.3003


In [11]:
# Variance partitioning of THINGS between the GPT-McRae and CSLB feature norms.
# NOTE: called this way, vpa labels the second feature norm 'McRae' in its
# printed output; in this cell those lines refer to CSLB.
r_things_gpt = corr['THINGS']['GPT-McRae']
r_things_cslb = corr['THINGS']['CSLB']
r_gpt_cslb = corr['GPT-McRae']['CSLB']
vpa(r_things_gpt, r_things_cslb, r_gpt_cslb)

unique variance GPT (partial out McRae): 0.0039
unique variance McRae (partial out GPT): 0.1555
shared variance between GPT and McRae: 0.3827
