## Context Matters: A Theory of Semantic Discriminability for Perceptual Encoding Systems
### Laurent Lessard & Kushin Mukherjee

In [None]:
%matplotlib inline 
import numpy as np
import pandas as pd
import scipy
from scipy.optimize import linear_sum_assignment
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import cm
import math
import seaborn as sns
import warnings
import os
from scipy import stats
import seaborn as sns

warnings.filterwarnings('ignore')





In [None]:
"""
Input is a m x n array of (concepts) x (colors) filled with association ratings
Output is a list of length m with the index of the color that should be associated with each concept
"""
def assignment_solve( ratings, method="balanced", *, t=1 ):
    """
    Pick one color per concept by solving a linear assignment problem.

    Parameters
    ----------
    ratings : array-like, shape (m, n)
        (concepts) x (colors) array filled with association ratings.
        Requires m <= n.
    method : {"balanced", "isolated", "baseline"}
        Merit function fed to the assignment solver:
        * "isolated" - merit is the rating itself (only considers the target
          in isolation).
        * "balanced" - merit is the rating minus ``t`` times the highest
          rating any *other* concept has for the same color.
        * "baseline" - merit is minus the absolute difference between the
          rating and the highest off-target rating (uniformly bad assignment).
    t : float, keyword-only
        Penalty parameter, only used by "balanced". t=1 is the balanced
        case, t=0 recovers the isolated case.

    Returns
    -------
    numpy.ndarray of length m
        Index of the color assigned to each concept, in concept order.

    Raises
    ------
    AssertionError
        If there are more concepts than colors, or ``method`` is unknown.
        The exception type is kept from the original ``assert`` statements,
        but raised explicitly so the checks survive ``python -O``.
    """
    ratings = np.asarray(ratings, dtype=float)
    m, n = ratings.shape
    if m > n:
        raise AssertionError("More concepts than colors, assignment impossible!")

    if method == "isolated":
        merit_matrix = ratings

    elif method in ("balanced", "baseline"):
        # competitor[i, j] = highest rating for color j among all concepts
        # other than i.  With a single concept there is no off-target concept
        # (an empty max would raise), so the competitor rating is taken as 0.
        if m > 1:
            competitor = np.stack(
                [np.delete(ratings, i, axis=0).max(axis=0) for i in range(m)]
            )
        else:
            competitor = np.zeros((m, n))

        if method == "balanced":
            merit_matrix = ratings - t * competitor
        else:
            merit_matrix = -np.abs(ratings - competitor)

    else:
        raise AssertionError("unknown method in assignment problem")

    # Rows come back sorted, so col_ind[i] is the color chosen for concept i.
    row_ind, col_ind = linear_sum_assignment( merit_matrix, maximize=True )
    return col_ind


In [None]:
# Concept sets ("palettes"), four concepts each.  Later cells index `plist`
# with condition * 8 + (category number - 1), so p1-p8 are looked up for
# condition '0' and p9-p16 for condition '1'.
p1 = ['corn', 'carrot', 'grape', 'banana']
p2 = ['sleeping', 'driving', 'peach', 'cherry']
p3 = ['working', 'leisure', 'safety', 'comfort']
p4 = ['eggplant', 'celery', 'efficiency', 'speed']
p5 = ['working', 'leisure', 'grape', 'banana']
p6 = ['eggplant', 'celery', 'peach', 'cherry']
p7 = ['corn', 'carrot', 'safety', 'comfort']
p8 = ['sleeping', 'driving', 'efficiency', 'speed']

p9 = ['corn', 'carrot', 'peach', 'cherry']
p10 = ['sleeping', 'driving', 'grape', 'banana']
p11 = ['working', 'leisure', 'efficiency', 'speed']
p12 = ['eggplant', 'celery', 'safety', 'comfort']
p13 = ['working', 'leisure', 'peach', 'cherry']
p14 = ['eggplant', 'celery', 'grape', 'banana']
p15 = ['corn', 'carrot', 'efficiency', 'speed']
p16 = ['sleeping', 'driving', 'safety', 'comfort']

plist = [
    p1, p2, p3, p4, p5, p6, p7, p8,
    p9, p10, p11, p12, p13, p14, p15, p16,
]

# concept -> hex color, one dict per palette, index-aligned with `plist`.
true_assignments = [
    # p1 - p8
    {"banana": "#d0b85a", "carrot": "#cc4f1b", "corn": "#ffffff", "grape": "#512d5f"},
    {"cherry": "#ea1d1d", "driving": "#3efe44", "peach": "#f1a78a", "sleeping": "#512d5f"},
    {"comfort": "#a06776", "working": "#000000", "safety": "#55824d", "leisure": "#e81a4b"},
    {"celery": "#608218", "efficiency": "#5e78a1", "eggplant": "#600b84", "speed": "#e81a4b"},
    {"banana": "#fcdb42", "grape": "#600b84", "leisure": "#a0bae6", "working": "#3b3b3b"},
    {"celery": "#73cf10", "cherry": "#ea1d1d", "eggplant": "#600b84", "peach": "#f7a75a"},
    {"carrot": "#cc4f1b", "comfort": "#a06776", "corn": "#fcdb42", "safety": "#55824d"},
    {"driving": "#184415", "efficiency": "#7ec6ba", "sleeping": "#512d5f", "speed": "#e81a4b"},
    # p9 - p16
    {"carrot": "#cc4f1b", "cherry": "#ea1d1d", "corn": "#d5b811", "peach": "#f1a78a"},
    {"banana": "#d0b85a", "driving": "#ea1d1d", "grape": "#b62ef2", "sleeping": "#1c3d61"},
    {"efficiency": "#ffffff", "leisure": "#90689f", "speed": "#e81a4b", "working": "#3b3b3b"},
    {"celery": "#0e8a19", "comfort": "#a06776", "eggplant": "#600b84", "safety": "#d0e942"},
    {"cherry": "#ea1d1d", "leisure": "#d5a9e4", "peach": "#f1a78a", "working": "#3b3b3b"},
    {"banana": "#fcdb42", "celery": "#8cf47e", "eggplant": "#000000", "grape": "#5e2b3a"},
    {"carrot": "#ac6619", "corn": "#d0b85a", "efficiency": "#5e78a1", "speed": "#e81a4b"},
    {"comfort": "#a06776", "driving": "#ea1d1d", "safety": "#d0e942", "sleeping": "#000000"},
]
 

In [None]:
# Load the response table that the following cells analyse.  The path is
# relative to the notebook's working directory.  Later cells read (at least)
# the columns condition, category, bar_col, con_pal_conc, subject_id,
# response, answer, accuracy and total_accuracy from it.
clean_df_f=pd.read_csv('data/clean_df_f.csv')

In [None]:
# Give every row a palette index 0-15 that lines up with `plist` /
# `true_assignments`: condition '0' covers 0-7 and condition '1' covers 8-15,
# with the category label '"p1"' .. '"p8"' (the quotes are part of the stored
# value) selecting the offset inside the condition.
pal_lookup = {
    (str(cond), f'"p{k + 1}"'): cond * 8 + k
    for cond in range(2)
    for k in range(8)
}

pal = [
    pal_lookup.get(key)
    for key in zip(clean_df_f.condition, clean_df_f.category)
]

# A row with an unexpected condition/category used to be skipped silently and
# only surfaced as a pandas length-mismatch error on assignment; fail with a
# message that names the actual problem instead (still a ValueError).
if None in pal:
    raise ValueError(
        f"{pal.count(None)} row(s) of clean_df_f have a condition/category "
        "combination that does not map to a palette index"
    )

clean_df_f['pal'] = pal

In [None]:
# Per (bar color, con_pal_conc, subject, category, condition, response) group:
# number of rows divided by 8.
# NOTE(review): the 8 is presumably the number of repetitions per item, which
# would make this a proportion - confirm against the experiment design.
plot_df = (
    clean_df_f
    .groupby(['bar_col', 'con_pal_conc', 'subject_id', 'category', 'condition', 'response'])
    .apply(lambda x: len(x) / 8)
    .reset_index(name='num_times_picked')
)
# Identity map hex -> hex, used as the seaborn palette so each bar is drawn
# in the color it stands for.
col_dict = {a: a for a in plot_df.bar_col.unique()}

# Responses a subject never gave for a color do not show up in the groupby
# output.  Add them back as explicit zero rows (con_pal_conc = 'NA') so they
# count towards the means and error bars.
plot_df_backup = plot_df
missing_rows = []
# The loop variable is deliberately not called `pal`, so the palette-index
# list built in the previous cell is not overwritten.
for category in plot_df_backup.category.unique():
    in_category = plot_df_backup[plot_df_backup.category == category]
    for cond in in_category.condition.unique():
        in_condition = in_category[in_category.condition == cond]
        concepts = in_condition.response.unique()
        for col in in_condition.bar_col.unique():
            for_color = in_condition[in_condition.bar_col == col]
            for subj in for_color.subject_id.unique():
                chosen = set(for_color.loc[for_color.subject_id == subj, 'response'])
                missing_rows.extend(
                    [col, 'NA', subj, category, cond, conc, 0]
                    for conc in concepts
                    if conc not in chosen
                )

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; collect the rows and concatenate once.  ignore_index gives the
# padded rows unique labels (the row-by-row append left them all at label 0).
if missing_rows:
    plot_df = pd.concat(
        [plot_df, pd.DataFrame(missing_rows, columns=plot_df.columns)],
        ignore_index=True,
    )
                        

In [None]:
# One bar chart per (condition, palette): 2 conditions x 8 palettes = 16 panels
# stacked in a single column.
fig = plt.figure(figsize=(3, 25))
plt.rcParams['pdf.fonttype'] = 42

for i in range(2):          # condition
    for j in range(8):      # palette within the condition
        set_idx = i * 8 + j
        panel_df = plot_df[
            (plot_df.condition == f'{i}') & (plot_df.category == f'"p{j+1}"')
        ]

        # Concepts go on the x axis in palette order; the bars inside each
        # concept are ordered by the colors in true_assignments, in the same
        # concept order.
        panel_concepts = plist[set_idx]
        hue_ord = [true_assignments[set_idx][concept] for concept in panel_concepts]

        # NOTE(review): `ci=` is deprecated in recent seaborn releases in
        # favour of `errorbar=`; kept unchanged here.
        g = sns.barplot(
            ax=fig.add_subplot(16, 1, set_idx + 1),
            data=panel_df,
            color='black',
            x="response",
            y="num_times_picked",
            hue="bar_col",
            palette=col_dict,
            ci=68,
            order=panel_concepts,
            hue_order=hue_ord,
        )

        # Dashed reference line at 0.25 (presumably chance level with four
        # response options).
        plt.axhline(y=0.25, ls='--', c='black')

        ax = plt.gca()
        ax.get_legend().remove()
        plt.xlabel('')
        plt.ylabel('')
        plt.ylim(0, 1)
        plt.yticks([0, 0.5, 1])
        plt.tick_params(labelsize=10)

plt.tight_layout()

#plt.savefig(f'../human_data_faceted.pdf',format='pdf')
    


In [None]:
# Mean total_accuracy per (condition, category).
# Select the column *before* averaging: calling .agg('mean') on the whole
# grouped frame tries to average the non-numeric columns too, which raises a
# TypeError on pandas >= 2.0.  The resulting frame has the same three columns
# (condition, category, total_accuracy) as before.
set_acc_df = (
    clean_df_f
    .groupby(['condition', 'category'])['total_accuracy']
    .mean()
    .reset_index()
)

# Model set-level accuracies; drop the saved index column and the 'mciter'
# column, which later cells do not use.
model_set_acc = pd.read_csv('data/model_set_acc.csv')
model_set_acc = model_set_acc.drop(columns=['Unnamed: 0', 'mciter'])

In [None]:
# Put the model's set-level values next to the human set-level accuracies.
# NOTE(review): DataFrame.join aligns on the row index, not on palette
# identity - this presumes data/model_set_acc.csv lists its rows in the same
# (condition, category) order that the groupby produced; confirm.
p_df = set_acc_df.join(model_set_acc)
#p_df.filter(['pal','total_accuracy']).to_csv('../setwise_accuracies.csv')

In [None]:
# Human set-level accuracy (x) against the model's set-level value (y).
plt.figure(figsize=(8, 5))
plt.scatter(p_df.total_accuracy, p_df.total_acc, marker='o', s=80)

# Dashed identity line y = x for reference.
plt.axline((1, 1), (4, 4), linewidth=1, color='k', linestyle='--')

plt.xlim(1, 4)
plt.ylim(1, 4)
plt.xlabel("setwise accuracy", fontsize=20)
plt.ylabel("model prediction", fontsize=20)
plt.title('human data vs. model predictions \n for setwise accuracy', fontsize=20)
plt.tick_params(labelsize=20)
#plt.savefig('../setwise_fit.pdf', format = 'pdf')

# Cell output: 2x2 correlation matrix between human and model values.
np.corrcoef(p_df['total_accuracy'], p_df['total_acc'])

In [None]:
# Model item-level values: drop the saved index column and rename the value
# column to 'mean_prop_picked'.
all_item_accs = (
    pd.read_csv('data/all_item_acc.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'num_times_picked': 'mean_prop_picked'})
)

In [None]:
# Model values for all items, then split by palette index:
# pal 0-7 and pal 8-15 (the indices assigned to condition '0' and '1').
pred_proportions = all_item_accs.mean_prop_picked

pred_prop_0 = all_item_accs.loc[all_item_accs.pal.isin(range(0, 8)), 'mean_prop_picked']
pred_prop_1 = all_item_accs.loc[all_item_accs.pal.isin(range(8, 16)), 'mean_prop_picked']



In [None]:

# Mean of num_times_picked across subjects for every
# (condition, category, response, bar color) cell.
data_prop_df = (
    plot_df
    .groupby(['condition', 'category', 'response', 'bar_col'])['num_times_picked']
    .mean()
    .reset_index(name='mean_prop_picked')
)
data_proportions = data_prop_df.mean_prop_picked

# Per-condition subsets, the human-data counterparts of pred_prop_0 /
# pred_prop_1.  The per-condition scatter cells below use these names, and no
# other cell defines them.
# NOTE(review): pairing these with the model values relies on the model file
# listing its rows in the same sorted order as this groupby - confirm.
data_prop_0 = data_prop_df.loc[data_prop_df.condition == '0', 'mean_prop_picked']
data_prop_1 = data_prop_df.loc[data_prop_df.condition == '1', 'mean_prop_picked']


In [None]:
# Item-level comparison over all palettes: human value (x) vs model value (y).
plt.figure(figsize=(6, 4))
plt.scatter(data_proportions, pred_proportions, marker='o', s=80)

# axline is unbounded, so the line through (1, 1) and (4, 4) still draws the
# identity diagonal inside the [0, 1] window.
plt.axline((1, 1), (4, 4), linewidth=1, color='k', linestyle='--')

plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("data", fontsize=20)
plt.ylabel("model", fontsize=20)
plt.title('human data vs. model predictions \n for individual assignments', fontsize=20)
plt.tick_params(labelsize=20)

# Written one directory above the working directory.
plt.savefig('../overall_fit.pdf', format='pdf')

# Cell output: 2x2 correlation matrix between human and model values.
np.corrcoef(data_proportions, pred_proportions)


In [None]:
# Same item-level comparison for the second subset only: pred_prop_1 covers
# palette indices 8-15; data_prop_1 is expected to be the matching human
# values and must already be defined when this cell runs.
plt.figure(figsize=(6, 4))
plt.scatter(data_prop_1, pred_prop_1, marker='o', s=80)

# axline is unbounded, so the line through (1, 1) and (4, 4) still draws the
# identity diagonal inside the [0, 1] window.
plt.axline((1, 1), (4, 4), linewidth=1, color='k', linestyle='--')

plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("data", fontsize=20)
plt.ylabel("model", fontsize=20)
plt.title('human data vs. model predictions \n for individual assignments', fontsize=20)
plt.tick_params(labelsize=20)
#plt.savefig('../overall_fit.pdf', format = 'pdf')

# Cell output: 2x2 correlation matrix between human and model values.
np.corrcoef(data_prop_1, pred_prop_1)


In [None]:
# Same item-level comparison for the first subset only: pred_prop_0 covers
# palette indices 0-7; data_prop_0 is expected to be the matching human
# values and must already be defined when this cell runs.
plt.figure(figsize=(6, 4))
plt.scatter(data_prop_0, pred_prop_0, marker='o', s=80)

# axline is unbounded, so the line through (1, 1) and (4, 4) still draws the
# identity diagonal inside the [0, 1] window.
plt.axline((1, 1), (4, 4), linewidth=1, color='k', linestyle='--')

plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("data", fontsize=20)
plt.ylabel("model", fontsize=20)
plt.title('human data vs. model predictions \n for individual assignments', fontsize=20)
plt.tick_params(labelsize=20)
#plt.savefig('../overall_fit.pdf', format = 'pdf')

# Cell output: 2x2 correlation matrix between human and model values.
np.corrcoef(data_prop_0, pred_prop_0)


In [None]:
# Working copy with a combined "<palette index><concept>" key, e.g. '0banana'.
t = (
    clean_df_f
    .filter(items=['bar_col', 'answer', 'pal', 'con_pal_conc', 'accuracy'])
    .sort_values(['answer', 'pal'])
    .assign(pal=lambda d: d['pal'].astype('string'))
    .assign(pal_conc=lambda d: d['pal'] + d['answer'])
)

# pal_conc -> hex color.  'max' takes the lexicographically largest bar_col
# string within each key.
# NOTE(review): that is only a meaningful color if bar_col is constant within
# a pal_conc group - confirm.
cdict_df = t.groupby(['pal_conc'])['bar_col'].agg(['max']).reset_index()
cdict = {
    key: hex_col
    for key, hex_col in zip(cdict_df['pal_conc'].tolist(), cdict_df['max'].tolist())
}
#pd.DataFrame.from_dict(cdict,orient='index',columns=['hex']).rename_axis('pal_conc').reset_index().to_csv('../cdict.csv')

# Row-level accuracy table, sorted by answer and then palette.
item_df = (
    clean_df_f
    .filter(items=['answer', 'pal', 'accuracy'])
    .sort_values(['answer', 'pal'])
)

In [None]:
# Tag the human rows, switch 'pal' to string dtype, rename answer -> concept
# and add the "<palette index><concept>" key.
item_df = (
    item_df
    .assign(source='data', pal=lambda d: d['pal'].astype('string'))
    .rename(columns={'answer': 'concept'})
)
item_df['pal_conc'] = item_df['pal'] + item_df['concept']

# Model rows, minus the saved index column, stacked underneath the human rows.
model_item_df = pd.read_csv('data/model_item_df.csv').drop(columns=['Unnamed: 0'])
item_df_full = pd.concat([item_df, model_item_df])

In [None]:
# Give the stacked frame a fresh 0..N-1 index, in place.
# NOTE(review): without drop=True the previous index is kept as a new 'index'
# column, so running this cell more than once keeps adding columns (and can
# eventually raise) - confirm whether that column is needed downstream.
item_df_full.reset_index(inplace=True) ### has model and data rows vertically stacked

In [None]:
# Row-level table for the regression: one row per response, keyed by the same
# "<palette index><concept>" string used for the model table.
reg_df = (
    clean_df_f
    .filter(items=['subject_id', 'answer', 'repetition', 'bar_col', 'pal', 'accuracy'])
    .sort_values(['answer', 'pal'])
    .assign(pal=lambda d: d['pal'].astype('string'))
    .rename(columns={'answer': 'concept'})
)
reg_df['pal_conc'] = reg_df['pal'] + reg_df['concept']

# Inner-join the model rows on pal_conc.  'accuracy_y' is the name merge gives
# the right-hand frame's 'accuracy' column when both sides have one (assumes
# model_item_df carries an 'accuracy' column); it is renamed to 'distance'.
reg_df_avg = (
    reg_df
    .merge(model_item_df, on="pal_conc")
    .rename(columns={'accuracy_y': 'distance'})
)
#reg_df_avg.to_csv('../reg_df_avg.csv')

In [None]:
# Load both tables and drop their first column with iloc[:, 1:].
# NOTE(review): these paths start with 'data/data/' while every other file in
# this notebook is read from 'data/' - confirm the directory layout.
quartet_TVs = pd.read_csv('data/data/quartet_TVs_Hs.csv').iloc[:, 1:]
all_sem_d = pd.read_csv('data/data/all_semdists.csv').iloc[:, 1:]

# Keep only what follows the first 'V' in each color label (the second piece
# of the split).
all_sem_d.color1 = all_sem_d.color1.map(lambda label: label.split('V')[1])
all_sem_d.color2 = all_sem_d.color2.map(lambda label: label.split('V')[1])
