In [1]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# mapping experiment setting to variance and std on early node
mapping = {
            'cogsci_learning': [-8, -4, 4, 8],
            'mini_variance'  : [-2, -1, 1, 2],
            'zero_variance'  : [1, 1, 1, 1],
            'large_variance' : [-48, -24, 24, 48],
            'high_increasing': [-4, -2, 2, 4]
                          }

In [3]:
considered_experiment_settings = ["high_increasing", "large_variance", "cogsci_learning", "mini_variance", "zero_variance"]

In [4]:
# load data
data = pd.DataFrame()
for experiment_setting in considered_experiment_settings:
    data_in = pd.read_csv(f"data/processed/simulated/{experiment_setting}/MCL/linear_depth/search_space/1729_depth_only_baseline_null.csv")
    data_in['experiment_setting']= experiment_setting
    data = pd.concat([data, data_in])


In [5]:
# create a new column with variance and std of the experiment settings
data["std_early_nodes"] = data["experiment_setting"].apply(lambda experiment_setting: np.std(mapping[experiment_setting]))
data["variance_early_nodes"] = data["experiment_setting"].apply(lambda experiment_setting: np.var(mapping[experiment_setting]))

In [6]:
## normalization
# variance
data["norm_variance_early_nodes"] = (
                                    (data["variance_early_nodes"]-data["variance_early_nodes"].min())/
                                    (data["variance_early_nodes"].max()-data["variance_early_nodes"].min()))
# std
data["norm_std_early_nodes"]      = (
                                    (data["std_early_nodes"]-data["std_early_nodes"].min())/
                                    (data["std_early_nodes"].max()-data["std_early_nodes"].min()))
# data["norm_std_early_nodes"] = preprocessing.Normalizer(data["std_early_nodes"])
# data["norm_variance_early_nodes"] = preprocessing.Normalizer(data["variance_early_nodes"])

In [11]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,w,taken_paths,costs,loss,ground_truth,trial_id,i_episode,actions,...,num_late,num_clicks,unbounded_present_bias,unbounded_loss,state,experiment_setting,std_early_nodes,variance_early_nodes,norm_variance_early_nodes,norm_std_early_nodes
0,0,0,"[0, 0, 0, 0, 0]","[1, 2, 3]",[54.0],,"[0.0, -2.0, 8.0, 48.0, -24.0, 2.0, -4.0, -48.0...",3827219843471436276,0,0,...,False,False,0.0,0.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing,3.162278,10.0,0.006944,0.083333
1,1,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,9,...,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing,3.162278,10.0,0.006944,0.083333
2,2,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,1,...,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -4...",high_increasing,3.162278,10.0,0.006944,0.083333
3,3,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,5,...,False,True,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -...",high_increasing,3.162278,10.0,0.006944,0.083333
4,4,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,0,...,False,False,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, 4.0, Cat, Cat, Cat, -...",high_increasing,3.162278,10.0,0.006944,0.083333


In [14]:
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'w', 'taken_paths', 'costs', 'loss',
       'ground_truth', 'trial_id', 'i_episode', 'actions', 'return',
       'full_actions', 'sim_experiment_setting', 'sim_model_yaml',
       'sim_feature_yaml', 'sim_prior_json', 'sim_constant_yaml',
       'sim_cost_function', 'sim_cost_parameter_values', 'sim_num_simulated',
       'sim_num_trials', 'static_cost_weight', 'depth_cost_weight', 'pid',
       'num_early', 'num_middle', 'num_late', 'num_clicks',
       'unbounded_present_bias', 'unbounded_loss', 'state',
       'experiment_setting', 'std_early_nodes', 'variance_early_nodes',
       'norm_variance_early_nodes', 'norm_std_early_nodes'],
      dtype='object')

In [41]:
# group data by factors
grouped_data = data.groupby(["pid", "i_episode", "sim_cost_parameter_values", "sim_experiment_setting"], as_index=False).sum()

In [42]:
grouped_data.head()

Unnamed: 0.2,pid,i_episode,sim_cost_parameter_values,sim_experiment_setting,Unnamed: 0,Unnamed: 0.1,loss,trial_id,actions,return,...,num_early,num_middle,num_late,num_clicks,unbounded_present_bias,unbounded_loss,std_early_nodes,variance_early_nodes,norm_variance_early_nodes,norm_std_early_nodes
0,0,0,"-1.0,2.5",cogsci_learning,5699016.0,0.0,0.0,2.439592e+19,8.0,120.0,...,1.0,0.0,1.0,2.0,0.0,0.0,18.973666,120.0,0.083333,0.5
1,0,0,"-1.0,2.5",high_increasing,3495651.0,0.0,0.0,-1.702437e+19,11.0,31.0,...,0.0,0.0,1.0,1.0,0.0,-16.0,6.324555,20.0,0.013889,0.166667
2,0,0,"-1.0,2.5",large_variance,4156537.0,0.0,0.0,-1.118975e+19,7.0,-77.0,...,0.0,0.0,1.0,1.0,0.0,-224.0,75.894664,2880.0,2.0,2.0
3,0,0,"-1.0,2.5",mini_variance,5119206.0,0.0,0.0,-1.149878e+19,8.0,-189.0,...,1.0,0.0,1.0,2.0,6.0,-210.0,4.743416,7.5,0.005208,0.125
4,0,0,"-1.0,2.5",zero_variance,3645513.0,0.0,0.0,1.813679e+19,2.0,-46.0,...,0.0,1.0,0.0,1.0,0.0,-144.0,0.0,0.0,0.0,0.0


### 1. early_node ~ depth_cost_weight and i_episode 

In [64]:
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,5183.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:17,Log-Likelihood:,-5242100.0
No. Observations:,3400000,AIC:,10480000.0
Df Residuals:,3399996,BIC:,10480000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7834,0.001,643.347,0.000,0.781,0.786
i_episode,-0.0015,2.13e-05,-68.849,0.000,-0.002,-0.001
i_episode:depth_cost_weight,-2.859e-05,1.85e-06,-15.440,0.000,-3.22e-05,-2.5e-05
depth_cost_weight,-0.0043,0.000,-40.353,0.000,-0.004,-0.004

0,1,2,3
Omnibus:,559576.601,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,895813.867
Skew:,1.257,Prob(JB):,0.0
Kurtosis:,2.939,Cond. No.,1290.0


In [65]:
# 1. cogsci_learning
cogsci_learning_grouped_data = grouped_data.loc[grouped_data['sim_experiment_setting'] == 'cogsci_learning']
res = smf.ols(formula=formula, data=cogsci_learning_grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,3639.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:25,Log-Likelihood:,-1045500.0
No. Observations:,680000,AIC:,2091000.0
Df Residuals:,679996,BIC:,2091000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7766,0.003,285.595,0.000,0.771,0.782
i_episode,-0.0024,4.75e-05,-50.243,0.000,-0.002,-0.002
i_episode:depth_cost_weight,3.22e-06,3.82e-06,0.844,0.399,-4.26e-06,1.07e-05
depth_cost_weight,-0.0103,0.000,-47.457,0.000,-0.011,-0.010

0,1,2,3
Omnibus:,123625.866,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,206557.958
Skew:,1.347,Prob(JB):,0.0
Kurtosis:,3.183,Cond. No.,1420.0


In [66]:
high_increasing_grouped_data = grouped_data.loc[grouped_data['sim_experiment_setting'] == 'high_increasing']
res = smf.ols(formula=formula, data=high_increasing_grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,3626.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:31,Log-Likelihood:,-1008000.0
No. Observations:,680000,AIC:,2016000.0
Df Residuals:,679996,BIC:,2016000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7231,0.003,281.930,0.000,0.718,0.728
i_episode,-0.0029,4.48e-05,-65.052,0.000,-0.003,-0.003
i_episode:depth_cost_weight,-0.0001,4.41e-06,-26.155,0.000,-0.000,-0.000
depth_cost_weight,-0.0046,0.000,-18.436,0.000,-0.005,-0.004

0,1,2,3
Omnibus:,160977.288,Durbin-Watson:,1.932
Prob(Omnibus):,0.0,Jarque-Bera (JB):,301599.642
Skew:,1.57,Prob(JB):,0.0
Kurtosis:,3.883,Cond. No.,1130.0


In [67]:
# 3. large_variance
large_variance_grouped_data = grouped_data.loc[grouped_data['sim_experiment_setting'] == 'large_variance']
res = smf.ols(formula=formula, data=large_variance_grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,1160.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:35,Log-Likelihood:,-1098600.0
No. Observations:,680000,AIC:,2197000.0
Df Residuals:,679996,BIC:,2197000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0617,0.003,361.682,0.000,1.056,1.067
i_episode,0.0026,5.13e-05,51.596,0.000,0.003,0.003
i_episode:depth_cost_weight,7.885e-05,3.7e-06,21.306,0.000,7.16e-05,8.61e-05
depth_cost_weight,-0.0062,0.000,-29.171,0.000,-0.007,-0.006

0,1,2,3
Omnibus:,7290789.236,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76724.692
Skew:,0.446,Prob(JB):,0.0
Kurtosis:,1.617,Cond. No.,1580.0


In [68]:
# 4. mini_variance
mini_variance_grouped_data = grouped_data.loc[grouped_data['sim_experiment_setting'] == 'mini_variance']
res = smf.ols(formula=formula, data=mini_variance_grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,2068.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:39,Log-Likelihood:,-1000800.0
No. Observations:,680000,AIC:,2002000.0
Df Residuals:,679996,BIC:,2002000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6920,0.003,272.639,0.000,0.687,0.697
i_episode,-0.0024,4.43e-05,-53.733,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-9.712e-05,4.18e-06,-23.212,0.000,-0.000,-8.89e-05
depth_cost_weight,-0.0021,0.000,-8.812,0.000,-0.003,-0.002

0,1,2,3
Omnibus:,164057.367,Durbin-Watson:,1.876
Prob(Omnibus):,0.0,Jarque-Bera (JB):,310655.061
Skew:,1.588,Prob(JB):,0.0
Kurtosis:,3.939,Cond. No.,1170.0


In [69]:
# 5. zero_variance
zero_variance_grouped_data = grouped_data.loc[grouped_data['sim_experiment_setting'] == 'zero_variance']
res = smf.ols(formula=formula, data=zero_variance_grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1623.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:20:42,Log-Likelihood:,-986470.0
No. Observations:,680000,AIC:,1973000.0
Df Residuals:,679996,BIC:,1973000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6474,0.002,260.199,0.000,0.643,0.652
i_episode,-0.0021,4.34e-05,-47.357,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-0.0001,4.41e-06,-23.092,0.000,-0.000,-9.31e-05
depth_cost_weight,0.0107,0.000,42.088,0.000,0.010,0.011

0,1,2,3
Omnibus:,187883.566,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,388989.177
Skew:,1.719,Prob(JB):,0.0
Kurtosis:,4.383,Cond. No.,1100.0


In [None]:
#print(cogsci_learning_grouped_data["sim_experiment_setting"].unique())
#print(high_increasing_grouped_data["sim_experiment_setting"].unique())
#print(large_variance_grouped_data["sim_experiment_setting"].unique())
#print(mini_variance_grouped_data["sim_experiment_setting"].unique())
#print(zero_variance_grouped_data["sim_experiment_setting"].unique())

### 2. early_node ~ experiment_setting


In [44]:
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + sim_experiment_setting + sim_experiment_setting:i_episode + sim_experiment_setting:i_episode:depth_cost_weight + sim_experiment_setting:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,10630.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,20:42:04,Log-Likelihood:,-5151800.0
No. Observations:,3400000,AIC:,10300000.0
Df Residuals:,3399980,BIC:,10300000.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7766,0.003,292.018,0.000,0.771,0.782
sim_experiment_setting[T.high_increasing],-0.0534,0.004,-14.228,0.000,-0.061,-0.046
sim_experiment_setting[T.large_variance],0.2851,0.004,75.876,0.000,0.278,0.292
sim_experiment_setting[T.mini_variance],-0.0845,0.004,-22.515,0.000,-0.092,-0.077
sim_experiment_setting[T.zero_variance],-0.1292,0.004,-34.385,0.000,-0.137,-0.122
i_episode,-0.0024,4.65e-05,-51.373,0.000,-0.002,-0.002
sim_experiment_setting[T.high_increasing]:i_episode,-0.0005,6.56e-05,-8.025,0.000,-0.001,-0.000
sim_experiment_setting[T.large_variance]:i_episode,0.0050,6.57e-05,76.660,0.000,0.005,0.005
sim_experiment_setting[T.mini_variance]:i_episode,4.513e-06,6.56e-05,0.069,0.945,-0.000,0.000

0,1,2,3
Omnibus:,562601.603,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,898401.102
Skew:,1.255,Prob(JB):,0.0
Kurtosis:,3.199,Cond. No.,7580.0




### 3. early_node ~ norm_variance_early_nodes

- All environment_settings

In [45]:
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + norm_variance_early_nodes + norm_variance_early_nodes:i_episode + norm_variance_early_nodes:i_episode:depth_cost_weight + norm_variance_early_nodes:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.119
Model:,OLS,Adj. R-squared:,0.119
Method:,Least Squares,F-statistic:,65650.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,20:42:23,Log-Likelihood:,-5034400.0
No. Observations:,3400000,AIC:,10070000.0
Df Residuals:,3399992,BIC:,10070000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6485,0.001,525.696,0.000,0.646,0.651
i_episode,-0.0019,2.16e-05,-88.906,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-4.081e-05,2.03e-06,-20.108,0.000,-4.48e-05,-3.68e-05
depth_cost_weight,-0.0015,0.000,-12.816,0.000,-0.002,-0.001
norm_variance_early_nodes,0.1581,0.001,294.696,0.000,0.157,0.159
norm_variance_early_nodes:i_episode,0.0004,9.37e-06,40.127,0.000,0.000,0.000
norm_variance_early_nodes:i_episode:depth_cost_weight,1.916e-05,4.3e-07,44.563,0.000,1.83e-05,2e-05
norm_variance_early_nodes:depth_cost_weight,0.0004,2.35e-05,17.179,0.000,0.000,0.000

0,1,2,3
Omnibus:,619902.746,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1018112.631
Skew:,1.308,Prob(JB):,0.0
Kurtosis:,3.583,Cond. No.,6770.0


### 4. early_node ~ norm_std_early_nodes

- All experiment settings

In [63]:
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + norm_std_early_nodes + norm_std_early_nodes:i_episode + norm_std_early_nodes:i_episode:depth_cost_weight + norm_std_early_nodes:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.152
Model:,OLS,Adj. R-squared:,0.152
Method:,Least Squares,F-statistic:,86840.0
Date:,"Fri, 06 Jan 2023",Prob (F-statistic):,0.0
Time:,21:04:46,Log-Likelihood:,-4970300.0
No. Observations:,3400000,AIC:,9941000.0
Df Residuals:,3399992,BIC:,9941000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5810,0.001,461.654,0.000,0.579,0.583
i_episode,-0.0019,2.21e-05,-87.072,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-4.242e-05,2.11e-06,-20.117,0.000,-4.66e-05,-3.83e-05
depth_cost_weight,0.0004,0.000,3.634,0.000,0.000,0.001
norm_std_early_nodes,0.1918,0.001,356.089,0.000,0.191,0.193
norm_std_early_nodes:i_episode,0.0003,9.42e-06,28.334,0.000,0.000,0.000
norm_std_early_nodes:i_episode:depth_cost_weight,1.892e-05,4.4e-07,42.959,0.000,1.81e-05,1.98e-05
norm_std_early_nodes:depth_cost_weight,0.0003,2.4e-05,12.879,0.000,0.000,0.000

0,1,2,3
Omnibus:,605881.398,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,983329.869
Skew:,1.277,Prob(JB):,0.0
Kurtosis:,3.647,Cond. No.,7170.0


- Individual experiment setting