In [29]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [3]:
# mapping experiment setting to variance and std on early node
mapping = {
            'cogsci_learning': [-8, -4, 4, 8],
            'mini_variance'  : [-2, -1, 1, 2],
            'zero_variance'  : [1, 1, 1, 1],
            'large_variance' : [-48, -24, 24, 48],
            'high_increasing': [-4, -2, 2, 4]
                          }

In [4]:
considered_experiment_settings = ["high_increasing", "large_variance", "cogsci_learning", "mini_variance", "zero_variance"]

In [5]:
# load data
data = pd.DataFrame()
for experiment_setting in considered_experiment_settings:
    data_in = pd.read_csv(f"data/processed/simulated/{experiment_setting}/MCL/linear_depth/search_space/1729_depth_only_baseline_null.csv")
    data_in['experiment_setting']= experiment_setting
    data = pd.concat([data, data_in])


In [6]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,w,taken_paths,costs,loss,ground_truth,trial_id,i_episode,actions,...,depth_cost_weight,pid,num_early,num_middle,num_late,num_clicks,unbounded_present_bias,unbounded_loss,state,experiment_setting
0,0,0,"[0, 0, 0, 0, 0]","[1, 2, 3]",[54.0],,"[0.0, -2.0, 8.0, 48.0, -24.0, 2.0, -4.0, -48.0...",3827219843471436276,0,0,...,2.5,0,False,False,False,False,0.0,0.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing
1,1,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,9,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing
2,2,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,1,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -4...",high_increasing
3,3,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,5,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -...",high_increasing
4,4,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,0,...,2.5,0,False,False,False,False,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, 4.0, Cat, Cat, Cat, -...",high_increasing


In [7]:
# create a new column with variance and std of the experiment settings
data["std_early_nodes"] = data["experiment_setting"].apply(lambda experiment_setting: np.std(mapping[experiment_setting]))
data["variance_early_nodes"] = data["experiment_setting"].apply(lambda experiment_setting: np.var(mapping[experiment_setting]))

In [51]:
## normalization
# variance
data["norm_variance_early_nodes"] = (
                                    (data["variance_early_nodes"]-data["variance_early_nodes"].min())/
                                    (data["variance_early_nodes"].max()-data["variance_early_nodes"].min()))
# std
data["norm_std_early_nodes"]      = (
                                    (data["std_early_nodes"]-data["std_early_nodes"].min())/
                                    (data["std_early_nodes"].max()-data["std_early_nodes"].min()))
# data["norm_std_early_nodes"] = preprocessing.Normalizer(data["std_early_nodes"])
# data["norm_variance_early_nodes"] = preprocessing.Normalizer(data["variance_early_nodes"])

In [55]:
# group data by factors
grouped_data = data.groupby(["pid", "i_episode", "sim_cost_parameter_values", "sim_experiment_setting"], as_index=False).sum()

In [57]:
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,5183.0
Date:,"Mon, 19 Dec 2022",Prob (F-statistic):,0.0
Time:,10:52:32,Log-Likelihood:,-5242100.0
No. Observations:,3400000,AIC:,10480000.0
Df Residuals:,3399996,BIC:,10480000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7834,0.001,643.347,0.000,0.781,0.786
i_episode,-0.0015,2.13e-05,-68.849,0.000,-0.002,-0.001
i_episode:depth_cost_weight,-2.859e-05,1.85e-06,-15.440,0.000,-3.22e-05,-2.5e-05
depth_cost_weight,-0.0043,0.000,-40.353,0.000,-0.004,-0.004

0,1,2,3
Omnibus:,559576.601,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,895813.867
Skew:,1.257,Prob(JB):,0.0
Kurtosis:,2.939,Cond. No.,1290.0


In [58]:
# early_node_variance ~ experiment_setting
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + sim_experiment_setting + sim_experiment_setting:i_episode + sim_experiment_setting:i_episode:depth_cost_weight + sim_experiment_setting:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,10630.0
Date:,"Mon, 19 Dec 2022",Prob (F-statistic):,0.0
Time:,10:53:00,Log-Likelihood:,-5151800.0
No. Observations:,3400000,AIC:,10300000.0
Df Residuals:,3399980,BIC:,10300000.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7766,0.003,292.018,0.000,0.771,0.782
sim_experiment_setting[T.high_increasing],-0.0534,0.004,-14.228,0.000,-0.061,-0.046
sim_experiment_setting[T.large_variance],0.2851,0.004,75.876,0.000,0.278,0.292
sim_experiment_setting[T.mini_variance],-0.0845,0.004,-22.515,0.000,-0.092,-0.077
sim_experiment_setting[T.zero_variance],-0.1292,0.004,-34.385,0.000,-0.137,-0.122
i_episode,-0.0024,4.65e-05,-51.373,0.000,-0.002,-0.002
sim_experiment_setting[T.high_increasing]:i_episode,-0.0005,6.56e-05,-8.025,0.000,-0.001,-0.000
sim_experiment_setting[T.large_variance]:i_episode,0.0050,6.57e-05,76.660,0.000,0.005,0.005
sim_experiment_setting[T.mini_variance]:i_episode,4.513e-06,6.56e-05,0.069,0.945,-0.000,0.000

0,1,2,3
Omnibus:,562601.603,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,898401.102
Skew:,1.255,Prob(JB):,0.0
Kurtosis:,3.199,Cond. No.,7580.0


In [59]:
# early_node_variance ~ norm_variance_early_nodes
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + norm_variance_early_nodes + norm_variance_early_nodes:i_episode + norm_variance_early_nodes:i_episode:depth_cost_weight + norm_variance_early_nodes:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.119
Model:,OLS,Adj. R-squared:,0.119
Method:,Least Squares,F-statistic:,65650.0
Date:,"Mon, 19 Dec 2022",Prob (F-statistic):,0.0
Time:,10:53:09,Log-Likelihood:,-5034400.0
No. Observations:,3400000,AIC:,10070000.0
Df Residuals:,3399992,BIC:,10070000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6485,0.001,525.696,0.000,0.646,0.651
i_episode,-0.0019,2.16e-05,-88.906,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-4.081e-05,2.03e-06,-20.108,0.000,-4.48e-05,-3.68e-05
depth_cost_weight,-0.0015,0.000,-12.816,0.000,-0.002,-0.001
norm_variance_early_nodes,0.1581,0.001,294.696,0.000,0.157,0.159
norm_variance_early_nodes:i_episode,0.0004,9.37e-06,40.127,0.000,0.000,0.000
norm_variance_early_nodes:i_episode:depth_cost_weight,1.916e-05,4.3e-07,44.563,0.000,1.83e-05,2e-05
norm_variance_early_nodes:depth_cost_weight,0.0004,2.35e-05,17.179,0.000,0.000,0.000

0,1,2,3
Omnibus:,619902.746,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1018112.631
Skew:,1.308,Prob(JB):,0.0
Kurtosis:,3.583,Cond. No.,6770.0


In [60]:
# early_node_variance ~ norm_std_early_nodes
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + norm_std_early_nodes + norm_std_early_nodes:i_episode + norm_std_early_nodes:i_episode:depth_cost_weight + norm_std_early_nodes:depth_cost_weight + 1"
res = smf.ols(formula=formula, data=grouped_data).fit(
                    missing="drop"
                )
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.152
Model:,OLS,Adj. R-squared:,0.152
Method:,Least Squares,F-statistic:,86840.0
Date:,"Mon, 19 Dec 2022",Prob (F-statistic):,0.0
Time:,10:53:38,Log-Likelihood:,-4970300.0
No. Observations:,3400000,AIC:,9941000.0
Df Residuals:,3399992,BIC:,9941000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5810,0.001,461.654,0.000,0.579,0.583
i_episode,-0.0019,2.21e-05,-87.072,0.000,-0.002,-0.002
i_episode:depth_cost_weight,-4.242e-05,2.11e-06,-20.117,0.000,-4.66e-05,-3.83e-05
depth_cost_weight,0.0004,0.000,3.634,0.000,0.000,0.001
norm_std_early_nodes,0.1918,0.001,356.089,0.000,0.191,0.193
norm_std_early_nodes:i_episode,0.0003,9.42e-06,28.334,0.000,0.000,0.000
norm_std_early_nodes:i_episode:depth_cost_weight,1.892e-05,4.4e-07,42.959,0.000,1.81e-05,1.98e-05
norm_std_early_nodes:depth_cost_weight,0.0003,2.4e-05,12.879,0.000,0.000,0.000

0,1,2,3
Omnibus:,605881.398,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,983329.869
Skew:,1.277,Prob(JB):,0.0
Kurtosis:,3.647,Cond. No.,7170.0
