In [23]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

In [5]:
# Value list per experiment setting; the std and variance of each list are what
# describe the early nodes of that setting.
# NOTE(review): presumably these are the possible early-node rewards - confirm.
mapping = dict(
    cogsci_learning=[-8, -4, 4, 8],
    mini_variance=[-2, -1, 1, 2],
    zero_variance=[1, 1, 1, 1],
    large_variance=[-48, -24, 24, 48],
    high_increasing=[-4, -2, 2, 4],
)

In [6]:
# Experiment settings to include in the analysis, in the order they are processed.
considered_experiment_settings = [
    "high_increasing",
    "large_variance",
    "cogsci_learning",
    "mini_variance",
    "zero_variance",
]

In [20]:
# Load the simulated results of every considered experiment setting and stack them
# into a single frame, tagging each row with the setting it was read for.
# The per-setting frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all previously loaded rows on every iteration.
frames_per_setting = []
for experiment_setting in considered_experiment_settings:
    data_in = pd.read_csv(f"data/processed/simulated/{experiment_setting}/MCL/linear_depth/search_space/1729_depth_only_baseline_null.csv")
    data_in['experiment_setting'] = experiment_setting
    frames_per_setting.append(data_in)

# pd.concat raises on an empty list, so fall back to an empty frame, which is what
# the incremental concat produced when no settings were listed.
# The per-file row index is kept as-is (no ignore_index), matching the original.
data = pd.concat(frames_per_setting) if frames_per_setting else pd.DataFrame()


high_increasing
large_variance
cogsci_learning
mini_variance
zero_variance


In [21]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,w,taken_paths,costs,loss,ground_truth,trial_id,i_episode,actions,...,depth_cost_weight,pid,num_early,num_middle,num_late,num_clicks,unbounded_present_bias,unbounded_loss,state,experiment_setting
0,0,0,"[0, 0, 0, 0, 0]","[1, 2, 3]",[54.0],,"[0.0, -2.0, 8.0, 48.0, -24.0, 2.0, -4.0, -48.0...",3827219843471436276,0,0,...,2.5,0,False,False,False,False,0.0,0.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing
1,1,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,9,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing
2,2,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,1,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -4...",high_increasing
3,3,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,5,...,2.5,0,True,False,False,True,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -...",high_increasing
4,4,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,0,...,2.5,0,False,False,False,False,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, 4.0, Cat, Cat, Cat, -...",high_increasing


In [24]:
# Add the std and variance of each experiment setting's value list (from `mapping`)
# as per-row columns.
# The statistics are computed once per setting and then looked up per row, instead of
# calling np.std / np.var again for every single row. Both functions are used with
# their defaults (population statistics, ddof=0).
std_by_setting = {setting: np.std(values) for setting, values in mapping.items()}
variance_by_setting = {setting: np.var(values) for setting, values in mapping.items()}

# Fail loudly for settings without an entry in `mapping`; a plain .map() would
# silently fill NaN where the per-row lookup used to raise KeyError.
unmapped_settings = set(data["experiment_setting"].unique()) - set(mapping)
if unmapped_settings:
    raise KeyError(f"no entry in `mapping` for experiment settings: {unmapped_settings}")

data["std_early_nodes"] = data["experiment_setting"].map(std_by_setting)
data["variance_early_nodes"] = data["experiment_setting"].map(variance_by_setting)

In [26]:
# Preview the first rows of `data` again.
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,w,taken_paths,costs,loss,ground_truth,trial_id,i_episode,actions,...,num_early,num_middle,num_late,num_clicks,unbounded_present_bias,unbounded_loss,state,experiment_setting,std_early_nodes,variance_early_nodes
0,0,0,"[0, 0, 0, 0, 0]","[1, 2, 3]",[54.0],,"[0.0, -2.0, 8.0, 48.0, -24.0, 2.0, -4.0, -48.0...",3827219843471436276,0,0,...,False,False,False,False,0.0,0.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing,3.162278,10.0
1,1,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,9,...,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Ca...",high_increasing,3.162278,10.0
2,2,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,1,...,True,False,False,True,8.0,-28.0,"(0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -4...",high_increasing,3.162278,10.0
3,3,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,5,...,True,False,False,True,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, Cat, Cat, Cat, Cat, -...",high_increasing,3.162278,10.0
4,4,1,"[0.0, 0.0, 0.0, 15.5454392236338, 48.857094702...","[5, 6, 8]","[-1.0, -1.0, -1.0, 20.0]",,"[0.0, -2.0, -4.0, -24.0, -48.0, 4.0, -8.0, 24....",-4289689371375290671,1,0,...,False,False,False,False,8.0,-28.0,"(0, -2.0, Cat, Cat, Cat, 4.0, Cat, Cat, Cat, -...",high_increasing,3.162278,10.0


In [25]:
# Collapse the rows of `data` to one row per combination of the grouping keys by
# summing the remaining columns.
# numeric_only=True pins the aggregation to numeric/boolean columns. The default of
# this flag differs between pandas versions, and pandas >= 2.0 would otherwise also
# try to sum the non-numeric (string) columns.
# NOTE(review): every non-key numeric column is summed, including covariates such as
# depth_cost_weight. If such a column is constant within a group, the sum scales it by
# the group's row count - confirm this is intended before using it as a regressor.
group_keys = ["pid", "i_episode", "sim_cost_parameter_values", "sim_experiment_setting"]
grouped_data = data.groupby(group_keys, as_index=False).sum(numeric_only=True)

In [3]:
# Baseline OLS: num_early explained by i_episode, depth_cost_weight and their
# interaction (plus intercept), fitted on the grouped data.
formula = "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight + 1"
# `missing` is an option of the model constructor, not of .fit(); passed to .fit() it
# does not take part in NaN handling. Rows with missing values are still dropped
# either way, because "drop" is also the default for formula-based models.
res = smf.ols(formula=formula, data=grouped_data, missing="drop").fit()
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1182.0
Date:,"Fri, 21 Oct 2022",Prob (F-statistic):,0.0
Time:,13:13:39,Log-Likelihood:,-3371500.0
No. Observations:,2140000,AIC:,6743000.0
Df Residuals:,2139996,BIC:,6743000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8378,0.002,527.868,0.000,0.835,0.841
i_episode,-0.0015,2.77e-05,-52.451,0.000,-0.002,-0.001
i_episode:depth_cost_weight,-2.95e-05,2.18e-06,-13.545,0.000,-3.38e-05,-2.52e-05
depth_cost_weight,-0.0002,0.000,-1.812,0.070,-0.000,1.83e-05

0,1,2,3
Omnibus:,330752.63,Durbin-Watson:,1.936
Prob(Omnibus):,0.0,Jarque-Bera (JB):,487616.906
Skew:,1.155,Prob(JB):,0.0
Kurtosis:,2.633,Cond. No.,1420.0


In [4]:
# Extended OLS: the baseline terms plus sim_experiment_setting as a main effect and in
# interaction with i_episode, depth_cost_weight and their product.
# The formula is only split across lines for readability; the resulting string is
# unchanged, so term order and coefficient names stay the same.
formula = (
    "num_early ~ i_episode + i_episode:depth_cost_weight + depth_cost_weight"
    " + sim_experiment_setting + sim_experiment_setting:i_episode"
    " + sim_experiment_setting:i_episode:depth_cost_weight"
    " + sim_experiment_setting:depth_cost_weight + 1"
)
# `missing` is an option of the model constructor, not of .fit(); passed to .fit() it
# does not take part in NaN handling. Rows with missing values are still dropped
# either way, because "drop" is also the default for formula-based models.
res = smf.ols(formula=formula, data=grouped_data, missing="drop").fit()
res.summary()

0,1,2,3
Dep. Variable:,num_early,R-squared:,0.067
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,8078.0
Date:,"Fri, 21 Oct 2022",Prob (F-statistic):,0.0
Time:,13:13:50,Log-Likelihood:,-3299100.0
No. Observations:,2140000,AIC:,6598000.0
Df Residuals:,2139980,BIC:,6599000.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9328,0.004,227.647,0.000,0.925,0.941
sim_experiment_setting[T.high_increasing],-0.1692,0.005,-32.052,0.000,-0.180,-0.159
sim_experiment_setting[T.large_variance],0.2334,0.005,42.514,0.000,0.223,0.244
sim_experiment_setting[T.mini_variance],-0.2099,0.005,-42.028,0.000,-0.220,-0.200
sim_experiment_setting[T.zero_variance],-0.3013,0.006,-54.618,0.000,-0.312,-0.291
i_episode,-0.0025,7.15e-05,-34.820,0.000,-0.003,-0.002
sim_experiment_setting[T.high_increasing]:i_episode,0.0001,9.21e-05,1.321,0.186,-5.88e-05,0.000
sim_experiment_setting[T.large_variance]:i_episode,0.0053,9.59e-05,55.767,0.000,0.005,0.006
sim_experiment_setting[T.mini_variance]:i_episode,4.043e-06,8.72e-05,0.046,0.963,-0.000,0.000

0,1,2,3
Omnibus:,309941.694,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,469696.851
Skew:,1.147,Prob(JB):,0.0
Kurtosis:,2.926,Cond. No.,9760.0
