### Prelude:

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

In [2]:
# import data
# Reads ssi-data-cleaned.csv into `data`; the path is relative to the
# notebook's working directory, so run the notebook from its own folder.
data = pd.read_csv("./../Datasets/ssi-data-cleaned.csv")

In [3]:
# Set up
# Label for each treatment_value code (0 is the no-framing arm)
treatments = dict(enumerate(["No framing",
                             "Negative science",
                             "Religious",
                             "Equity",
                             "Efficiency",
                             "Secular"]))
data["treatment_frame"] = data["treatment_value"].map(treatments)

# Party subsamples, taken after treatment_frame is added so they carry it too
data_rep = data[data['party'] == 1]
data_dem = data[data['party'] == -1]
data_ind = data[data['party'] == 0]

# Cross-validation settings
set_seed = 42
num_folds = 5

# Variable roles used by the estimators below
outcome_var = 'post_test'
control_var = 'pre_test'
treatment_vars = ["treatment_" + str(i) for i in range(1, 6)]
covariates_pre = ['gastax', 'carbtax', 'treaty', 'regcarb']
covariates = [
    'age', 'party_id', 'employment_status', 'race_white', 'income_level',
    'relationship', 'college', 'sex_id', 'prosociality',
    'gastax', 'carbtax', 'treaty', 'regcarb',
    'ideology', 'scientific_confidence', 'reward_consequence',
    'religiosity', 'rel_freq', 'economic_reasoning',
]

In [4]:
# Distribution of subjects across treatment conditions (like Table 1 from paper)
# Number of subjects for each (treatment_value, treatment_frame) pair
treatment_freq = data[["treatment_value", "treatment_frame"]].value_counts()
# Share of all subjects in each frame (normalize=True returns proportions)
treatment_rel_freq = data["treatment_frame"].value_counts(normalize=True)
# Counts ordered by treatment_value, with the proportions joined alongside
treatment_freq.to_frame().sort_index().join(treatment_rel_freq)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,proportion
treatment_value,treatment_frame,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No framing,396,0.168942
1,Negative science,395,0.168515
2,Religious,358,0.15273
3,Equity,402,0.171502
4,Efficiency,411,0.175341
5,Secular,382,0.162969


### Means tables:

In [5]:
# Mean pre_test and post_test score within each treatment condition
pd.pivot_table(data, values=['pre_test', 'post_test', ],
               index=['treatment_value','treatment_frame'],
               aggfunc=['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,post_test,pre_test
treatment_value,treatment_frame,Unnamed: 2_level_2,Unnamed: 3_level_2
0,No framing,1.632576,1.630682
1,Negative science,1.613291,1.543038
2,Religious,1.590084,1.556564
3,Equity,1.606343,1.558458
4,Efficiency,1.633212,1.546229
5,Secular,1.618455,1.546466


#### By political party (-1 is Dem, 1 is Rep, 0 is Independent)

In [6]:
# Mean pre_test and post_test score for each party / treatment-frame pair
pd.pivot_table(data, values=["post_test", 'pre_test'],
               index=["party", "treatment_frame"], aggfunc=['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,post_test,pre_test
party,treatment_frame,Unnamed: 2_level_2,Unnamed: 3_level_2
-1,Efficiency,2.16276,2.039062
-1,Equity,2.048429,1.971204
-1,Negative science,2.096591,1.984375
-1,No framing,2.032552,2.005208
-1,Religious,2.08642,2.07716
-1,Secular,2.119505,2.020604
0,Efficiency,1.4125,1.325
0,Equity,1.526515,1.462121
0,Negative science,1.527778,1.531746
0,No framing,1.479839,1.5


### Functions

#### Lin estimators

In [7]:
# with multiple treatments
def lin_estimator_mult_treat_formula(data, y_var, treatment_vars, covariates):
    """
    Fit a Lin-style covariate-adjusted regression with several treatment dummies.

    Every covariate is centred at its sample mean and interacted with every
    treatment indicator. The model is fit by OLS on a copy of `data` (the
    caller's frame is not modified) and returned with HC3 robust covariance.

    Inputs:
        data: pandas dataframe containing all x and y columns
        y_var: name of y variable
        treatment_vars: list of treatment dummy variable names
        covariates: list of string names of covariates

    Returns: (results, formula)
        results: fitted statsmodels results object with cov_type="HC3"
        formula: the formula string that was fit
    """
    df = data.copy()

    # Centre every covariate, binary ones included. An earlier
    # "skip binary variables" test compared max() to both 1 and 0, so it was
    # never true and all covariates were always centred; that behaviour is
    # kept here because centring is what lets the treatment coefficients be
    # read as effects at the covariate means.
    for cov in covariates:
        # Series.mean() skips NaN, so missing values simply stay missing
        df[cov + '_demeaned'] = df[cov] - df[cov].mean()

    # Main effect of each treatment indicator
    treatments_formula = " + ".join(treatment_vars)

    # `a * b` in a formula expands to a + b + a:b, so each term below also
    # brings in the centred covariate itself
    interactions = [
        f"{cov}_demeaned * {treatment}"
        for treatment in treatment_vars
        for cov in covariates
    ]
    interactions_formula = " + ".join(interactions)

    formula = f"{y_var} ~ {treatments_formula} + {interactions_formula}"

    # Fit by OLS, then swap in the HC3 robust covariance
    model = sm.OLS.from_formula(formula, data=df).fit()
    return model.get_robustcov_results(cov_type="HC3"), formula

# with one treatment
def lin_estimator_formula(data, y_var, treatment_var, covariates):
    """
    Fit a Lin-style covariate-adjusted regression with a single treatment dummy.

    Every covariate is centred at its sample mean and interacted with the
    treatment indicator. The model is fit by OLS on a copy of `data` (the
    caller's frame is not modified) and returned with HC3 robust covariance.

    Inputs:
        data: pandas dataframe containing all x and y columns
        y_var: name of y variable
        treatment_var: single treatment variable
        covariates: list of string names of covariates

    Returns: (results, formula)
        results: fitted statsmodels results object with cov_type="HC3"
        formula: the formula string that was fit
    """
    df = data.copy()

    # Centre every covariate, binary ones included. An earlier
    # "skip binary variables" test compared max() to both 1 and 0, so it was
    # never true and all covariates were always centred; that behaviour is
    # kept here because centring is what lets the treatment coefficient be
    # read as the effect at the covariate means.
    for cov in covariates:
        # Series.mean() skips NaN, so missing values simply stay missing
        df[cov + '_demeaned'] = df[cov] - df[cov].mean()

    # One "covariate + covariate * treatment" term per covariate; the
    # explicit main effect is redundant with `*` but is kept so the returned
    # formula string is unchanged
    interactions = [
        f"{cov}_demeaned + {cov}_demeaned * {treatment_var}"
        for cov in covariates
    ]
    interactions_formula = " + ".join(interactions)

    formula = f"{y_var} ~ {treatment_var} + {interactions_formula}"

    # Fit by OLS, then swap in the HC3 robust covariance
    model = sm.OLS.from_formula(formula, data=df).fit()
    return model.get_robustcov_results(cov_type="HC3"), formula

#### Other estimation helpers

In [8]:
# Function to extract treatment effects from model
def extract_treatment_effects(model, treatment_vars):
    """
    Pull the coefficient of each treatment variable out of a fitted model.

    Coefficient names come from `model.model.exog_names`, paired in order with
    `model.params`. A treatment with no matching coefficient (or whose
    coefficient is None) is reported as 0.

    Returns: dict mapping treatment variable name -> coefficient
    """
    coef_by_name = {}
    for name, value in zip(model.model.exog_names, model.params):
        coef_by_name[name] = value

    effects = {}
    for var in treatment_vars:
        estimate = coef_by_name.get(var)
        effects[var] = 0 if estimate is None else estimate
    return effects

# Function to find the best treatment
def find_best_treatment(effects):
    """
    Return the key of `effects` with the largest value.

    Returns None when `effects` is empty or None. On ties the key that
    appears first in the dict wins.
    """
    if not effects:
        return None
    return max(effects, key=lambda name: effects.get(name))

# Function to assign the best treatment indicator
def assign_best_treatment_indicator(test_data, best_treatment):
    """
    Add best / not-best treatment dummies to `test_data` in place.

    Inputs:
        test_data: dataframe with a `treatment_value` column (0 is the
            no-treatment arm) and one dummy column per treatment
        best_treatment: name of the dummy column of the best arm, or a falsy
            value if no best arm was found

    Columns written:
        best_treatment_indicator: 1 if the row received `best_treatment`
        not_best_treatment_indicator: 1 if the row received some other
            treatment (treatment_value != 0) that is not the best one

    With a falsy `best_treatment` nobody received the best arm, so the first
    column is all 0 and the second marks every treated row. Previously this
    case wrote to the global `data` instead of `test_data` and left the
    second column undefined.
    """
    received_any_treatment = test_data['treatment_value'] != 0

    if best_treatment:
        test_data['best_treatment_indicator'] = (test_data[best_treatment] == 1).astype(int)
        test_data['not_best_treatment_indicator'] = (
            (test_data[best_treatment] != 1) * received_any_treatment
        ).astype(int)
    else:
        test_data['best_treatment_indicator'] = 0
        test_data['not_best_treatment_indicator'] = received_any_treatment.astype(int)

# Train models and predict outcomes
def train_and_predict(train_data, test_data, features, random_state):
    """
    Fit one random forest per treatment arm and predict outcomes for test rows.

    For each name in the module-level `treatment_vars`, a forest is trained on
    the training rows that received that treatment (dummy == 1), regressing the
    module-level `outcome_var` on `features`, and then used to predict the
    outcome of every row in `test_data`.

    Inputs:
        train_data: dataframe used for fitting
        test_data: dataframe to predict for
        features: list of feature column names
        random_state: seed passed to every RandomForestRegressor (it was
            previously ignored in favour of a hard-coded 42)

    Returns: dict mapping treatment name -> array of predictions, one per
        row of `test_data`
    """
    predictions = {}

    for treatment in treatment_vars:
        # Assuming binary treatment dummies: keep rows where this arm is active
        treated_data = train_data[train_data[treatment] == 1]
        rf = RandomForestRegressor(n_estimators=100, random_state=random_state)
        rf.fit(treated_data[features], treated_data[outcome_var])
        # Predicted outcome for each test row under this arm
        predictions[treatment] = rf.predict(test_data[features])

    return predictions

# Function to check if the treatment equals 1
def check_treatment(row, column):
    """
    Return 1 if the row received the treatment named in `row[column]`, else 0.

    `row[column]` holds the name of a treatment dummy; the result says
    whether that dummy equals 1 in the same row.
    """
    named_arm = row[column]
    received = row[named_arm] == 1
    return int(received)

# Function to assign the best personalized treatment indicator
def assign_best_personalized_treatment_indicator(test_data, results):
    """
    Add per-row "best personalized treatment" columns to `test_data` in place.

    Inputs:
        test_data: dataframe with a `treatment_value` column (0 is the
            no-treatment arm) and a dummy column for every key of `results`
        results: dict mapping treatment dummy name -> predicted outcome for
            each row of `test_data`, in row order

    Columns written:
        best_personalized_treatment: name of the arm with the highest
            prediction for that row (first arm wins ties)
        best_personalized_treatment_indicator: 1 if the row received that arm
        not_best_personalized_treatment_indicator: 1 if the row received some
            treatment (treatment_value != 0) other than its best arm
    """
    results_df = pd.DataFrame(results, index=test_data.index)

    # Take arm labels from the keys of `results` rather than from a global
    # list, so labels cannot drift out of step with the prediction columns
    arm_names = results_df.columns.to_numpy()
    best_positions = np.argmax(results_df.to_numpy(), axis=1)
    test_data['best_personalized_treatment'] = pd.Series(
        arm_names[best_positions], index=test_data.index
    )

    # For each row, read the dummy of its own best arm (vectorized lookup
    # instead of a row-wise apply)
    dummies = test_data[list(arm_names)].to_numpy()
    received_best = dummies[np.arange(len(test_data)), best_positions] == 1
    test_data['best_personalized_treatment_indicator'] = received_best.astype(int)

    test_data['not_best_personalized_treatment_indicator'] = (
        (test_data['best_personalized_treatment_indicator'] != 1)
        * (test_data['treatment_value'] != 0)
    ).astype(int)


#### Difference in means

In [9]:
# Simple difference in means estimator
# Regress post_test on the treatment dummies only: the intercept is the mean
# of the omitted arm and each coefficient is that arm's difference from it.
treatments_formula = " + ".join(treatment_vars)
formula = f"post_test ~ {treatments_formula}"

# Fit the regression model and save results object
model = sm.OLS.from_formula(formula, data=data).fit()

# Same fit, reported with HC3 robust standard errors
model0_results = model.get_robustcov_results(cov_type="HC3")
model0_results.summary()

0,1,2,3
Dep. Variable:,post_test,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.1457
Date:,"Wed, 01 May 2024",Prob (F-statistic):,0.981
Time:,19:55:52,Log-Likelihood:,-2933.1
No. Observations:,2344,AIC:,5878.0
Df Residuals:,2338,BIC:,5913.0
Df Model:,5,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.6326,0.040,41.070,0.000,1.555,1.711
treatment_1,-0.0193,0.059,-0.328,0.743,-0.135,0.096
treatment_2,-0.0425,0.060,-0.711,0.477,-0.160,0.075
treatment_3,-0.0262,0.057,-0.457,0.647,-0.139,0.086
treatment_4,0.0006,0.059,0.011,0.991,-0.115,0.116
treatment_5,-0.0141,0.060,-0.236,0.813,-0.131,0.103

0,1,2,3
Omnibus:,188.811,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,126.104
Skew:,-0.452,Prob(JB):,4.14e-28
Kurtosis:,2.311,Cond. No.,6.82


In [10]:
# Fit the difference-in-means regression (`formula` from the previous cell)
# separately within each party subsample
model_rep, model_dem, model_ind = (
    sm.OLS.from_formula(formula, data=party_data).fit()
    for party_data in (data_rep, data_dem, data_ind)
)

# Report each fit with HC3 robust standard errors
model0_results_rep, model0_results_dem, model0_results_ind = (
    fitted.get_robustcov_results(cov_type="HC3")
    for fitted in (model_rep, model_dem, model_ind)
)

# Side-by-side table, restricted to the intercept and treatment dummies
print(summary_col(
    [model0_results_rep, model0_results_dem, model0_results_ind],
    stars=True,
    float_format='%0.3f',
    model_names=['Difference-in-means\nRep\n(1)',
                 'Difference-in-means\nDem\n(2)',
                 'Difference-in-means\nInd\n(3)'],
    info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
               'R2': lambda x: "{:.2f}".format(x.rsquared)},
    regressor_order=['Intercept'] + [f"treatment_{i}" for i in range(1, 6)],
    drop_omitted=True,
))


               Difference-in-means Difference-in-means Difference-in-means
                       Rep                 Dem                 Ind        
                       (1)                 (2)                 (3)        
--------------------------------------------------------------------------
Intercept      1.158***            2.033***            1.480***           
               (0.068)             (0.040)             (0.093)            
treatment_1    -0.056              0.064               0.048              
               (0.095)             (0.059)             (0.143)            
treatment_2    -0.107              0.054               -0.009             
               (0.097)             (0.059)             (0.139)            
treatment_3    -0.098              0.016               0.047              
               (0.098)             (0.056)             (0.130)            
treatment_4    -0.081              0.130**             -0.067             
               (0.098)  

In [11]:
# Lin estimator
# Full sample, all covariates
model1_results, model1_formula = lin_estimator_mult_treat_formula(data,
                                                          "post_test",
                                                          treatment_vars,
                                                          covariates)

# Full sample, covariates_pre only. The formula gets its own name so that it
# no longer overwrites model1_formula, which belongs to model1_results.
model1_pre_results, model1_pre_formula = lin_estimator_mult_treat_formula(data,
                                                          "post_test",
                                                          treatment_vars,
                                                          covariates_pre)

# Same covariates_pre specification within each party subsample
# (only the results object is kept, hence the [0])
model1_pre_results_dem = lin_estimator_mult_treat_formula(data_dem,
                                                          "post_test",
                                                          treatment_vars,
                                                          covariates_pre)[0]

model1_pre_results_rep = lin_estimator_mult_treat_formula(data_rep,
                                                          "post_test",
                                                          treatment_vars,
                                                          covariates_pre)[0]

model1_pre_results_ind = lin_estimator_mult_treat_formula(data_ind,
                                                          "post_test",
                                                          treatment_vars,
                                                          covariates_pre)[0]

In [12]:
# Side-by-side table: difference-in-means vs. the two Lin specifications.
# info_dict adds the N and R2 rows; regressor_order with drop_omitted=True
# limits the table to the intercept and the treatment dummies.
print (summary_col([model0_results, model1_results, model1_pre_results],stars=True,float_format='%0.3f',
                  model_names=['Difference-in-means\n(1)','Lin (all covariates)\n(2)','Lin (pre-test only)\n(3)'],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)},
                  regressor_order=['Intercept'] + [f"treatment_{i}" for i in range(1, 6)],
                  drop_omitted=True))


               Difference-in-means Lin (all covariates) Lin (pre-test only)
                       (1)                 (2)                  (3)        
---------------------------------------------------------------------------
Intercept      1.633***            1.565***             1.567***           
               (0.040)             (0.012)              (0.012)            
treatment_1    -0.019              0.069***             0.068***           
               (0.059)             (0.019)              (0.019)            
treatment_2    -0.042              0.037*               0.030              
               (0.060)             (0.019)              (0.019)            
treatment_3    -0.026              0.042**              0.041**            
               (0.057)             (0.020)              (0.019)            
treatment_4    0.001               0.089***             0.079***           
               (0.059)             (0.020)              (0.019)            
treatment_5

#### Separate estimates by party

In [13]:
# Side-by-side table of the covariates_pre Lin fits for each party subsample.
# info_dict adds the N and R2 rows; regressor_order with drop_omitted=True
# limits the table to the intercept and the treatment dummies.
print (summary_col([model1_pre_results_rep, model1_pre_results_dem, model1_pre_results_ind],stars=True,float_format='%0.3f',
                  model_names=['Lin (pre-test only)\nRep\n(1)','Lin (pre-test only)\nDem\n(2)','Lin (pre-test only)\nInd\n(3)'],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)},
                  regressor_order=['Intercept'] + [f"treatment_{i}" for i in range(1, 6)],
                  drop_omitted=True))


               Lin (pre-test only) Lin (pre-test only) Lin (pre-test only)
                       Rep                 Dem                 Ind        
                       (1)                 (2)                 (3)        
--------------------------------------------------------------------------
Intercept      1.028***            2.041***            1.445***           
               (0.015)             (0.020)             (0.026)            
treatment_1    0.069**             0.084***            0.022              
               (0.031)             (0.029)             (0.041)            
treatment_2    0.064**             -0.013              0.084              
               (0.028)             (0.029)             (0.055)            
treatment_3    0.018               0.041               0.082              
               (0.027)             (0.029)             (0.051)            
treatment_4    0.067**             0.098***            0.115***           
               (0.027)  

### Cross-Validation Regression:

In [14]:
# Shuffle data and split into folds
shuffled = data.sample(frac=1, random_state=set_seed)
# Split row positions instead of the frame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. Fold sizes and membership are the same as before.
folds = [shuffled.iloc[positions]
         for positions in np.array_split(np.arange(len(shuffled)), num_folds)]

# Storage for per-fold effects, chosen arms and annotated test folds
all_effects = []
best_treatments = []
scored_folds = []

# Iterate over each fold, using it as the test set, and the others as the training set
for i in range(num_folds):
    # Work on a copy so the indicator columns are not written into a slice
    test_fold = folds[i].copy()
    training_folds = pd.concat([folds[j] for j in range(num_folds) if j != i])

    # Train model on the combined training folds
    training_model = lin_estimator_mult_treat_formula(training_folds, outcome_var, treatment_vars, covariates_pre)[0]
    training_effects = extract_treatment_effects(training_model, treatment_vars)

    # Pick the best arm on the training folds only, then mark who in the
    # held-out fold actually received it
    best_treatment = find_best_treatment(training_effects)
    assign_best_treatment_indicator(test_fold, best_treatment)

    all_effects.append(training_effects)
    best_treatments.append(best_treatment)
    scored_folds.append(test_fold)

# Stack the annotated test folds once, instead of growing a frame in the loop
combined_data = pd.concat(scored_folds)

# Simple difference in means estimator
treatments_formula = " + ".join(['best_treatment_indicator',
                                 'not_best_treatment_indicator'])
formula = f"post_test ~ {treatments_formula}"

# Fit the regression model and save results object
model = sm.OLS.from_formula(formula, data=combined_data).fit()

model3_results = model.get_robustcov_results(cov_type="HC3")

model3_pre_results = lin_estimator_mult_treat_formula(combined_data,
                                                  outcome_var,
                                                  # we include `not_best_treatment_indicator` so that the
                                                  # `best_treatment_indicator` is compared to control only
                                                  ['best_treatment_indicator',
                                                   'not_best_treatment_indicator'],
                                                  covariates_pre)[0]

  return bound(*args, **kwds)


#### Best treatments

In [15]:
# Best fixed arm picked on the training folds, one entry per held-out fold
best_treatments

['treatment_4', 'treatment_4', 'treatment_4', 'treatment_4', 'treatment_4']

### Random Forest:

In [16]:
# Shuffle data and split into folds
# NOTE: this cell reads `combined_data` (so the best-fixed-arm columns carry
# over) and then overwrites it, so re-running the cell on its own reshuffles
# its previous output. Re-run the cross-validation regression cell first.
shuffled = combined_data.sample(frac=1, random_state=set_seed).copy()
# Split row positions instead of the frame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. Fold sizes and membership are the same as before.
folds = [shuffled.iloc[positions]
         for positions in np.array_split(np.arange(len(shuffled)), num_folds)]

features = covariates

# Annotated test folds. `best_treatments` and `all_effects` from the
# regression cell are no longer reset here, since this cell never uses them.
scored_folds = []

# Iterate over each fold, using it as the test set, and the others as the training set
for i in range(num_folds):
    # Work on a copy so the new columns are not written into a slice
    test_fold = folds[i].copy()
    training_folds = pd.concat([folds[j] for j in range(num_folds) if j != i])

    # Train one forest per arm on the training folds, predict on the test fold,
    # and mark who actually received their predicted-best arm
    test_results = train_and_predict(training_folds, test_fold, features, set_seed)
    assign_best_personalized_treatment_indicator(test_fold, test_results)

    # Fold-membership dummies: fold_i is 1 for this test fold, the rest 0
    for fold_number in range(num_folds):
        test_fold[f'fold_{fold_number}'] = int(fold_number == i)

    scored_folds.append(test_fold)

# Stack the annotated test folds once, instead of growing a frame in the loop
combined_data = pd.concat(scored_folds)

# Simple difference in means estimator
treatments_formula = " + ".join(['best_personalized_treatment_indicator',
                                 'not_best_personalized_treatment_indicator'])
formula = f"post_test ~ {treatments_formula}"

# Fit the regression model and save results object
model = sm.OLS.from_formula(formula, data=combined_data).fit()

model4_results = model.get_robustcov_results(cov_type="HC3")

model4_pre_results = lin_estimator_mult_treat_formula(combined_data,
                                                  outcome_var,
                                                  # we include `not_best_personalized_treatment_indicator` so that the
                                                  # `best_personalized_treatment_indicator` is compared to control only
                                                  ['best_personalized_treatment_indicator',
                                                   'not_best_personalized_treatment_indicator'],
                                                  covariates_pre)[0]

  return bound(*args, **kwds)


In [17]:
# Side-by-side table: best fixed arm vs. best personalized arm (Lin fits with
# covariates_pre). regressor_order with drop_omitted=True shows only the
# intercept and the two "best" indicators; the "not best" ones are hidden.
print (summary_col([model3_pre_results, model4_pre_results],stars=True,float_format='%0.3f',
                  model_names=['Best fixed arm\n(1)', 'Best personalized arm\n(2)'],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)},
                  regressor_order=['Intercept', 'best_treatment_indicator', 'best_personalized_treatment_indicator'],
                  drop_omitted=True))


                                      Best fixed arm Best personalized arm
                                           (1)                (2)         
--------------------------------------------------------------------------
Intercept                             1.567***       1.567***             
                                      (0.012)        (0.012)              
best_treatment_indicator              0.079***                            
                                      (0.019)                             
best_personalized_treatment_indicator                0.073***             
                                                     (0.020)              
R-squared                             0.897          0.896                
R-squared Adj.                        0.896          0.896                
N                                     2344           2344                 
R2                                    0.90           0.90                 
Standard errors in paren

In [18]:
# check that approximately 1/6 of people were assigned the best treatment and best personalized treatment
# Share of subjects whose own arm was the best fixed arm chosen for their fold
combined_data['best_treatment_indicator'].mean()

0.17534129692832764

In [19]:
# Share of subjects whose own arm was their predicted-best personalized arm
combined_data['best_personalized_treatment_indicator'].mean()

0.16936860068259385

In [20]:
# Reference value: one arm out of the six conditions in `treatments`
1/6

0.16666666666666666