### Prelude:

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import statsmodels.api as sm
set_seed = 42

In [2]:
data = pd.read_csv("./../Datasets/cleaned_data.csv")

In [3]:
#from Tiffanie's code
# distribution of subjects across treatment conditions (like Table 1 from paper)
# N = 186
treatment_freq = data[["treatment_value", "treatment_frame"]].value_counts()
treatment_rel_freq = data["treatment_frame"].value_counts(normalize=True)
treatment_freq.to_frame().sort_index().join(treatment_rel_freq)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,proportion
treatment_value,treatment_frame,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No framing,22,0.11828
1,Positive science,22,0.11828
2,Negative science,18,0.096774
3,Religious,35,0.188172
4,Equity,30,0.16129
5,Efficiency,32,0.172043
6,Secular,27,0.145161


### Difference of Means Tables:

In [4]:
pd.pivot_table(data, values=['mean_climate_support_before', 'mean_climate_support_after', 'mean_climate_support_change'],
               index=['treatment_value','treatment_frame'],
               aggfunc=['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,mean_climate_support_after,mean_climate_support_before,mean_climate_support_change
treatment_value,treatment_frame,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,No framing,1.590909,1.545455,0.045455
1,Positive science,1.795455,1.670455,0.125
2,Negative science,1.972222,1.763889,0.208333
3,Religious,1.857143,1.75,0.107143
4,Equity,1.916667,1.866667,0.05
5,Efficiency,1.953125,1.914062,0.039062
6,Secular,1.990741,1.87963,0.111111


In [5]:
pd.pivot_table(data, values=["mean_climate_support_after", 'mean_climate_support_before', 'mean_climate_support_change'],
               index=["party_id", "treatment_frame"], aggfunc=['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,mean_climate_support_after,mean_climate_support_before,mean_climate_support_change
party_id,treatment_frame,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Democrat,Efficiency,2.333333,2.270833,0.0625
Democrat,Equity,1.946429,1.946429,0.0
Democrat,Negative science,2.55,2.2,0.35
Democrat,No framing,1.725,1.6,0.125
Democrat,Positive science,2.34375,2.1875,0.15625
Democrat,Religious,2.026316,1.947368,0.078947
Democrat,Secular,1.980769,1.865385,0.115385
Independent,Efficiency,1.946429,1.803571,0.142857
Independent,Equity,1.9,1.9,0.0
Independent,Negative science,2.0625,2.0,0.0625


### Linear Regression:

In [6]:
def lin_estimator_mult_treat_formula(data, y_var, treatment_vars, covariates):
    """
    Inputs:
        data: pandas dataframe containing all x and y columns
        y_var: name of y variable
        treatment_vars: 
        covariates: list of string names of covariate

    Returns: Lin estimator model, formula
    """
    # Demean the covariates
    for cov in covariates:
        data[cov + '_demeaned'] = data[cov].dropna() - data[cov].dropna().mean()

    # Define the regression formula
    # Include each treatment indicator
    treatments_formula = " + ".join(treatment_vars)

    # Include each interaction term (automatically includes individual covariates)
    interactions = []
    for treatment in treatment_vars:
        for cov in covariates:
            interactions.append(f"{cov+ '_demeaned'} * {treatment}")
    
    interactions_formula = " + ".join(interactions)

    # Full formula -- include any other control(s)
    formula = f"{y_var} ~ {treatments_formula} + {interactions_formula}"

    # Fit the regression model and save results object
    model = sm.OLS.from_formula(formula, data=data).fit()

    # Return results object with robust covariance type
    return model.get_robustcov_results(cov_type="HC3"), formula

def lin_estimator_mult_treat(data, y_var, treatment_vars, covariates):
    """
    Inputs:
        data: pandas dataframe containing all x and y columns
        y_var: name of y variable
        treatment_vars: 
        covariates: list of string names of covariate

    Returns: Lin estimator model NO FORMULA
    """
    # Demean the covariates
    for cov in covariates:
        data[cov + '_demeaned'] = data[cov].dropna() - data[cov].dropna().mean()

    # Define the regression formula
    # Include each treatment indicator
    treatments_formula = " + ".join(treatment_vars)

    # Include each interaction term (automatically includes individual covariates)
    interactions = []
    for treatment in treatment_vars:
        for cov in covariates:
            interactions.append(f"{cov+ '_demeaned'} * {treatment}")
    
    interactions_formula = " + ".join(interactions)

    # Full formula -- include any other control(s)
    formula = f"{y_var} ~ {treatments_formula} + {interactions_formula}"

    # Fit the regression model and save results object
    model = sm.OLS.from_formula(formula, data=data).fit()

    # Return results object with robust covariance type
    return model.get_robustcov_results(cov_type="HC3")

In [7]:
# method 2: "party" -- group by Democrat/Republican-leaning, then include or exclude pure Independents/no preference
data.loc[(data["party_id"] == "Democrat") | (data["QID74"] == 2), "party"] = "D"
data.loc[(data["party_id"] == "Republican") | (data["QID74"] == 4), "party"] = "R"
data.loc[(data["QID74"] == 3), "party"] = "I"

# party ID
party_id = data.loc[:, "party_id.1"]
data["party_cov"] = party_id.fillna(data["party_id.2"]).fillna(data["QID74"])
data.party_cov = data.party_cov.astype(int)

In [8]:
# unsure if I am encoding variables for treatment conditions correctly
# for now creating indicator variable per treatment condition
treat_data = pd.get_dummies(data, columns=["treatment_value"])
treat_data["pre_avg_policy_support"] = treat_data[['GasTax', 'CarbTax', 'Treaty',
                                                   'RegCarb']].mean(axis=1)



In [9]:
#NOT SURE IF THESE ARE RIGHT (ESPECIALLY THE TREATMENT VALUES) - NEED TO CHECK
# IF YOU EDIT THESE IT WILL EDIT INPUT FOR ALL REGRESSIONS
outcome_var = 'mean_climate_support_after'
covariates = ['Religiosity', 'Economic_Reasoning', 'ScientificConfidence', 'party_cov', 'pre_avg_policy_support']
treatment_vars = [f"treatment_value_{i}" for i in range(1, 7)]
control_var = 'mean_climate_support_before'

In [10]:
# first pass at using Lin estimator for regression
model1_results, model1_formula = lin_estimator_mult_treat_formula(treat_data,
                                                          "mean_climate_support_after",
                                                          treatment_vars,
                                                          covariates)
model1_results.summary()

0,1,2,3
Dep. Variable:,mean_climate_support_after,R-squared:,0.885
Model:,OLS,Adj. R-squared:,0.852
Method:,Least Squares,F-statistic:,50.47
Date:,"Fri, 26 Apr 2024",Prob (F-statistic):,8.54e-67
Time:,13:32:21,Log-Likelihood:,-9.2106
No. Observations:,185,AIC:,102.4
Df Residuals:,143,BIC:,237.7
Df Model:,41,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.7840,0.406,4.390,0.000,0.981,2.587
treatment_value_1[T.True],0.1177,0.410,0.287,0.774,-0.692,0.928
treatment_value_2[T.True],0.2162,0.423,0.511,0.610,-0.620,1.053
treatment_value_3[T.True],0.1228,0.412,0.298,0.766,-0.692,0.938
treatment_value_4[T.True],0.0434,0.409,0.106,0.916,-0.766,0.852
treatment_value_5[T.True],0.0423,0.413,0.102,0.919,-0.773,0.858
treatment_value_6[T.True],0.0431,0.444,0.097,0.923,-0.834,0.921
Religiosity_demeaned,0.1098,0.041,2.668,0.009,0.028,0.191
Religiosity_demeaned:treatment_value_1[T.True],-0.1898,0.082,-2.324,0.022,-0.351,-0.028

0,1,2,3
Omnibus:,18.533,Durbin-Watson:,2.247
Prob(Omnibus):,0.0,Jarque-Bera (JB):,71.178
Skew:,-0.075,Prob(JB):,3.5e-16
Kurtosis:,6.035,Cond. No.,36.6


### Cross-Validation Regression:

In [11]:
#Randomly split data into two folds
shuffled = treat_data.sample(frac=1, random_state=set_seed)
split = np.array_split(shuffled, 2)
fold1 = split[0]
fold2 = split[1]

  return bound(*args, **kwds)


In [12]:
# Function to extract treatment effects from model
def extract_treatment_effects(model, treatment_vars):
    coefs = dict(zip(model.model.exog_names, model.params))
    effects = {var: coefs.get(var, 0) if coefs.get(var) is not None else 0 for var in treatment_vars}
    return effects

# Function to find the best treatment
def find_best_treatment(effects):
    if effects:
        return max(effects, key=effects.get)
    return None

# Function to assign the best treatment indicator
def assign_best_treatment_indicator(data, best_treatment):
    if best_treatment:
        data['best_treatment_indicator'] = (data[best_treatment] == 1).astype(int)
    else:
        data['best_treatment_indicator'] = 0

# Function to estimate the effect of the best treatment condition
def estimate_best_treatment_effect(data, outcome, indicator, covariates):
    formula = f"{outcome} ~ {indicator} + {' + '.join(covariates)}"
    model = sm.OLS.from_formula(formula, data=data).fit()
    return model.summary()


In [13]:
# Example usage
effects_fold1 = extract_treatment_effects(lin_estimator_mult_treat(fold1, outcome_var, treatment_vars, covariates), treatment_vars)
effects_fold2 = extract_treatment_effects(lin_estimator_mult_treat(fold2, outcome_var, treatment_vars, covariates), treatment_vars)

best_treatment_fold1 = find_best_treatment(effects_fold1)
best_treatment_fold2 = find_best_treatment(effects_fold2)

assign_best_treatment_indicator(fold1, best_treatment_fold2)
assign_best_treatment_indicator(fold2, best_treatment_fold1)

combined_data = pd.concat([fold1, fold2])
final_result = estimate_best_treatment_effect(combined_data, outcome_var, 'best_treatment_indicator', covariates)

print(final_result)


                                OLS Regression Results                                
Dep. Variable:     mean_climate_support_after   R-squared:                       0.843
Model:                                    OLS   Adj. R-squared:                  0.838
Method:                         Least Squares   F-statistic:                     159.6
Date:                        Fri, 26 Apr 2024   Prob (F-statistic):           6.88e-69
Time:                                13:32:21   Log-Likelihood:                -38.045
No. Observations:                         185   AIC:                             90.09
Df Residuals:                             178   BIC:                             112.6
Df Model:                                   6                                         
Covariance Type:                    nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

  self.het_scale = (self.wresid / (1 - h))**2


### Random Forest:

In [14]:
features = covariates + treatment_vars
folds = [fold1, fold2]

# Train models and predict outcomes
def train_and_predict(train_data, test_data):
    predictions = {}

    # Train a model for each treatment condition and predict for the test data
    for treatment in treatment_vars:
        # Assuming binary treatment, filter data where treatment is active
        treated_data = train_data[train_data[treatment] == 1]
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(treated_data[covariates], treated_data[outcome_var])
        # Store predictions for each treatment
        predictions[treatment] = rf.predict(test_data[covariates])

    return predictions

# Analyze both folds
results_fold1 = train_and_predict(fold2, fold1)  # Train on fold2, predict for fold1
results_fold2 = train_and_predict(fold1, fold2)  # Train on fold1, predict for fold2

def assign_best_condition(test_data, results):
    # Create a DataFrame from the results with appropriate indexing
    results_df = pd.DataFrame(results, index=test_data.index)
    # Use np.argmax on axis=1 to find the indices of maximum values along the horizontal axis
    best_treatment_indices = np.argmax(results_df.values, axis=1)
    # Convert indices to a Series to use the map function
    best_treatment_series = pd.Series(best_treatment_indices, index=test_data.index)
    # Map indices to treatment variable names
    best_treatment = best_treatment_series.map(dict(enumerate(treatment_vars)))
    test_data['best_personalized_treatment'] = best_treatment
    test_data['best_treatment_indicator'] = 1  # This assumes the personalized treatment is applied


# Assign best treatment based on opposite fold predictions
assign_best_condition(fold1, results_fold2)
assign_best_condition(fold2, results_fold1)

# Estimate the effect of the best personalized condition
def estimate_personalized_effect(data, treatment_indicator, outcome, covariates):
    formula = f"{outcome} ~ {treatment_indicator} + {' + '.join(covariates)}"
    model = sm.OLS.from_formula(formula, data=data).fit()
    return model.summary()

# Combining data for final analysis
combined_data = pd.concat([fold1, fold2])
effect_summary = estimate_personalized_effect(combined_data, 'best_treatment_indicator', outcome_var, covariates)
print(effect_summary)

                                OLS Regression Results                                
Dep. Variable:     mean_climate_support_after   R-squared:                       0.843
Model:                                    OLS   Adj. R-squared:                  0.839
Method:                         Least Squares   F-statistic:                     192.2
Date:                        Fri, 26 Apr 2024   Prob (F-statistic):           5.42e-70
Time:                                13:32:22   Log-Likelihood:                -38.192
No. Observations:                         185   AIC:                             88.38
Df Residuals:                             179   BIC:                             107.7
Df Model:                                   5                                         
Covariance Type:                    nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------