In [None]:
import sys
sys.path.append('../..')
import os

from functools import partial
from tqdm import tqdm
import datetime

import pandas as pd
import numpy as np

from src.data.STAR.star import STARDataset

# Directory that receives the experiment CSVs. `exist_ok=True` makes the call
# idempotent on re-runs and avoids the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs('star_results', exist_ok=True)

# Location of the STAR students CSV (a relative path, so it is resolved
# against the kernel's working directory).
path_star_dataset = "../../src/data/STAR/STAR_Students.csv"

import warnings

# Suppress only the FutureWarning whose text matches the rename notice below
# (presumably emitted by scikit-learn -- confirm); every other warning is
# still shown.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    message="'force_all_finite' was renamed to 'ensure_all_finite'",
)


In [None]:
from sklearn.linear_model import (
    LogisticRegressionCV,
    RidgeCV,
)
from sklearn.ensemble import HistGradientBoostingRegressor
from src.randomization_aware.combine_cate import CATECombiner
from src.randomization_aware.learners import (
    DRLearner,
    QuasiOptimizedLearner,
)
from src.baselines.asaiee import AsaieeCATE
from src.baselines.ksp import KSPCATE
from src.baselines.pooling import TLearnerPooling
from src.baselines.trial_only import TrialCATE
from econml.metalearners import TLearner


# Number of cross-fitting folds passed to every learner that accepts
# `crossfit_folds`.
crossfit_folds = 2


# Regularisation grid for RidgeCV: 25 log-spaced values from 1e-2 to 1e2.
alphas = np.logspace(-2, 2, 25)


# The factories below are plain functions rather than lambdas bound to names
# (PEP 8): they get a real __name__, a docstring, and can be pickled.
# Each call returns a NEW, unfitted estimator so that learners never share
# fitted state.


def regressor_cate():
    """Return a new, unfitted ``RidgeCV`` searching over ``alphas``."""
    return RidgeCV(alphas=alphas)


def regressor_outcome():
    """Return a new, unfitted ``HistGradientBoostingRegressor``.

    Hyper-parameters are fixed: ``max_depth=3``, ``min_samples_leaf=5``,
    ``max_iter=100``.
    """
    return HistGradientBoostingRegressor(max_depth=3, min_samples_leaf=5, max_iter=100)


def study_classifier():
    """Return a new, unfitted ``LogisticRegressionCV`` (liblinear, 2-fold CV).

    ``Cs`` is given as the reciprocals of the penalties 0.1, 1, 10 and 100.
    """
    return LogisticRegressionCV(
        max_iter=1000, Cs=[1 / a for a in [0.1, 1.0, 10.0, 100.0]], cv=2, solver="liblinear"
    )


def cate_estimator_tlearner():
    """Return a new econml ``TLearner`` whose models come from ``regressor_outcome()``."""
    return TLearner(models=regressor_outcome())


def get_drlearner_star(propensity_score):
    """Build a ``DRLearner`` for the STAR experiments.

    The learner receives the given ``propensity_score``, a fresh
    ``regressor_cate()`` model, two independent ``regressor_outcome()`` models
    (control and treated), and the module-level ``crossfit_folds``.
    """
    learner_kwargs = dict(
        propensity_score=propensity_score,
        regressor_cate=regressor_cate(),
        regressor_control=regressor_outcome(),
        regressor_treated=regressor_outcome(),
        crossfit_folds=crossfit_folds,
    )
    return DRLearner(**learner_kwargs)


def get_quasioptimized_star(propensity_score):
    """Build a ``QuasiOptimizedLearner`` for the STAR experiments.

    Uses the given ``propensity_score`` together with freshly constructed
    CATE, control-outcome, treated-outcome and study-classifier models, and
    the module-level ``crossfit_folds``.
    """
    cate_model = regressor_cate()
    control_model = regressor_outcome()
    treated_model = regressor_outcome()
    classifier = study_classifier()
    return QuasiOptimizedLearner(
        propensity_score=propensity_score,
        regressor_cate=cate_model,
        regressor_control=control_model,
        regressor_treated=treated_model,
        study_classifier=classifier,
        crossfit_folds=crossfit_folds,
    )


def get_quasioptimized_star_unweighted(propensity_score):
    """Build a ``QuasiOptimizedLearner`` with ``remove_study_weighting=True``.

    All other constructor arguments are the same as those used by
    ``get_quasioptimized_star``: fresh CATE / outcome / study-classifier
    models and the module-level ``crossfit_folds``.
    """
    learner_kwargs = {
        "propensity_score": propensity_score,
        "regressor_cate": regressor_cate(),
        "regressor_control": regressor_outcome(),
        "regressor_treated": regressor_outcome(),
        "study_classifier": study_classifier(),
        "crossfit_folds": crossfit_folds,
        "remove_study_weighting": True,
    }
    return QuasiOptimizedLearner(**learner_kwargs)


def get_combined_star(propensity_score, n_splits_cv=10):
    """Build a ``CATECombiner`` over a DR-learner and a quasi-optimized learner.

    Both component learners are created with the same ``propensity_score``;
    ``n_splits_cv`` is forwarded unchanged to ``CATECombiner``.
    """
    dr_learner = get_drlearner_star(propensity_score)
    quasi_learner = get_quasioptimized_star(propensity_score)
    return CATECombiner(
        propensity_score=propensity_score,
        cate_learner_1=dr_learner,
        cate_learner_2=quasi_learner,
        n_splits_cv=n_splits_cv,
    )


def get_asaiee_star(propensity_score):
    """Build the ``AsaieeCATE`` baseline for the STAR experiments.

    Configured with the given ``propensity_score``, a fresh
    ``regressor_cate()`` model, separate ``regressor_outcome()`` models for
    the control and treated arms, and the module-level ``crossfit_folds``.
    """
    cate_model = regressor_cate()
    control_model = regressor_outcome()
    treated_model = regressor_outcome()
    return AsaieeCATE(
        propensity_score=propensity_score,
        regressor_cate=cate_model,
        regressor_control=control_model,
        regressor_treated=treated_model,
        crossfit_folds=crossfit_folds,
    )


def get_ksp_star(propensity_score):
    """Build the ``KSPCATE`` baseline for the STAR experiments.

    ``propensity_score`` is passed as the first positional argument; the CATE
    estimator is a fresh T-learner and the bias-correction model is a new
    ``RidgeCV`` over the module-level ``alphas``.
    """
    base_cate_estimator = cate_estimator_tlearner()
    bias_model = RidgeCV(alphas=alphas)
    return KSPCATE(
        propensity_score,
        cate_estimator=base_cate_estimator,
        bias_correction_model=bias_model,
    )


def get_pooling_star(propensity_score=None):
    """Build the ``TLearnerPooling`` baseline.

    ``propensity_score`` is accepted but never used, so this factory can be
    called with the same single argument as the other ``get_*_star``
    factories.
    """
    pooling_kwargs = dict(
        regressor_control=regressor_outcome(),
        regressor_treated=regressor_outcome(),
        study_classifier=study_classifier(),
    )
    return TLearnerPooling(**pooling_kwargs)


def get_tlearner_star(propensity_score=None):
    """Build the ``TrialCATE`` baseline wrapping a fresh T-learner.

    ``propensity_score`` is accepted but never used; it only keeps the call
    signature in line with the other ``get_*_star`` factories.
    """
    trial_cate_estimator = cate_estimator_tlearner()
    return TrialCATE(cate_estimator=trial_cate_estimator)



In [None]:
def _largest_feasible_size(try_sample, label, candidate_sizes):
    """Return the last size in ``candidate_sizes`` for which ``try_sample`` succeeds.

    Sizes are tried in order. The scan stops at the first size whose call
    raises; that error is printed using ``label`` as the variable name.
    Returns 0 when even the first candidate fails.
    """
    largest = 0
    for size in candidate_sizes:
        try:
            try_sample(size)
        except Exception as e:
            # Any failure is treated as "this size cannot be sampled"; the
            # exception type raised by the dataset is not known here.
            print(f"Error for {label}={size}: {e}")
            break
        largest = size
    return largest


def get_max_sample_sizes_independent(
    target_fraction_of_main_loc,
    eval_fraction_of_target,
    target_label,
    dgp=None,
    step=10,
    max_size=10000,
):
    """Probe the largest ``n1`` and ``n0`` that ``dgp.sample`` accepts.

    Each size is probed independently while the other is held at 1, so the
    two maxima are not guaranteed to be attainable simultaneously.

    Parameters
    ----------
    target_fraction_of_main_loc, eval_fraction_of_target
        Forwarded unchanged as keyword arguments to ``dgp.sample``.
    target_label
        Passed to ``STARDataset`` when ``dgp`` is not supplied.
    dgp : optional
        Object exposing ``sample(n1, n0, target_fraction_of_main_loc=...,
        eval_fraction_of_target=...)``. Defaults to a new ``STARDataset``
        built from ``path_star_dataset``.
    step : int, default 10
        Increment between successive candidate sizes.
    max_size : int, default 10000
        Exclusive upper bound on the candidate sizes.

    Returns
    -------
    tuple
        ``(max_n1, max_n0)``: the largest candidate of the form
        ``1 + k * step`` that sampled without error (0 if none did).
    """
    if dgp is None:
        dgp = STARDataset(path_star_dataset, target_label=target_label)

    candidate_sizes = range(1, max_size, step)
    sample_kwargs = dict(
        target_fraction_of_main_loc=target_fraction_of_main_loc,
        eval_fraction_of_target=eval_fraction_of_target,
    )

    max_n1 = _largest_feasible_size(
        lambda n1: dgp.sample(n1, 1, **sample_kwargs), "n1", candidate_sizes
    )
    max_n0 = _largest_feasible_size(
        lambda n0: dgp.sample(1, n0, **sample_kwargs), "n0", candidate_sizes
    )
    return max_n1, max_n0


# Settings for the sample-size probe. Note that `eval_fraction_of_target` is
# also read later by the experiment loop, so this cell must run before it.
target_fraction_of_main_loc = 1.0
eval_fraction_of_target = 0.5
get_max_sample_sizes_independent(
    target_fraction_of_main_loc=target_fraction_of_main_loc,
    eval_fraction_of_target=eval_fraction_of_target,
    target_label="urban",
)

In [None]:
def _covariates_without(dropped_covar, full_covar_list):
    """Return the covariates of ``full_covar_list`` that remain after dropping.

    ``dropped_covar`` may be ``None`` (keep everything), a single column name,
    or a list of column names. Raises ``ValueError`` if any requested name is
    not in ``full_covar_list``.
    """
    if dropped_covar is None:
        return list(full_covar_list)
    is_list = isinstance(dropped_covar, list)
    to_drop = dropped_covar if is_list else [dropped_covar]
    missing = [covar for covar in to_drop if covar not in full_covar_list]
    if missing:
        # A list request reports the list of unknown names; a scalar request
        # reports the offending name itself.
        raise ValueError(
            f"{missing if is_list else dropped_covar} not found in full_covar_list"
        )
    return [covar for covar in full_covar_list if covar not in to_drop]


def _score_predictions(predictions, ground_truth):
    """Compute the per-run metrics stored in the results table.

    Raises ``ValueError`` when the two arrays differ in shape. An explicit
    exception is used instead of ``assert`` so the recorded error message is
    never empty and the check survives ``python -O``.
    """
    if predictions.shape != ground_truth.shape:
        raise ValueError(
            f"predictions have shape {predictions.shape}, "
            f"expected {ground_truth.shape}"
        )
    errors = ground_truth - predictions
    return {
        "rmse": np.sqrt(np.mean(errors ** 2)),
        # NOTE(review): despite its name this is the mean absolute error,
        # not |mean error| -- confirm which one is intended.
        "abs_bias": np.mean(np.abs(errors)),
        # NOTE(review): variance of the predictions across evaluation
        # points, not the estimator's variance across repetitions.
        "var": np.var(predictions),
    }


# Display name -> factory. Every factory is called with the RCT propensity
# score and must return an object exposing fit(X, S, A, Y) and predict(X).
methods = {
    "DR-learner": get_drlearner_star,
    "QR-learner": get_quasioptimized_star,
    "QR-learner (unweighted)": get_quasioptimized_star_unweighted,
    "Combined learner": partial(get_combined_star, n_splits_cv=10),
    "Asaiee et al. (2023)": get_asaiee_star,
    "Kallus et al. (2018)": get_ksp_star,
    "Pooled T-learner": get_pooling_star,
    "T-learner": get_tlearner_star,
}

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"timestamp: {timestamp}")
iterations = 100
n1_list = [100, 200, 300, 400, 500, 600, 700]
n0_list = [1000]
fraction_rural_list = [1.0]
target_label = "rural"
# Each entry is None (drop nothing), one column name, or a list of names.
covar_to_drop = [
    "g1surban",
]
rows = []

full_covar_list = [
    "g1surban",
    "gender",
    "race",
    "birthmonth",
    "birthday",
    "birthyear",
    "gkfreelunch",
    "g1tchid",
    "g1freelunch",
]

results_path = f"star_results/experiment_{timestamp}.csv"

for dropped_covar in covar_to_drop:
    covar_list = _covariates_without(dropped_covar, full_covar_list)
    dropped_label = dropped_covar if dropped_covar is not None else "None dropped"

    dgp = STARDataset(
        path_star_dataset,
        cat_covar_columns=covar_list,
        target_label=target_label,
    )
    propensity_score_rct = dgp.get_propensity_score()

    for fraction_rural in fraction_rural_list:
        for n1 in n1_list:
            for n0 in n0_list:
                print(f"dropped_covar = {dropped_covar}, n1 = {n1}; n0 = {n0}")
                for i in tqdm(range(iterations)):
                    # One fresh draw per repetition, shared by all methods.
                    # `eval_fraction_of_target` is defined in an earlier cell.
                    X_train, S_train, A_train, Y_train, X_eval, gt_adjusted_ite_eval = (
                        dgp.sample(
                            n1,
                            n0,
                            target_fraction_of_main_loc=fraction_rural,
                            eval_fraction_of_target=eval_fraction_of_target,
                        )
                    )

                    for method_name, method_func in methods.items():
                        # Identification columns common to success and
                        # failure rows.
                        row = {
                            "i": i,
                            "n1": n1,
                            "n0": n0,
                            "dropped_covar": dropped_label,
                            "fraction_rural": fraction_rural,
                            "target_label": target_label,
                            "method": method_name,
                        }

                        estimator = method_func(propensity_score_rct)

                        try:
                            estimator.fit(X_train, S_train, A_train, Y_train)
                            predictions = estimator.predict(X_eval)
                            row.update(
                                _score_predictions(predictions, gt_adjusted_ite_eval)
                            )
                        except Exception as e:
                            # Best effort: a failing method is recorded and
                            # the sweep continues with the next method.
                            row["error"] = str(e)

                        rows.append(row)

                    # Checkpoint once per repetition. Rewriting the whole
                    # table after every single method produced the same final
                    # file at several times the I/O cost.
                    results_df = pd.DataFrame(rows)
                    results_df.to_csv(results_path, index=False)

In [None]:
from pybnesian import RCoT, LinearCorrelation

# Rebuild the dataset with "g1surban" left out of the covariates
# (target label: rural).
dgp = STARDataset(
    path_star_dataset,
    target_label='rural',
    cat_covar_columns=[
        "gender",
        "race",
        "birthmonth",
        "birthday",
        "birthyear",
        "gkfreelunch",
        "g1tchid",
        "g1freelunch",
    ],
)

# Draw one large training sample; the two evaluation outputs are discarded.
X_train, S_train, A_train, Y_train, _, _ = dgp.sample(
    2800, 1400, target_fraction_of_main_loc=1.0, eval_fraction_of_target=0.001
)

# Assemble one table with columns A, S, Y, X_0, X_1, ...
X_names = [f"X_{i}" for i in range(X_train.shape[1])]
A_df = pd.DataFrame(A_train, columns=['A'])
S_df = pd.DataFrame(S_train, columns=['S'])
Y_df = pd.DataFrame(Y_train, columns=['Y'])
X_df = pd.DataFrame(X_train, columns=X_names)
data_df = pd.concat([A_df, S_df, Y_df, X_df], axis=1)

# Both tests ask for the p-value of Y vs. S given all covariates and A.
conditioning_set = X_names + ['A']

test1 = RCoT(data_df)
print(test1.pvalue('Y', 'S', conditioning_set))

test2 = LinearCorrelation(data_df)
print(test2.pvalue('Y', 'S', conditioning_set))
