In [1]:
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

from protosc.parallel import execute_parallel

from nudging.model import BiRegressor, MonoRegressor, XRegressor
from nudging.cate import get_cate_correlations, get_cate_top_performance
from nudging.simulation import generate_multi_dataset

In [2]:
np.random.seed(12390845)
datasets = generate_multi_dataset(200, linear=False)

In [3]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, SGDRegressor, ElasticNet
from sklearn.linear_model import ARDRegression, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
regressors = {
    "gauss_process": GaussianProcessRegressor,
    "ridge": Ridge,
    "linear": LinearRegression,
    "sgd": SGDRegressor,
    "elasticnet": ElasticNet,
    "ard": ARDRegression,
    "bayesian_ridge": BayesianRidge,
    "knn": KNeighborsRegressor,
    "mlp": MLPRegressor,
    "svm": SVR,
    "decision_tree": DecisionTreeRegressor,
    "extra_tree": ExtraTreeRegressor,
}
    

In [4]:
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning


In [5]:
def plot_cate(data, model):
    X, truth = data
    cate_estimate = get_cate(X, model)
    plt.scatter(truth["cate"], cate_estimate)
    plt.show()
    print(stats.pearsonr(truth["cate"], cate_estimate))

In [6]:
def compute_performance(model, data):
    return get_cate_top_performance(model, data)


In [7]:
@ignore_warnings(category=ConvergenceWarning)
def compute_learners(i_data, name):
    regressor = regressors[name]
    cur_data = datasets[i_data]
    slearn_result = get_cate_top_performance(MonoRegressor(regressor()), cur_data)
    tlearn_result = get_cate_top_performance(BiRegressor(regressor()), cur_data)
    xlearn_result = get_cate_top_performance(XRegressor(regressor()), cur_data)
    return name, slearn_result, tlearn_result, xlearn_result

In [8]:
jobs = []
for name in regressors:
    jobs.extend([{"i_data": i_data, "name": name} for i_data in range(len(datasets))])

In [9]:
results = execute_parallel(jobs, compute_learners, progress_bar=True, n_jobs=1)

100%|██████████| 120/120 [18:18<00:00,  9.16s/it]


In [10]:
def get_rank(value, i_data):
    all_values = []
    for res in results[i_data::len(datasets)]:
        all_values.extend(res[1:])
    return np.mean(np.where(-np.sort(-np.array(all_values)) == value)[0])

def plot_results(name):
    sub_res = [x for x in results if x[0] == name]
    labels = ["s-learner", "t-learner", "x-learner"]
    for i in range(1, 4):
        r = [x[i] for x in sub_res]
        plt.scatter(np.arange(len(r)), r, label=labels[i-1])
    plt.title(name)
    plt.legend()
    plt.show()
    
def compute_avg_rank(name):
    ranks = np.zeros(3)
    sub_res = [x for x in results if x[0] == name]
    labels = ["s-learner", "t-learner", "x-learner"]
    for i in range(1, 4):
        for i_data in range(len(datasets)):
            perf = sub_res[i_data][i]
            ranks[i-1] += get_rank(perf, i_data)
    return ranks/len(datasets)
    
#for name in regressors:
#    plot_results(name)

In [11]:
rank_res = []
labels = ["s-learner", "t-learner", "x-learner"]

for name in regressors:
    avg_rank = compute_avg_rank(name)
    rank_res.extend([(name+f" ({labels[i]})", avg_rank[i]) for i in range(len(avg_rank))])
sorted(rank_res, key=lambda x: x[1])

[('linear (t-learner)', 11.9),
 ('svm (s-learner)', 12.2),
 ('ridge (x-learner)', 13.2),
 ('ridge (t-learner)', 13.6),
 ('svm (x-learner)', 13.6),
 ('extra_tree (t-learner)', 14.2),
 ('svm (t-learner)', 14.3),
 ('ard (x-learner)', 14.8),
 ('mlp (t-learner)', 14.8),
 ('decision_tree (s-learner)', 14.9),
 ('mlp (x-learner)', 15.7),
 ('bayesian_ridge (x-learner)', 15.8),
 ('knn (t-learner)', 16.0),
 ('elasticnet (t-learner)', 16.6),
 ('bayesian_ridge (s-learner)', 16.6),
 ('mlp (s-learner)', 16.6),
 ('elasticnet (x-learner)', 17.6),
 ('ridge (s-learner)', 17.8),
 ('decision_tree (t-learner)', 17.8),
 ('sgd (x-learner)', 18.3),
 ('knn (x-learner)', 18.4),
 ('bayesian_ridge (t-learner)', 18.5),
 ('extra_tree (s-learner)', 18.6),
 ('linear (x-learner)', 18.8),
 ('ard (t-learner)', 19.1),
 ('decision_tree (x-learner)', 19.4),
 ('ard (s-learner)', 19.5),
 ('sgd (t-learner)', 19.6),
 ('linear (s-learner)', 20.2),
 ('gauss_process (x-learner)', 20.4),
 ('knn (s-learner)', 20.9),
 ('gauss_process

In [12]:
# plot_cate(SVMModel())

In [13]:
# plot_cate(RFModel())

In [14]:
# plot_cate(RFBiModel())

In [15]:
# plot_cate(RFXModel())

In [16]:
for d in data:
    plot_cate(d, BaseBiRegressor(BayesianRidge))

NameError: name 'data' is not defined

In [None]:
from nudge.test.prior import test_cate