In [1]:
import numpy as np
from scipy.stats import norm
from tqdm import tqdm

import source.pipelineprocesser as plp
from sicore import SelectiveInferenceNorm


def option1_cv():
    """Assemble the first candidate pipeline, with tunable selection steps.

    Steps, in order: mean-value imputation of the response, outlier removal
    based on Cook's distance, marginal screening, and finally the union of
    the feature sets produced by stepwise selection and by lasso.

    A trailing set argument (``{2, 3}``, ``{0.08, 0.12}``) is presumably the
    grid of candidate values explored when the pipeline is tuned; steps
    called without one keep their single fixed value -- TODO confirm
    against ``plp``.

    Returns:
        The object built by ``plp.make_pipeline`` with the unioned feature
        set as its output.
    """
    features, response = plp.make_dataset()
    response = plp.mean_value_imputation(features, response)

    # Parameter fixed at 3.0; the tunable variant would also pass {2.0, 3.0}.
    outliers = plp.cook_distance(features, response, 3.0)
    features, response = plp.remove_outliers(features, response, outliers)

    # Parameter fixed at 5; the tunable variant would also pass {3, 5}.
    screened = plp.marginal_screening(features, response, 5)
    features = plp.extract_features(features, screened)

    stepwise_selected = plp.stepwise_feature_selection(features, response, 3, {2, 3})
    lasso_selected = plp.lasso(features, response, 0.08, {0.08, 0.12})
    return plp.make_pipeline(output=plp.union(stepwise_selected, lasso_selected))


def option2_cv():
    """Assemble the second candidate pipeline, with tunable selection steps.

    Steps, in order: definite regression imputation of the response,
    marginal screening, outlier removal based on DFFITS, and finally the
    intersection of the feature sets produced by stepwise selection and by
    lasso.

    A trailing set argument (``{2.0, 3.0}``, ``{2, 3}``) is presumably the
    grid of candidate values explored when the pipeline is tuned; steps
    called without one keep their single fixed value -- TODO confirm
    against ``plp``.

    Returns:
        The object built by ``plp.make_pipeline`` with the intersected
        feature set as its output.
    """
    features, response = plp.make_dataset()
    response = plp.definite_regression_imputation(features, response)

    # Parameter fixed at 5; the tunable variant would also pass {3, 5}.
    screened = plp.marginal_screening(features, response, 5)
    features = plp.extract_features(features, screened)

    outliers = plp.dffits(features, response, 3.0, {2.0, 3.0})
    features, response = plp.remove_outliers(features, response, outliers)

    stepwise_selected = plp.stepwise_feature_selection(features, response, 3, {2, 3})
    # Parameter fixed at 0.08; the tunable variant would also pass {0.08, 0.12}.
    lasso_selected = plp.lasso(features, response, 0.08)
    return plp.make_pipeline(output=plp.intersection(stepwise_selected, lasso_selected))

# Simulation settings: sample size, number of features, and the value written
# into the first three coefficients. With delta == 0.0 all coefficients are
# zero, so y is pure noise.
n, p = 200, 10
delta = 0.0

# Debug run on a single seed: print diagnostics for the first draw that yields
# a non-empty selected model (and, when delta != 0, a selected feature among
# the first three), then stop.
for seed in [29]:
    rng = np.random.default_rng(seed)

    for _ in range(1000):
        X = rng.normal(size=(n, p))
        noise = rng.normal(size=n)

        beta = np.zeros(p)
        beta[:3] = delta
        y = X @ beta + noise

        # Replace a Binomial(n, 0.03) number of distinct responses with NaN.
        num_missing = rng.binomial(n, 0.03)
        mask = rng.choice(n, num_missing, replace=False)
        y[mask] = np.nan

        mpls = plp.make_pipelines(option1_cv(), option2_cv())
        mpls.tune(X, y, n_iters=4, cv=5, random_state=seed)

        M, _ = mpls(X, y)
        if len(M) == 0:
            continue  # nothing selected: draw a fresh dataset

        index = rng.choice(len(M))
        if delta != 0.0 and M[index] not in range(3):
            continue  # under a non-zero signal, only test the first three features

        mpls.inference(X, y, 1.0, index, is_result=True)
        print(mpls.etas[0].shape)

        # Re-run the algorithm directly on the decomposition of the observed
        # (non-NaN) responses produced by SelectiveInferenceNorm.
        mpls.reset_intervals()
        si = SelectiveInferenceNorm(y[~np.isnan(y)], 1.0, mpls.etas[0])
        temp, intervals = mpls.algorithm(si.z, si.c, si.stat)

        print()
        print("algo", *temp[2:])
        print("tune", mpls.best_index, mpls.pipelines[mpls.best_index].best_candidate)
        # Sanity check: the observed responses should equal z + c * stat.
        print(np.allclose(y[~np.isnan(y)], si.z + si.c * si.stat))
        print([pipeline.best_mse for pipeline in mpls.pipelines])
        break

[1.230003309842305, 1.2310896332003398, 1.2310896332003398, 1.230003309842305]
[1.294187358197775, 1.2058364192738382, 1.1918555148049264, 1.2299885651299713]
1.2300033098423053 {'lasso_0': 0.12, 'stepwise_feature_selection_0': 3}
1.2310896332003396 {'lasso_0': 0.08, 'stepwise_feature_selection_0': 3}
1.2310896332003396 {'lasso_0': 0.08, 'stepwise_feature_selection_0': 2}
1.2300033098423053 {'lasso_0': 0.12, 'stepwise_feature_selection_0': 2}
1.2941873581977748 {'dffits_0': 3.0, 'stepwise_feature_selection_1': 3}
1.2058364192738382 {'dffits_0': 2.0, 'stepwise_feature_selection_1': 3}
1.1918555148049261 {'dffits_0': 2.0, 'stepwise_feature_selection_1': 2}
1.2299885651299713 {'dffits_0': 3.0, 'stepwise_feature_selection_1': 2}
1.229572239207292 {'lasso_0': 0.12, 'stepwise_feature_selection_0': 3}
1.2307296790446556 {'lasso_0': 0.08, 'stepwise_feature_selection_0': 3}
1.2307296790446556 {'lasso_0': 0.08, 'stepwise_feature_selection_0': 2}
1.229572239207292 {'lasso_0': 0.12, 'stepwise_feat

In [1]:
import numpy as np
from scipy.stats import norm
from tqdm import tqdm

import source.pipelineprocesser as plp
from sicore import SelectiveInferenceNorm


def option1_cv():
    """Assemble the first candidate pipeline, with tunable selection steps.

    Steps, in order: mean-value imputation of the response, outlier removal
    based on Cook's distance, marginal screening, and finally the union of
    the feature sets produced by stepwise selection and by lasso.

    A trailing set argument (``{2, 3}``, ``{0.08, 0.12}``) is presumably the
    grid of candidate values explored when the pipeline is tuned; steps
    called without one keep their single fixed value -- TODO confirm
    against ``plp``.

    Returns:
        The object built by ``plp.make_pipeline`` with the unioned feature
        set as its output.
    """
    features, response = plp.make_dataset()
    response = plp.mean_value_imputation(features, response)

    # Parameter fixed at 3.0; the tunable variant would also pass {2.0, 3.0}.
    outliers = plp.cook_distance(features, response, 3.0)
    features, response = plp.remove_outliers(features, response, outliers)

    # Parameter fixed at 5; the tunable variant would also pass {3, 5}.
    screened = plp.marginal_screening(features, response, 5)
    features = plp.extract_features(features, screened)

    stepwise_selected = plp.stepwise_feature_selection(features, response, 3, {2, 3})
    lasso_selected = plp.lasso(features, response, 0.08, {0.08, 0.12})
    return plp.make_pipeline(output=plp.union(stepwise_selected, lasso_selected))


def option2_cv():
    """Assemble the second candidate pipeline, with tunable selection steps.

    Steps, in order: definite regression imputation of the response,
    marginal screening, outlier removal based on DFFITS, and finally the
    intersection of the feature sets produced by stepwise selection and by
    lasso.

    A trailing set argument (``{2.0, 3.0}``, ``{2, 3}``) is presumably the
    grid of candidate values explored when the pipeline is tuned; steps
    called without one keep their single fixed value -- TODO confirm
    against ``plp``.

    Returns:
        The object built by ``plp.make_pipeline`` with the intersected
        feature set as its output.
    """
    features, response = plp.make_dataset()
    response = plp.definite_regression_imputation(features, response)

    # Parameter fixed at 5; the tunable variant would also pass {3, 5}.
    screened = plp.marginal_screening(features, response, 5)
    features = plp.extract_features(features, screened)

    outliers = plp.dffits(features, response, 3.0, {2.0, 3.0})
    features, response = plp.remove_outliers(features, response, outliers)

    stepwise_selected = plp.stepwise_feature_selection(features, response, 3, {2, 3})
    # Parameter fixed at 0.08; the tunable variant would also pass {0.08, 0.12}.
    lasso_selected = plp.lasso(features, response, 0.08)
    return plp.make_pipeline(output=plp.intersection(stepwise_selected, lasso_selected))

# Simulation settings: sample size, number of features, and the value written
# into the first three coefficients. With delta == 0.0 all coefficients are
# zero, so y is pure noise.
n, p = 200, 10
delta = 0.0

# Sweep over seeds: for each seed, run inference once on the first draw that
# yields a non-empty selected model, and report any exception it raises.
for seed in tqdm(range(100)):
    rng = np.random.default_rng(seed)

    for _ in range(1000):
        X = rng.normal(size=(n, p))
        noise = rng.normal(size=n)

        beta = np.zeros(p)
        beta[:3] = delta
        y = X @ beta + noise

        # Replace a Binomial(n, 0.03) number of distinct responses with NaN.
        num_missing = rng.binomial(n, 0.03)
        mask = rng.choice(n, num_missing, replace=False)
        y[mask] = np.nan

        mpls = plp.make_pipelines(option1_cv(), option2_cv())
        mpls.tune(X, y, n_iters=4, cv=5, random_state=seed)

        M, _ = mpls(X, y)
        if len(M) == 0:
            continue  # nothing selected: draw a fresh dataset

        index = rng.choice(len(M))
        if delta != 0.0 and M[index] not in range(3):
            continue  # under a non-zero signal, only test the first three features

        try:
            ans = mpls.inference(X, y, 1.0, index, is_result=True)
        except Exception as e:
            # Report which seed failed and why, then move on to the next seed.
            print(seed)
            print(e)
        # One inference attempt per seed, whether it succeeded or failed.
        break

100%|██████████| 100/100 [08:36<00:00,  5.16s/it]
