In [1]:
# import
import numpy as np
import source.pipelineprocesser as plp

In [2]:
# definition of pipeline option1
def option1():
    X, y = plp.make_dataset()
    y = plp.mean_value_imputation(X, y)

    O = plp.cook_distance(X, y, 3.0)
    X, y = plp.remove_outliers(X, y, O)

    M = plp.marginal_screening(X, y, 5)
    X = plp.extract_features(X, M)

    M1 = plp.stepwise_feature_selection(X, y, 3)
    M2 = plp.lasso(X, y, 0.08)
    M = plp.union(M1, M2)
    return plp.make_pipeline(output=M)

# definition of pipeline option2
def option2():
    X, y = plp.make_dataset()
    y = plp.definite_regression_imputation(X, y)

    M = plp.marginal_screening(X, y, 7)
    X = plp.extract_features(X, M)

    O = plp.dffits(X, y, 2.0)
    X, y = plp.remove_outliers(X, y, O)

    M1 = plp.stepwise_feature_selection(X, y, 5)
    M2 = plp.lasso(X, y, 0.05)
    M = plp.intersection(M1, M2)
    return plp.make_pipeline(output=M)


In [3]:
# define the pipeline instance (option1)
op1 = option1()
# print the pipeline structure
print('Option1:')
print(op1)
print()

# define the pipeline instance (option2)
op2 = option2()
# print the pipeline structure
print('Option2:')
print(op2)


Option1:
start -> mean_value_imputation_0
mean_value_imputation_0 -> cook_distance_0
cook_distance_0 -> remove_0
remove_0 -> marginal_screening_0
marginal_screening_0 -> extract_0
extract_0 -> lasso_0
extract_0 -> stepwise_feature_selection_0
lasso_0 -> union_0
stepwise_feature_selection_0 -> union_0
union_0 -> end

Option2:
start -> definite_regression_imputation_0
definite_regression_imputation_0 -> marginal_screening_1
marginal_screening_1 -> extract_1
extract_1 -> dffits_0
dffits_0 -> remove_1
remove_1 -> lasso_1
remove_1 -> stepwise_feature_selection_1
lasso_1 -> intersection_0
stepwise_feature_selection_1 -> intersection_0
intersection_0 -> end


In [4]:
# apply the defined pipeline (option1) to a actual dataset
n, p = 100, 10

rng = np.random.default_rng(0)
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
num_missing = rng.binomial(n, 0.03)
mask = rng.choice(n, num_missing, replace=False)
y[mask] = np.nan
sigma = 1.0

M, O = op1(X, y) # pipeline can be used as a function
print('selected features:', M)
print('detected outliers:', O)
print()

# inference for each selected features
M, p_list = op1.inference(X, y, sigma)
for each_feature, p_value in zip(M, p_list):
    print(f'feature:{each_feature} p-value:{p_value:.3f}')


selected features: [0, 2, 3, 6, 9]
detected outliers: [19, 48, 51, 59, 64, 74, 77, 94]

feature:0 p-value:0.514
feature:2 p-value:0.132
feature:3 p-value:0.610
feature:6 p-value:0.821
feature:9 p-value:0.489


In [7]:
# definition of pipeline option1 for cross-validation
def option1_cv():
    X, y = plp.make_dataset()
    y = plp.mean_value_imputation(X, y)

    O = plp.cook_distance(X, y, 3.0, {2.0, 3.0}) # candidates
    X, y = plp.remove_outliers(X, y, O)

    M = plp.marginal_screening(X, y, 5, {3, 5}) # candidates
    X = plp.extract_features(X, M)

    M1 = plp.stepwise_feature_selection(X, y, 3)
    M2 = plp.lasso(X, y, 0.08)
    M = plp.union(M1, M2)
    return plp.make_pipeline(output=M)

# definition of pipeline option2 for cross-validation
def option2_cv():
    X, y = plp.make_dataset()
    y = plp.definite_regression_imputation(X, y)

    M = plp.marginal_screening(X, y, 5)
    X = plp.extract_features(X, M)

    O = plp.dffits(X, y, 3.0)
    X, y = plp.remove_outliers(X, y, O)

    M1 = plp.stepwise_feature_selection(X, y, 3, {2, 3}) # candidates
    M2 = plp.lasso(X, y, 0.08, {0.08, 0.12}) # candidates
    M = plp.intersection(M1, M2)
    return plp.make_pipeline(output=M)


# define the pipeline instance
op1cv = option1_cv()
op2cv = option2_cv()

# cross-validation
op1cv.tune(X, y, n_iter=4, cv=5, random_state=0)
op2cv.tune(X, y, n_iter=4, cv=5, random_state=0)

# print the tuned parameters
print("tuned parameters (op1):", op1cv.best_candidate)
print("tuned parameters (op2):", op2cv.best_candidate)
print()

M, O = op2cv(X, y) # pipeline can be used as a function
print('selected features:', M)
print('detected outliers:', O)
print()

# inference for each selected features
M, p_list = op2.inference(X, y, sigma)
for each_feature, p_value in zip(M, p_list):
    print(f'feature:{each_feature} p-value:{p_value:.3f}')

tuned parameters (op1): {'cook_distance_3': 3.0, 'marginal_screening_6': 3}
tuned parameters (op2): {'stepwise_feature_selection_7': 2, 'lasso_7': 0.08}

selected features: [0, 2]
detected outliers: [19, 25, 48, 59, 64, 74, 77, 90, 94, 95]

feature:0 p-value:0.191
feature:1 p-value:0.930
feature:2 p-value:0.128
feature:3 p-value:0.725
feature:9 p-value:0.905


In [10]:
# multiple pipelines
mpls = plp.make_pipelines(op1cv, op2cv)

# cross-validation for multiple pipelines
mpls.tune(X, y, n_iters=4, cv=5, random_state=0)

# print the index of the best pipeline and the best parameters
print("best pipeline:", i := mpls.best_index)
print("best parameters:", mpls.pipelines[i].best_candidate)
print()

M, O = mpls(X, y) # pipeline can be used as a function
print('selected features:', M)
print('detected outliers:', O)
print()

# inference for each selected features
M, p_list = mpls.inference(X, y, sigma)
for each_feature, p_value in zip(M, p_list):
    print(f'feature:{each_feature} p-value:{p_value:.3f}')


best pipeline: 0
best parameters: {'cook_distance_3': 3.0, 'marginal_screening_6': 3}

selected features: [0, 2, 9]
detected outliers: [19, 48, 51, 59, 64, 74, 77, 94]

feature:0 p-value:0.372
feature:2 p-value:0.685
feature:9 p-value:0.956
