In [2]:
# import
import numpy as np
import source.pipelineprocesser as plp

In [5]:
# definition of pipeline option1
def option1():
    """Build and return the "option 1" pipeline.

    The stages are wired in this order (arguments are forwarded to the
    ``plp`` helpers unchanged):

    1. ``make_dataset`` creates the feature/response placeholders.
    2. ``mean_value_imputation`` produces the response used downstream.
    3. ``cook_distance`` (argument ``3.0``) yields an outlier set that
       ``remove_outliers`` strips from both features and response.
    4. ``marginal_screening`` (argument ``5``) yields a feature set that
       ``extract_features`` applies to the features.
    5. ``stepwise_feature_selection`` (argument ``3``) and ``lasso``
       (argument ``0.08``) each select features; ``union`` combines the two.

    Returns:
        The object returned by ``plp.make_pipeline`` with the combined
        selection as its output.
    """
    features, response = plp.make_dataset()
    response = plp.mean_value_imputation(features, response)

    # Outlier handling comes first in this option.
    outliers = plp.cook_distance(features, response, 3.0)
    features, response = plp.remove_outliers(features, response, outliers)

    # Screening is applied to the outlier-free data.
    screened = plp.marginal_screening(features, response, 5)
    features = plp.extract_features(features, screened)

    # Two selectors run on the same inputs; their results are merged by union.
    stepwise_selected = plp.stepwise_feature_selection(features, response, 3)
    lasso_selected = plp.lasso(features, response, 0.08)
    selected = plp.union(stepwise_selected, lasso_selected)
    return plp.make_pipeline(output=selected)

# definition of pipeline option2
def option2():
    """Build and return the "option 2" pipeline.

    The stages are wired in this order (arguments are forwarded to the
    ``plp`` helpers unchanged):

    1. ``make_dataset`` creates the feature/response placeholders.
    2. ``definite_regression_imputation`` produces the response used
       downstream.
    3. ``marginal_screening`` (argument ``7``) yields a feature set that
       ``extract_features`` applies to the features.
    4. ``dffits`` (argument ``2.0``) yields an outlier set that
       ``remove_outliers`` strips from both features and response.
    5. ``stepwise_feature_selection`` (argument ``5``) and ``lasso``
       (argument ``0.05``) each select features; ``intersection`` combines
       the two.

    Returns:
        The object returned by ``plp.make_pipeline`` with the combined
        selection as its output.
    """
    features, response = plp.make_dataset()
    response = plp.definite_regression_imputation(features, response)

    # Screening comes first in this option.
    screened = plp.marginal_screening(features, response, 7)
    features = plp.extract_features(features, screened)

    # Outlier handling is applied to the already-screened features.
    outliers = plp.dffits(features, response, 2.0)
    features, response = plp.remove_outliers(features, response, outliers)

    # Two selectors run on the same inputs; only features chosen by both remain.
    stepwise_selected = plp.stepwise_feature_selection(features, response, 5)
    lasso_selected = plp.lasso(features, response, 0.05)
    selected = plp.intersection(stepwise_selected, lasso_selected)
    return plp.make_pipeline(output=selected)


# Build the first pipeline and show its structure, followed by a blank
# separator line.
op1 = option1()
print('Option1:', op1, '', sep='\n')

# Build the second pipeline and show its structure.
op2 = option2()
print('Option2:', op2, sep='\n')


Option1:
start -> mean_value_imputation_2
mean_value_imputation_2 -> cook_distance_2
cook_distance_2 -> remove_4
remove_4 -> marginal_screening_4
marginal_screening_4 -> extract_4
extract_4 -> lasso_4
extract_4 -> stepwise_feature_selection_4
lasso_4 -> union_2
stepwise_feature_selection_4 -> union_2
union_2 -> end

Option2:
start -> definite_regression_imputation_2
definite_regression_imputation_2 -> marginal_screening_5
marginal_screening_5 -> extract_5
extract_5 -> dffits_2
dffits_2 -> remove_5
remove_5 -> lasso_5
remove_5 -> stepwise_feature_selection_5
lasso_5 -> intersection_2
stepwise_feature_selection_5 -> intersection_2
intersection_2 -> end


In [11]:
# Apply the defined pipeline (option1) to an actual dataset.
# The data here is randomly generated: X and y are drawn independently from a
# standard normal distribution, so y has no true dependence on any column of X.
n, p = 100, 10  # number of samples, number of features

rng = np.random.default_rng(0)  # fixed seed so the draws below are reproducible
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
sigma = 1.0  # forwarded to op1.inference below; presumably the noise standard deviation — TODO confirm

M, O = op1(X, y) # the pipeline object can be called like a function
print('selected features:', M)
print('detected outliers:', O)
print()

# Inference for each selected feature: the features and p-values returned by
# op1.inference are paired one-to-one and printed with three decimals.
M, p_list = op1.inference(X, y, sigma)
for each_feature, p_value in zip(M, p_list):
    print(f'feature:{each_feature} p-value:{p_value:.3f}')


selected features: [0, 2, 3, 6, 9]
detected outliers: [19, 48, 51, 59, 64, 74, 77, 94]

feature:0 p-value:0.243
feature:2 p-value:0.133
feature:3 p-value:0.805
feature:6 p-value:0.928
feature:9 p-value:0.692
