In [1]:
# import
import numpy as np
import source.pipelineprocesser as plp

In [2]:
# definition of pipeline option1
def option1():
    """Assemble pipeline option 1 and return the resulting pipeline object.

    Steps, in the order they are registered with ``plp``:

    1. mean-value imputation; its result replaces ``y``,
    2. Cook's-distance outlier detection (argument ``3.0``) followed by
       removal of the detected outliers,
    3. marginal screening (argument ``5``) and extraction of the screened
       features,
    4. stepwise feature selection (argument ``3``) and lasso
       (argument ``0.08``), both applied to the extracted features,
    5. union of the two selections, which is declared as the pipeline output.

    The numeric arguments are forwarded to ``plp`` unchanged; their precise
    meaning is defined by that module.
    """
    design, response = plp.make_dataset()
    response = plp.mean_value_imputation(design, response)

    # Outlier handling happens before any feature screening in this option.
    outliers = plp.cook_distance(design, response, 3.0)
    design, response = plp.remove_outliers(design, response, outliers)

    screened = plp.marginal_screening(design, response, 5)
    design = plp.extract_features(design, screened)

    # Two selectors run on the same inputs; their results are merged by union.
    by_stepwise = plp.stepwise_feature_selection(design, response, 3)
    by_lasso = plp.lasso(design, response, 0.08)
    selected = plp.union(by_stepwise, by_lasso)
    return plp.make_pipeline(output=selected)

# definition of pipeline option2
def option2():
    """Assemble pipeline option 2 and return the resulting pipeline object.

    Steps, in the order they are registered with ``plp``:

    1. definite-regression imputation; its result replaces ``y``,
    2. marginal screening (argument ``7``) and extraction of the screened
       features,
    3. DFFITS outlier detection (argument ``2.0``) on the extracted features,
       followed by removal of the detected outliers,
    4. stepwise feature selection (argument ``5``) and lasso
       (argument ``0.05``), both applied after outlier removal,
    5. intersection of the two selections, which is declared as the pipeline
       output.

    The numeric arguments are forwarded to ``plp`` unchanged; their precise
    meaning is defined by that module.
    """
    design, response = plp.make_dataset()
    response = plp.definite_regression_imputation(design, response)

    # Unlike option1, screening comes first and outliers are handled afterwards.
    screened = plp.marginal_screening(design, response, 7)
    design = plp.extract_features(design, screened)

    outliers = plp.dffits(design, response, 2.0)
    design, response = plp.remove_outliers(design, response, outliers)

    # Two selectors run on the same inputs; only features chosen by both are kept.
    by_stepwise = plp.stepwise_feature_selection(design, response, 5)
    by_lasso = plp.lasso(design, response, 0.05)
    selected = plp.intersection(by_stepwise, by_lasso)
    return plp.make_pipeline(output=selected)


In [3]:
# Build option1 first, then option2, and show each pipeline's structure
# under its heading. The build order is kept as-is because the numeric
# suffixes of the printed node names differ between the two pipelines.
op1 = option1()
print('Option1:', op1, sep='\n')
print()

op2 = option2()
print('Option2:', op2, sep='\n')


Option1:
start -> mean_value_imputation_0
mean_value_imputation_0 -> cook_distance_0
cook_distance_0 -> remove_0
remove_0 -> marginal_screening_0
marginal_screening_0 -> extract_0
extract_0 -> stepwise_feature_selection_0
extract_0 -> lasso_0
stepwise_feature_selection_0 -> union_0
lasso_0 -> union_0
union_0 -> end

Option2:
start -> definite_regression_imputation_0
definite_regression_imputation_0 -> marginal_screening_1
marginal_screening_1 -> extract_1
extract_1 -> dffits_0
dffits_0 -> remove_1
remove_1 -> lasso_1
remove_1 -> stepwise_feature_selection_1
lasso_1 -> intersection_0
stepwise_feature_selection_1 -> intersection_0
intersection_0 -> end


In [4]:
# Apply the defined pipeline (option1) to a synthetic dataset: X is an (n, p)
# array and y a length-n array, both drawn from rng.normal with its default
# parameters, so y is generated without reference to X.
n, p = 100, 10

# Seeded generator so the draws are reproducible; X is drawn before y.
rng = np.random.default_rng(0)
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
# Passed to op1.inference below; presumably the noise standard deviation — confirm against plp.
sigma = 1.0

M, O = op1(X, y) # the pipeline is callable; here it returns (selected features, detected outliers)
print('selected features:', M)
print('detected outliers:', O)
print()

# Inference for each selected feature: M is rebound to the features returned
# by inference(), which are paired element-wise with the p-values in p_list.
M, p_list = op1.inference(X, y, sigma)
for each_feature, p_value in zip(M, p_list):
    print(f'feature:{each_feature} p-value:{p_value:.3f}')


selected features: [0, 2, 3, 6, 9]
detected outliers: [19, 48, 51, 59, 64, 74, 77, 94]

feature:0 p-value:0.243
feature:2 p-value:0.133
feature:3 p-value:0.805
feature:6 p-value:0.928
feature:9 p-value:0.692


In [19]:
def option3():
    """Assemble pipeline option 3 and return the resulting pipeline object.

    The pipeline is marginal screening -> feature extraction -> lasso, with
    the lasso selection declared as the pipeline output.

    Both selection steps receive an extra list argument after their numeric
    setting (``5`` with ``[3, 5, 7]``; ``0.08`` with ``[0.05, 0.08, 0.1]``).
    These lists appear to be the candidate values explored by the pipeline's
    ``tune`` method -- confirm against ``plp``.
    """
    design, response = plp.make_dataset()

    screened = plp.marginal_screening(design, response, 5, [3, 5, 7])
    design = plp.extract_features(design, screened)

    selected = plp.lasso(design, response, 0.08, [0.05, 0.08, 0.1])
    return plp.make_pipeline(output=selected)

# Build the tunable pipeline and show its structure, followed by a blank line.
op3 = option3()
print(op3, end='\n\n')

# Tune on X and y, which are defined in the earlier data-generation cell.
# n_iter and cv are forwarded as given; presumably the number of candidate
# settings tried and the number of cross-validation folds -- confirm against plp.
op3.tune(X, y, n_iter=9, cv=5)
print("tuned parameters:", op3.best_candidate, end='\n\n')

# Calling the pipeline returns a sequence; its first element is printed as
# the selected features.
print("selected features:", op3(X, y)[0])

start -> marginal_screening_16
marginal_screening_16 -> extract_16
extract_16 -> lasso_16
lasso_16 -> end

tuned parameters: {'marginal_screening_16': 3, 'lasso_16': 0.05}

selected features: [0, 2, 1]
