In [90]:
from source.missing_imputation import definite_regression_imputation
import numpy as np
from source.base_component import extract_features, remove_outliers, intersection, union
from source.outlier_detection import cook_distance, dffits
from source.feature_selection import stepwise_feature_selection, lasso, marginal_screening
from source.pipeline import make_pipeline, make_dataset

from temp.missing_imputation import regression_definite_imputation as rdi_matsu
from temp.feature_selection_si import sfs_si, lasso_si, ms_si
from temp.outlier_removal_si import cook_si
from temp.common_func import union as union_si

def option():
    """Build the reference pipeline that the manual walkthrough is compared against.

    The stages are declared in order: regression-based imputation of the
    response, marginal screening, DFFITS-based outlier removal, and finally
    the intersection of the stepwise and lasso feature selections.

    Returns:
        The object produced by ``make_pipeline`` with that intersection as
        its output.
    """
    features, response = make_dataset()
    response = definite_regression_imputation(features, response)

    # Marginal screening (parameter 5), then keep only the screened features.
    screened = marginal_screening(features, response, 5)
    features = extract_features(features, screened)

    # DFFITS outlier detection (parameter 3.0), then drop what it flags.
    outliers = dffits(features, response, 3.0)
    features, response = remove_outliers(features, response, outliers)

    # Two selectors run on the same cleaned data; the pipeline output is
    # whatever `intersection` makes of their two results.
    stepwise_selected = stepwise_feature_selection(features, response, 3)
    lasso_selected = lasso(features, response, 0.08)
    return make_pipeline(output=intersection(stepwise_selected, lasso_selected))

# Problem size used by the data-generation cell: n rows, p feature columns.
n = 100
p = 10

# Seeded generator shared by the cells below so the random draws are repeatable
# from a fresh kernel.
rng = np.random.default_rng(seed=2)


In [125]:
# Synthetic data: standard-normal design matrix (n x p) and response (length n).
X = rng.normal(size=(n, p))
y = rng.normal(size=(n,))

# Shift the first five responses by +4 (presumably to plant outliers for the
# outlier-removal step -- confirm).
y[:5] = y[:5] + 4

# Blank out n // 10 distinct responses, chosen at random without replacement.
n_missing = n // 10
row_ids = list(range(n))
missing = rng.choice(row_ids, size=n_missing, replace=False)
y[missing] = np.nan
print(missing)

# Passed to the imputation step in the manual walkthrough cell
# (presumably the noise standard deviation -- confirm).
sigma = 1.0

# Sanity check: sorted positions of the NaN entries actually present in y.
print(np.flatnonzero(np.isnan(y)))

[87 59 18 20 80  6  1 94 71 24]
[ 1  6 18 20 24 59 71 80 87 94]


In [58]:
from temp import (
    missing_imputation, feature_selection, common_func, outlier_removal,
    feature_selection_si, outlier_removal_si
)
from sicore import tn_cdf_mpmath

# Hyperparameters of the individual stages (same values as in `option()`).
k_ms = 5
lamda_dffits = 3
k_sfs = 3
lamda_lasso = 0.08

# Initial conditions: every feature is a candidate, no outliers removed yet.
M_obs = list(range(p))
O_obs = []

# Missing-value imputation.
# NOTE(review): this rebinds X and y, so any cell executed afterwards sees
# whatever the imputation returned instead of the raw data -- re-run the
# data-generation cell first if the raw (NaN-containing) y is needed.
X, y, cov = missing_imputation.regression_definite_imputation(X, y, sigma)
print(cov[0][:10])

# Feature selection (marginal screening)
M1_obs = feature_selection.ms(X, y, M_obs, O_obs, k_ms)

# Outlier removal (DFFITS)
O_obs = outlier_removal.dffits(X, y, M1_obs, O_obs, lamda_dffits)

# Feature selection (lasso)
M2_obs = feature_selection.lasso(X, y, M1_obs, O_obs, lamda_lasso)

# Feature selection (stepwise / SFS)
M3_obs = feature_selection.sfs(X, y, M1_obs, O_obs, k_sfs)

# Observed selection: features kept by both lasso and SFS.
M_obs = common_func.intersect(M2_obs, M3_obs)

print(M_obs)

if len(M_obs) != 0:
    # Position in M_obs of the feature to test; fixed to the first one here
    # (the name suggests it was once drawn at random).
    rand_value = 0
    j_selected = M_obs[rand_value]

    a, b, z_obs, var, etaj = common_func.compute_teststatistics(
        X, y, M_obs, O_obs, j_selected, cov
    )
    print("a", a[:5])
    print("b", b[:5])
    print("eta\n", etaj[20:30])
    std = np.sqrt(var)  # standard deviation of the test statistic
    print("etasigmaeta", std**2)
    print("eta @ y", z_obs)

    # Search range for z: 10 standard deviations plus |z_obs| on either side of 0.
    z_min, z_max = -10 * std - np.abs(z_obs), 10 * std + np.abs(z_obs)
    z = z_min
    # Intervals [l, u] of z on which the observed selection is reproduced.
    interval = []
    while z < z_max:
        # Reset the state for this z: all features, no outliers, and an
        # unbounded truncation interval.
        M = list(range(p))
        O = []
        # np.NINF / np.Inf were removed in NumPy 2.0; np.inf is the portable spelling.
        l = -np.inf
        u = np.inf

        # ms
        M1, O, l, u = feature_selection_si.ms_si(a, b, z, X, M, O, l, u, k_ms)

        # dffits
        M1, O, l, u = outlier_removal_si.dffits_si(
            a, b, z, X, M1, O, l, u, lamda_dffits
        )

        # lasso
        M2, O, l, u = feature_selection_si.lasso_si(
            a, b, z, X, M1, O, l, u, lamda_lasso
        )

        # sfs
        M3, O, l, u = feature_selection_si.sfs_si(a, b, z, X, M1, O, l, u, k_sfs)

        M = common_func.intersect(M2, M3)

        # Keep the interval only when this z leads to the same outliers and
        # features as the observed data.
        if set(O_obs) == set(O) and set(M_obs) == set(M):
            interval.append([l, u])

        # Continue just past the upper end of the interval returned for this z.
        z = u + 1e-6

    if len(interval) != 0:
        # Standardise the intervals once and reuse them for printing and the p-value.
        normalized_interval = (np.array(interval) / std).tolist()
        print(normalized_interval)
        p_value = 1 - tn_cdf_mpmath(
            z_obs / std, normalized_interval, absolute=True
        )
        print(z_obs / std, p_value)

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[7]
a [4.77906572 4.58523109 4.4517832  4.38367428 2.05088631]
b [-0.26205527  0.05525864 -0.17996521  0.07328255  0.38801077]
eta
 [-0.00411753  0.00585687  0.00484071  0.01683394 -0.00836314  0.01083084
 -0.01736851 -0.00142074  0.00824369 -0.00760412]
etasigmaeta 0.013563275419972428
eta @ y 0.1121098225998619
[[0.9338474077151518, 5.077858777704228]]
0.9626343949536033 0.958184007949929


In [126]:
# Run the library pipeline on the current X, y and show the detected outliers
# together with the positions of the missing responses.
pipeline = option()
M, O = pipeline(X, y)
print(O)
print(np.where(np.isnan(y))[0])

# Selective inference for every selected feature; prints one
# "feature result" line per entry.
# (A stray `rng.choice(len(M))` whose result was discarded used to sit here; it
# only advanced the shared generator, so it has been removed.)
M, results = pipeline.inference(X, y, 1.0, is_result=False, step=1e-6)
for m, result in zip(M, results):
    print(m, result)


# Internals for the first tested feature, printed for comparison with the
# manual walkthrough cell.
eta = pipeline.etas[0]
print(eta[20:30])

print(pipeline.cov[0][:10])

calculator = pipeline.calculators[0]
print(calculator.z[:5])  # counterpart of `a` in the manual cell
print(calculator.c[:5])  # counterpart of `b` in the manual cell
print()
# The stored variance and the directly computed quadratic form should agree.
print(calculator.eta_sigma_eta)
print(eta @ pipeline.cov @ eta)

print(pipeline.y @ eta)
print(pipeline.cov.shape)

# Output noted down from earlier runs of this cell (kept for reference;
# presumably a selected-feature list followed by "feature p-value" -- confirm):
# [0, 9, 6]
# 6 0.7779557147561018
# [0, 1, 4]
# 0 0.5964551118283856
# [1, 7]
# 1 0.19791364361525088

[0, 2, 4, 56, 58, 73, 75]
[ 1  6 18 20 24 59 71 80 87 94]
0 0.6934181866572308
1 0.025955596273525505
4 0.16065909068905648
[ 0.01390324 -0.00872313  0.0079265   0.00892913 -0.00299641 -0.00712016
 -0.00347202  0.00583854 -0.02257527  0.0091577 ]
[ 1.         -0.06671247  0.          0.          0.          0.
 -0.0272425   0.          0.          0.        ]
[ 3.56244658 -0.01405815  4.15474471  4.54354462  3.83034976]
[-0.47508214  1.66841387 -0.35021849 -0.08279206  0.23680824]

0.013187105407843687
0.013187105407843687
-0.10001575365146126
(100, 100)
