## 特徴選択，外れ値除去，欠損値補完に対して統一化(観測データを適用)

In [2]:
import numpy as np
from sklearn.linear_model import Lasso
from sicore import polytope_to_interval, intersection

In [7]:
# Toy data: 4 samples x 3 features, plus a response vector.
x = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])
y = np.array([1, 2, 3, 4])

# Score every feature by |x^T y| and order the column positions from the
# largest score to the smallest.
XTy_abs = np.abs(x.T @ y).flatten()
sort_XTy_abs = np.argsort(XTy_abs)[::-1]

# Every column starts out as a candidate feature.
M = [*range(x.shape[1])]
x = x[:, M]

# Number of features to keep.
k = 2

# Top-k positions and the remaining (unselected) positions.
A, Ac = sort_XTy_abs[:k], sort_XTy_abs[k:]

# Map the selected positions back to feature indices.
M = [M[pos] for pos in A]

print(x)
print(M)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[2, 1]


# 特徴選択

In [None]:
# M: selected features, O: detected outlier set, last argument: hyperparameter
def ms(X, y, M, O, k):
    """Keep the k candidate features with the largest |X^T y| (marginal screening).

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are still candidate features.
        O: row indices of X already flagged as outliers; they are removed
            before screening.
        k: number of features to keep.

    Returns:
        A tuple (M, O): the kept entries of the input M, ordered from the
        largest score to the smallest, and O unchanged.
    """
    # Remove the flagged samples from both X and y.
    kept_X = np.delete(X, [O], 0)
    kept_y = np.delete(y, [O]).reshape(-1, 1)

    # Keep only the candidate feature columns.
    candidates = kept_X[:, M]

    # Marginal screening: rank columns by absolute inner product with y.
    scores = np.abs(candidates.T @ kept_y).flatten()
    ranking = np.argsort(scores)[::-1]
    top = ranking[:k]

    # Positions refer to `candidates`; translate them back through M.
    return [M[pos] for pos in top], O

def lasso(X, y, M, O, lamda):
    """Select the candidate features that receive a nonzero Lasso coefficient.

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are still candidate features.
        O: row indices of X already flagged as outliers; they are removed
            before fitting.
        lamda: regularisation strength, passed to Lasso as ``alpha``.

    Returns:
        A tuple (M, O): the entries of the input M whose fitted coefficient
        is nonzero, and O unchanged.
    """
    # Remove the flagged samples from both X and y.
    X = np.delete(X, O, 0)
    y = np.delete(y, O).reshape(-1, 1)

    # Keep only the candidate feature columns.
    X = X[:, M]

    # Fit the Lasso without an intercept.
    clf = Lasso(alpha=lamda, fit_intercept=False, max_iter=5000, tol=1e-10)
    clf.fit(X, y)
    coef = clf.coef_

    # Column positions (within the reduced X) with a nonzero coefficient.
    # The previous version also built the complement of this set by iterating
    # over the integer X.shape[1], which raised TypeError on every call; that
    # complement and the coefficient signs were never used, so both are gone.
    A = np.where(coef != 0)[0].tolist()

    # Translate positions back to the original feature indices.
    M = [M[i] for i in A]

    return M, O

def sfs(X, y, M, O, k):
    """Pick up to k candidate features by stepwise forward selection.

    At each step the unselected column with the largest absolute inner
    product with the current least-squares residual is added.

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are still candidate features.
        O: row indices of X already flagged as outliers; they are removed
            before selection.
        k: number of features to select. Values larger than len(M) select
            every candidate (matching how ms() treats an oversized k)
            instead of failing on an empty candidate set.

    Returns:
        A tuple (M, O): the selected entries of the input M in selection
        order, and O unchanged.
    """
    # Remove the flagged samples from both X and y.
    X = np.delete(X, O, 0)
    y = np.delete(y, O).reshape(-1, 1)

    # Keep only the candidate feature columns.
    X = X[:, M]

    A = []                          # selected column positions, in order
    Ac = list(range(X.shape[1]))    # column positions not selected yet

    # Never run more steps than there are columns left to choose from.
    for _ in range(min(k, len(Ac))):
        # Residual of y after projecting onto the columns selected so far
        # (with nothing selected yet the projection is zero, so r == y).
        XA = X[:, A]
        r = y - XA @ np.linalg.pinv(XA.T @ XA) @ XA.T @ y
        correlation = X[:, Ac].T @ r

        # Position, within Ac, of the most correlated remaining column.
        index = int(np.argmax(np.abs(correlation)))
        A.append(Ac.pop(index))

    # Translate positions back to the original feature indices.
    M = [M[i] for i in A]

    return M, O

# 外れ値検出

In [None]:
def cook_distance(X, y, M, O, lamda):
    """Flag additional outliers using Cook's distance.

    Samples already listed in O are removed, the model is restricted to the
    columns in M, and every remaining sample whose Cook's distance is not
    below ``lamda / n`` (n = number of remaining samples) is flagged.

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are the selected features.
        O: row indices of X already flagged as outliers (list or array).
        lamda: threshold hyperparameter; the cut-off is ``lamda / n``.

    Returns:
        A tuple (M, O): M unchanged, and a new list holding the input O
        followed by the newly flagged row indices (in original numbering).
    """
    # Work with a plain list of ints so the final "+" is list concatenation
    # even when the caller passes a NumPy array.
    O = [int(i) for i in np.ravel(O)]

    # Original row index of every sample that survives the removal below;
    # used to translate positions in the reduced data back at the end.
    flagged = set(O)
    remaining = [i for i in range(X.shape[0]) if i not in flagged]

    # Remove the flagged samples from both X and y.
    X = np.delete(X, O, 0)
    y = np.delete(y, O).reshape(-1, 1)

    # Keep only the selected feature columns.
    X = X[:, M]

    n, p = X.shape

    hat_matrix = X @ np.linalg.inv(X.T @ X) @ X.T
    Px = np.identity(n) - hat_matrix
    threshold = lamda / n

    # Per-sample quantities, computed once instead of rebuilding an n x n
    # outer-product quadratic form for every sample.
    leverage = np.diag(hat_matrix)      # diagonal of the hat matrix
    residual = (Px @ y).ravel()         # i-th entry is e_i^T Px y
    rss = (y.T @ Px @ y).item()         # y^T Px y

    # D_i = (residual_i^2 / rss) * ((n - p) * h_i) / (p * (1 - h_i)^2)
    Di = (residual ** 2 / rss) * ((n - p) * leverage) / (p * (1 - leverage) ** 2)

    # A sample is kept only when D_i < threshold; anything else (including
    # NaN) is flagged, as in the original per-sample comparison.
    outlier = np.flatnonzero(~(Di < threshold)).tolist()

    # Translate positions in the reduced data back to original row indices.
    outlier2 = [remaining[i] for i in outlier]
    O = O + outlier2

    return M, O

def dffits(X, y, M, O, lamda):
    """Flag additional outliers using DFFITS.

    Samples already listed in O are removed, the model is restricted to the
    columns in M, and every remaining sample whose squared DFFITS value is
    not below ``lamda * p / (n - p)`` is flagged (n = remaining samples,
    p = number of selected features).

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are the selected features.
        O: row indices of X already flagged as outliers (list or array).
        lamda: threshold hyperparameter.

    Returns:
        A tuple (M, O): M unchanged, and a new list holding the input O
        followed by the newly flagged row indices (in original numbering).
    """
    # Work with a plain list of ints so the final "+" is list concatenation
    # even when the caller passes a NumPy array.
    O = [int(i) for i in np.ravel(O)]

    # Original row index of every sample that survives the removal below;
    # used to translate positions in the reduced data back at the end.
    flagged = set(O)
    remaining = [i for i in range(X.shape[0]) if i not in flagged]

    # Remove the flagged samples from both X and y.
    X = np.delete(X, O, 0)
    y = np.delete(y, O).reshape(-1, 1)

    # Keep only the selected feature columns.
    X = X[:, M]

    n, p = X.shape

    hat_matrix = X @ np.linalg.inv(X.T @ X) @ X.T
    Px = np.identity(n) - hat_matrix
    threshold = (lamda * p) / (n - p)

    # Per-sample quantities, computed once instead of rebuilding an n x n
    # outer-product quadratic form for every sample.
    leverage = np.diag(hat_matrix)      # diagonal of the hat matrix
    residual = (Px @ y).ravel()         # i-th entry is e_i^T Px y
    rss = (y.T @ Px @ y).item()         # y^T Px y

    # DFFITS_i = sqrt(h_i (n - p - 1)) / (1 - h_i)
    #            * residual_i / sqrt(rss - residual_i^2 / (1 - h_i))
    scale = np.sqrt(leverage * (n - p - 1)) / (1 - leverage)
    denominator = rss - residual ** 2 / (1 - leverage)
    DFFITS = scale * residual / np.sqrt(denominator)

    # A sample is kept only when DFFITS_i^2 < threshold; anything else
    # (including NaN) is flagged, as in the original per-sample comparison.
    outlier = np.flatnonzero(~(DFFITS ** 2 < threshold)).tolist()

    # Translate positions in the reduced data back to original row indices.
    outlier2 = [remaining[i] for i in outlier]
    O = O + outlier2

    return M, O

def soft_IPOD_lambda(X):
    """Estimate a regularisation strength for soft-IPOD by simulation.

    Draws 3000 standard-normal vectors (via the global ``np.random`` state),
    multiplies each by the transpose of I - X (X^T X)^{-1} X^T, and records
    the largest absolute entry.

    Args:
        X: 2-D design matrix with n rows.

    Returns:
        0.7 times the mean of the recorded maxima divided by n.
    """
    n_sim = 3000
    n = X.shape[0]

    # Matrix I - H, where H = X (X^T X)^{-1} X^T.
    projector = X @ np.linalg.inv(X.T @ X) @ X.T
    complement = np.identity(n) - projector

    maxima = []
    for _ in range(n_sim):
        noise = np.random.randn(n)
        maxima.append(np.max(np.abs(complement.T @ noise)))

    return 0.7 * np.mean(np.array(maxima) / n)

def soft_ipod(X, y, M, O, lamda):
    """Flag additional outliers with soft-IPOD.

    Samples already listed in O are removed and the model is restricted to
    the columns in M. A Lasso without intercept is then fitted with
    I - X (X^T X)^{-1} X^T as the design and that matrix times y as the
    response; every sample with a nonzero coefficient is flagged.

    Args:
        X: 2-D design matrix (rows are samples, columns are features).
        y: response values, one per row of X.
        M: column indices of X that are the selected features.
        O: row indices of X already flagged as outliers.
        lamda: regularisation strength, passed to Lasso as ``alpha``.

    Returns:
        A tuple (M, O): M unchanged, and the input O extended with the
        newly flagged row indices (in original numbering).
    """
    # Original row index of every sample that is not in O yet; used at the
    # end to translate positions in the reduced data back.
    surviving_rows = [row for row in range(X.shape[0]) if row not in O]

    # Remove the flagged samples from both X and y.
    X = np.delete(X, [O], 0)
    y = np.delete(y, [O]).reshape(-1, 1)

    # Keep only the selected feature columns.
    X = X[:, M]

    # Build I - H, where H = X (X^T X)^{-1} X^T, and apply it to y.
    n = X.shape[0]
    projector = X @ np.linalg.inv(X.T @ X) @ X.T
    PXperp = np.identity(n) - projector
    PXperpy = PXperp @ y

    # Run soft-IPOD: one Lasso coefficient per remaining sample.
    model = Lasso(alpha=lamda, fit_intercept=False, max_iter=5000, tol=1e-10)
    model.fit(PXperp, PXperpy)
    coef = model.coef_

    # Samples with a nonzero coefficient are the outliers.
    nonzero_positions = np.nonzero(coef != 0)[0].tolist()

    # Translate positions in the reduced data back to original row indices.
    newly_flagged = [surviving_rows[pos] for pos in nonzero_positions]

    return M, O + newly_flagged