In [1]:
# Methods for handling missing data
# 1. Univariate feature imputation
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputer: NaN entries are treated as missing and are replaced by the
# mean of their column, as learned from the three rows passed to fit().
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit([
    [1, 2],
    [np.nan, 3],
    [7, 6],
])
SimpleImputer()

SimpleImputer()

In [2]:
# Rows to impute: row 0 is missing its first feature, row 1 its second.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
print(X)

[[nan, 2], [6, nan], [7, 6]]


In [3]:
# Fill each NaN in X with the per-column mean that `imp` learned during fit.
print(imp.transform(X))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


In [5]:
# Inspect the imputer's current configuration (shown as a dict of parameters).
imp.get_params()

{'add_indicator': False,
 'copy': True,
 'fill_value': None,
 'missing_values': nan,
 'strategy': 'mean',
 'verbose': 0}

In [7]:
import scipy.sparse as sp

# Sparse input: here -1 (not NaN) is the marker for a missing entry, while the
# 0 in the first column is an ordinary value.
X = sp.csc_matrix([
    [1, 2],
    [0, -1],
    [8, 4],
])
imp = SimpleImputer(strategy='mean', missing_values=-1)
imp.fit(X)
SimpleImputer(missing_values=-1)

SimpleImputer(missing_values=-1)

In [10]:
# Sparse test matrix using the same -1 missing marker; densify the imputed
# result only for printing.
X_test = sp.csc_matrix([
    [-1, 2],
    [6, -1],
    [7, 6],
])
print(imp.transform(X_test).toarray())

[[3. 2.]
 [6. 3.]
 [7. 6.]]


In [11]:
import pandas as pd

# Categorical columns: with strategy="most_frequent" each missing entry is
# replaced by the most common value of its column.
df = pd.DataFrame(
    data=[
        ["a", "x"],
        [np.nan, "y"],
        ["a", np.nan],
        ["b", "y"],
    ],
    dtype="category",
)
imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df))

[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]


In [12]:
# 2. Multivariate feature imputation
import numpy as np
# Side-effect import: per the scikit-learn docs, IterativeImputer is
# experimental and has to be enabled explicitly before it can be imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=0, max_iter=10)
imp.fit([
    [1, 2],
    [3, 6],
    [4, 8],
    [np.nan, 3],
    [7, np.nan],
])
IterativeImputer(random_state=0)

IterativeImputer(random_state=0)

In [13]:
# Impute unseen rows with the fitted iterative imputer; values are rounded
# only to keep the printed result readable.
X_test = [
    [np.nan, 2],
    [6, np.nan],
    [np.nan, 6],
]
print(np.round(imp.transform(X_test)))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


In [14]:
# 3. Nearest neighbors imputation
import numpy as np
from sklearn.impute import KNNImputer

nan = np.nan
X = [
    [1, 2, nan],
    [3, 4, 4],
    [nan, 6, 5],
    [8, 8, 7],
]
# Each missing value is filled using the 2 nearest neighbours, all weighted
# equally ("uniform").
imputer = KNNImputer(weights="uniform", n_neighbors=2)
imputer.fit_transform(X)

array([[1. , 2. , 4.5],
       [3. , 4. , 4. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

In [15]:
# Marking imputed values
from sklearn.impute import MissingIndicator

# -1 is the missing-value marker in this array.
X = np.array([
    [-1, -1, 1, 3],
    [4, -1, 0, -1],
    [8, -1, 1, 0],
])
indicator = MissingIndicator(missing_values=-1)
# Boolean mask restricted to the features that contain missing values;
# True means the corresponding entry is a missing value.
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])

In [17]:
indicator.features_   # columns 0, 1 and 3 of X are the ones that contain missing values

array([0, 1, 3], dtype=int64)

In [19]:
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
# Blank out roughly half of the entries at random. The mask is drawn from a
# seeded generator (instead of the unseeded global np.random state) so that the
# same entries are removed on every Restart & Run All.
mask = np.random.default_rng(0).integers(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan
# An integer test_size is an absolute sample count: 100 rows are held out for
# testing and the remainder is used for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=0)

In [20]:
# The imputed features and the indicator columns have to be combined, because
# MissingIndicator only returns columns for features that contain missing
# values; features without missing values are not part of its output.
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator()),
    ]
)
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape

(100, 8)

In [22]:
# Chain the impute+indicator union with a decision tree. random_state is fixed
# because the tree's split search is randomized, so without it the fitted tree
# (and therefore the predictions) can differ between runs on the same data.
clf = make_pipeline(transformer, DecisionTreeClassifier(random_state=0))
clf = clf.fit(X_train, y_train)
# One predicted class label per test row.
results = clf.predict(X_test)
results.shape

(100,)