### 利用sklearn自带的方法进行缺失值的处理 

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv(r"Narrativedata.csv",index_col = 0)
# Preview the first rows of the frame.
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [2]:
# missing_values: tells SimpleImputer what a missing value looks like in the data; defaults to np.nan
# strategy: one of "mean", "median", "most_frequent", "constant"
# copy: defaults to True, which imputes on a copy of the feature matrix;
#       otherwise the missing values are filled into the original feature matrix
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         714 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [10]:
# Feature matrices passed to sklearn must be two-dimensional,
# so the Age column is reshaped into a single-column 2-D array.
Age = data["Age"].values.reshape(-1, 1)
Age[:10]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.]])

In [15]:
from sklearn.impute import SimpleImputer
# Fill the missing ages three different ways. Each name ends up bound to the
# imputed array returned by fit_transform, not to the imputer object itself.
imp_mean = SimpleImputer().fit_transform(Age)  # default strategy: mean
imp_median = SimpleImputer(strategy="median").fit_transform(Age)
imp_0 = SimpleImputer(strategy="constant", fill_value=0).fit_transform(Age)

# Inspect the first 20 rows of the zero-filled result.
imp_0[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [ 0.],
       [31.],
       [ 0.]])

In [16]:
# Use the median-imputed values to fill the Age column.
# NOTE(review): imp_median is the 2-D result of fit_transform on the reshaped
# Age array; consider flattening it (e.g. .ravel()) if a pandas version rejects
# assigning a column vector here — TODO confirm.
data.loc[:,"Age"] = imp_median
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [17]:
# Fill the missing Embarked values with the most frequent category (the mode).
imp_mode = SimpleImputer(strategy="most_frequent")
Embarked = data["Embarked"].values.reshape(-1, 1)  # sklearn expects a 2-D input
data.loc[:, "Embarked"] = imp_mode.fit(Embarked).transform(Embarked)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
