# 欠損値の補完

sklearn.impute.IterativeImputer を用いて欠損値を補完する。

- https://scikit-learn.org/stable/modules/impute.html
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
  - 他の特徴量を引数とするモデルによる予測値で欠損値を補完

### 欠損値を含むデータセット

https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

- 第5回の回帰分析で使用したデータセット
- 欠損値は '?'

In [1]:
import pandas as pd
import numpy as np
autos = pd.read_csv('data/imports-85.data', na_values='?')
autos.columns = ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
                 'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
                 'wheel_base', 'length','width', 'height', 'curb_weight',
                 'engine_type', 'num_of_cylinders', 'engine_size', 'fuel_system',
                 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
                 'city_mpg', 'highway_mpg','price']
autos.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
1,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
2,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
3,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
4,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0


In [2]:
# 欠損値の数の確認
autos.isna().sum()

symboling             0
normalized_losses    40
make                  0
fuel_type             0
aspiration            0
num_of_doors          2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [3]:
# num_of_doors列の欠損値を確認
# - IterativeImputer はカテゴリカル変数は処理できないため、最頻値を代入することにする
autos[autos.num_of_doors.isna()]

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
26,1,148.0,dodge,gas,turbo,,sedan,fwd,front,93.7,...,98,mpfi,3.03,3.39,7.6,102.0,5500.0,24,30,8558.0
62,0,,mazda,diesel,std,,sedan,fwd,front,98.8,...,122,idi,3.39,3.39,22.7,64.0,4650.0,36,42,10795.0


In [4]:
# num_of_doors列の最頻値
autos['num_of_doors'].mode()

0    four
dtype: object

In [5]:
# DataFrameを作業用の変数にコピー
df = autos

# num_of_doors列の欠損値を最頻値に置き換える
# - autos['num_of_doors'].mode()[0] となっている（[0]）が付いているのは最頻値は複数返ることがあるため
df['num_of_doors'] = autos['num_of_doors'].fillna(autos['num_of_doors'].mode()[0])

In [9]:
# num_of_doors列の欠損値がないことを確認
autos.num_of_doors.isna().sum()
df.isna().sum()

symboling             0
normalized_losses    40
make                  0
fuel_type             0
aspiration            0
num_of_doors          0
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ダミー変数化
df = pd.get_dummies(data=df, drop_first=True)

# IterativeImputerを初期化、欠損値を補完
imputer = IterativeImputer()
imputed = imputer.fit_transform(df)

# 結果を確認
df_imputed = pd.DataFrame(imputed, columns=df.columns, index=df.index)
df_imputed.isna().sum()

symboling            0
normalized_losses    0
wheel_base           0
length               0
width                0
                    ..
fuel_system_idi      0
fuel_system_mfi      0
fuel_system_mpfi     0
fuel_system_spdi     0
fuel_system_spfi     0
Length: 66, dtype: int64

In [11]:
# 欠損値の数の合計
df_imputed.isna().sum().sum()

0