<img src='slides/5.png'>

In [1]:
import numpy as np
import pandas as pd

# Load the CSV into a DataFrame and preview its first rows
df = pd.read_csv('data/14_2_cleansing.csv')
df.head()

Unnamed: 0,height,weight,age
0,180,65,18
1,175,68,20
2,170,55,25
3,165,65,30
4,170,70,35


In [2]:
# Display the summary statistics of the DataFrame
summary_stats = df.describe()
summary_stats

Unnamed: 0,height,weight,age
count,16.0,16.0,16.0
mean,775.5625,682.0,67.4375
std,2459.967424,2484.608836,37.209262
min,0.0,0.0,18.0
25%,165.0,55.0,28.75
50%,175.0,66.5,99.0
75%,180.0,71.25,99.0
max,9999.0,9999.0,99.0


min, max の値が不自然（height, weight の 0 や 9999、age の 99 など）であり、その結果 mean, std もおかしな値になっている

In [3]:
# For every column, print how many rows hold each distinct value
for column_name in df.columns:
    value_counts = df.groupby(column_name).size()
    print(value_counts, '\n')

height
0       1
160     2
165     2
170     2
175     4
180     4
9999    1
dtype: int64 

weight
0       1
45      1
50      1
55      2
60      1
65      2
68      1
70      3
75      1
80      1
85      1
9999    1
dtype: int64 

age
18    1
20    1
25    2
30    1
35    2
99    9
dtype: int64 



In [4]:
# When the range of values is too wide to read comfortably, print only the
# first three and the last three entries of each column's value counts
for column_name in df.columns:
    value_counts = df.groupby(column_name).size()
    print(value_counts.head(3), '\n')
    print(value_counts.tail(3), '\n')

height
0      1
160    2
165    2
dtype: int64 

height
175     4
180     4
9999    1
dtype: int64 

weight
0     1
45    1
50    1
dtype: int64 

weight
80      1
85      1
9999    1
dtype: int64 

age
18    1
20    1
25    2
dtype: int64 

age
30    1
35    2
99    9
dtype: int64 



In [5]:
# Replace the placeholder values that stand for "missing" with NaN.
# Each column maps to the codes that are treated as missing in that column.
missing_codes = {
    'height': [0, 9999],
    'weight': [0, 9999],
    'age': [99],
}
for column_name, codes in missing_codes.items():
    df[column_name] = df[column_name].replace(codes, np.nan)

# Check the result
df

Unnamed: 0,height,weight,age
0,180.0,65.0,18.0
1,175.0,68.0,20.0
2,170.0,55.0,25.0
3,165.0,65.0,30.0
4,170.0,70.0,35.0
5,,,25.0
6,,,35.0
7,165.0,45.0,
8,160.0,55.0,
9,180.0,85.0,


<img src='slides/7.png'>

In [6]:
# Build a small 1-D NumPy array and confirm that dividing it by zero
# yields nan / inf rather than raising an exception
a = np.arange(5)

# Show the values of a
print(a)

# Show the values of a/0
# - 0/0 -> nan
# - 1/0 -> inf
# The division by zero is deliberate, so NumPy's "divide by zero" and
# "invalid value" RuntimeWarnings are silenced for this one expression only.
with np.errstate(divide='ignore', invalid='ignore'):
    quotient = a / 0
print(quotient)

[0 1 2 3 4]
[nan inf inf inf inf]


  print(a/0)
  print(a/0)


### 欠損値の有無の確認

- isna(): 欠損値の場合にTrue
- notnull(): 欠損値の場合にFalse（欠損値ではない場合にTrue）

In [7]:
# Count the missing values in each column
is_missing = df.isna()
is_missing.sum()

height    2
weight    2
age       9
dtype: int64

In [8]:
# Count the values that are not missing in each column
is_present = df.notnull()
is_present.sum()

height    14
weight    14
age        7
dtype: int64

<img src='slides/11.png'>

### 欠損値のある行の除去（リストワイズ法）

- dropna()

In [9]:
# DataFrame with every row that contains at least one NaN removed
# (axis=0 / how='any' are the defaults, written out here for clarity)
df.dropna(axis=0, how='any')

Unnamed: 0,height,weight,age
0,180.0,65.0,18.0
1,175.0,68.0,20.0
2,170.0,55.0,25.0
3,165.0,65.0,30.0
4,170.0,70.0,35.0


In [10]:
# DataFrame with the rows whose 'age' value is NaN removed;
# NaN in the other columns is kept
required_columns = ['age']
df.dropna(subset=required_columns)

Unnamed: 0,height,weight,age
0,180.0,65.0,18.0
1,175.0,68.0,20.0
2,170.0,55.0,25.0
3,165.0,65.0,30.0
4,170.0,70.0,35.0
5,,,25.0
6,,,35.0


### 欠損値の置き換え（代入法）

- fillna()

In [11]:
# Substitute -1 for every NaN
df.fillna(value=-1)

Unnamed: 0,height,weight,age
0,180.0,65.0,18.0
1,175.0,68.0,20.0
2,170.0,55.0,25.0
3,165.0,65.0,30.0
4,170.0,70.0,35.0
5,-1.0,-1.0,25.0
6,-1.0,-1.0,35.0
7,165.0,45.0,-1.0
8,160.0,55.0,-1.0
9,180.0,85.0,-1.0


In [12]:
# Substitute each column's mean for its NaN values (mean imputation)
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,height,weight,age
0,180.0,65.0,18.0
1,175.0,68.0,20.0
2,170.0,55.0,25.0
3,165.0,65.0,30.0
4,170.0,70.0,35.0
5,172.142857,65.214286,25.0
6,172.142857,65.214286,35.0
7,165.0,45.0,26.857143
8,160.0,55.0,26.857143
9,180.0,85.0,26.857143
