In [1]:
import pandas as pd
import numpy  as np
# Build a small demo table whose 'gender' and 'age' columns deliberately
# contain missing entries (NaN), then display it.
df = pd.DataFrame(
    [
        ['frank', 'M', np.nan],
        ['mary', np.nan, np.nan],
        ['tom', 'M', 35],
        ['ted', 'M', 33],
        ['jean', np.nan, 21],
        ['lisa', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [3]:
# Flag missing values element-wise: True marks a missing 'gender' entry.
df['gender'].isna()

0    False
1     True
2    False
3    False
4     True
5    False
Name: gender, dtype: bool

In [4]:
# Inverse check: True marks a 'gender' entry that is present.
df['gender'].notna()

0     True
1    False
2     True
3     True
4    False
5     True
Name: gender, dtype: bool

In [8]:
# Does the 'age' column contain at least one missing value?
df['age'].isna().values.any()

True

In [9]:
# Per column: True if that column contains at least one missing value.
df.isna().any(axis=0)

name      False
gender     True
age        True
dtype: bool

In [12]:
# Count the missing values in 'age' (each True in the mask counts as 1).
df['age'].isna().sum()

2

In [15]:
# Number of missing values in each column.
df.isna().sum(axis=0)

name      0
gender    2
age       2
dtype: int64

In [16]:
# Total number of missing values in the whole frame:
# the first sum() counts per column, the second adds those counts together.
df.isna().sum().sum()

4

# 补齐缺失值

In [17]:
import pandas as pd
import numpy  as np
# Rebuild the demo table; this time row 1 is missing in every column,
# which the row-dropping examples below rely on.
df = pd.DataFrame(
    [
        ['frank', 'M', np.nan],
        [np.nan, np.nan, np.nan],
        ['tom', 'M', 35],
        ['ted', 'M', 33],
        ['jean', np.nan, 21],
        ['lisa', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df

Unnamed: 0,name,gender,age
0,frank,M,
1,,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [18]:
# Drop every row that has at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,name,gender,age
2,tom,M,35.0
3,ted,M,33.0
5,lisa,F,20.0


In [19]:
# Drop only the rows in which every value is missing.
df.dropna(axis=0, how='all')

Unnamed: 0,name,gender,age
0,frank,M,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [21]:
# IPython help query (not plain Python): shows the documentation of DataFrame.dropna.
?df.dropna

In [22]:
# Keep only rows with at least 2 non-missing values. With the 3 columns here,
# that discards any row with 2 or more missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,name,gender,age
0,frank,M,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [24]:
# Dropping columns with missing values: first add an 'employee' column that is
# entirely NaN so there is a column to drop, then display the frame.
df = df.assign(employee=np.nan)
df

Unnamed: 0,name,gender,age,employee
0,frank,M,,
1,,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


In [25]:
# Drop the columns (axis=1) in which every value is missing.
df.dropna(axis=1, how='all')

Unnamed: 0,name,gender,age
0,frank,M,
1,,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [26]:
# Filling in missing values: display the current frame first for reference.
df

Unnamed: 0,name,gender,age,employee
0,frank,M,,
1,,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


In [27]:
# Replace every missing value, in every column, with the constant 0.
df.fillna(value=0)

Unnamed: 0,name,gender,age,employee
0,frank,M,0.0,0.0
1,0,0,0.0,0.0
2,tom,M,35.0,0.0
3,ted,M,33.0,0.0
4,jean,0,21.0,0.0
5,lisa,F,20.0,0.0


In [28]:
# Mean of the non-missing ages (NaN entries are skipped); used below as a fill value.
df['age'].mean(skipna=True)

27.25

In [29]:
# Fill missing ages with the overall mean age; returns a new Series, df is unchanged.
df['age'].fillna(value=df['age'].mean())

0    27.25
1    27.25
2    35.00
3    33.00
4    21.00
5    20.00
Name: age, dtype: float64

In [34]:
# Mean age per gender group, broadcast back to one value per original row.
# Rows whose gender is missing get NaN, as the output below shows.
df.groupby(by='gender')['age'].transform('mean')

0    34.0
1     NaN
2    34.0
3    34.0
4     NaN
5    20.0
Name: age, dtype: float64

In [37]:
# Fill missing ages with the mean age of the row's gender group.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['age']: an in-place call on a selected
# column acts on a temporary object, which triggers a chained-assignment
# warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['age'] = df['age'].fillna(df.groupby('gender')['age'].transform('mean'))

In [38]:
# Display the frame after filling 'age' with the per-gender mean.
df

Unnamed: 0,name,gender,age,employee
0,frank,M,34.0,
1,,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


In [39]:
# Forward fill: propagate the last valid value downwards into missing cells.
# DataFrame.ffill() replaces fillna(method='pad'), whose `method` keyword is
# deprecated in pandas 2.1 and removed in pandas 3.0.
df.ffill()

Unnamed: 0,name,gender,age,employee
0,frank,M,34.0,
1,frank,M,34.0,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,M,21.0,
5,lisa,F,20.0,


In [41]:
# Backward fill: pull the next valid value upwards into missing cells,
# filling at most 2 consecutive missing values per gap (limit=2).
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in pandas 2.1 and removed in pandas 3.0.
df.bfill(limit=2)

Unnamed: 0,name,gender,age,employee
0,frank,M,34.0,
1,tom,M,35.0,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,F,21.0,
5,lisa,F,20.0,


In [42]:
# Interpolation
# Useful when the missing values follow a roughly linear trend.
# https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
df2 = pd.DataFrame({
    'time': [1, 2, np.nan, 4, 5, 6],
    'val': [870, 900, np.nan, 950, 1080, 1200],
})
df2

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,,
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0


In [43]:
# Linearly interpolate the missing row from its neighbours (the default method, spelled out).
df2.interpolate(method='linear')

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,3.0,925.0
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0
