In [1]:
import pandas as pd
import numpy as np

# Sample data: one inner dict per column, mapping year -> value.
# Years are listed in ascending order in every column so the resulting index
# is 2015..2018 regardless of whether pandas sorts or preserves key order.
# Missing values are written uniformly as np.nan.
data = {
    'b': {2015: 24, 2016: np.nan, 2017: 17, 2018: 33},
    'a': {2015: 11, 2016: 19, 2017: np.nan, 2018: 32},
    'c': {2015: 11, 2016: 19, 2017: 18, 2018: 22},
}
df = pd.DataFrame(data)
print(df)

         b     a   c
2015  24.0  11.0  11
2016   NaN  19.0  19
2017  17.0   NaN  18
2018  33.0  32.0  22


In [2]:
# Detect missing values
null_mask = df.isnull()             # element-wise True where a value is missing
print(null_mask)
a_null_mask = df['a'].isnull()      # the same check restricted to column 'a'
print(a_null_mask)
print('------')
print(a_null_mask.value_counts())   # count of missing (True) vs. present (False) entries in column 'a'

          b      a      c
2015  False  False  False
2016   True  False  False
2017  False   True  False
2018  False  False  False
2015    False
2016    False
2017     True
2018    False
Name: a, dtype: bool
------
False    3
True     1
Name: a, dtype: int64


In [3]:
# Drop missing values. Each call returns a new frame and leaves df itself
# unchanged (pass inplace=True to modify df in place instead).
complete_rows = df.dropna(axis=0, how='any')    # same as df.dropna(): drop rows containing any missing value
print(complete_rows)
non_empty_rows = df.dropna(axis=0, how='all')   # drop only rows in which every value is missing
print(non_empty_rows)
non_empty_cols = df.dropna(axis=1, how='all')   # drop only columns in which every value is missing
print(non_empty_cols)

         b     a   c
2015  24.0  11.0  11
2018  33.0  32.0  22
         b     a   c
2015  24.0  11.0  11
2016   NaN  19.0  19
2017  17.0   NaN  18
2018  33.0  32.0  22
         b     a   c
2015  24.0  11.0  11
2016   NaN  19.0  19
2017  17.0   NaN  18
2018  33.0  32.0  22


In [4]:
# Fill missing values
print(df.fillna(value=0))                   # replace every missing value with a constant
print(df['a'].fillna(df['a'].mean()))       # replace missing values with a computed statistic (column mean)
# Forward fill: propagate the last valid value downwards. Series.ffill() is
# used instead of fillna(method="ffill"), which is deprecated in recent pandas.
# Use .bfill() for backward fill.
print(df['a'].ffill())

         b     a   c
2015  24.0  11.0  11
2016   0.0  19.0  19
2017  17.0   0.0  18
2018  33.0  32.0  22
2015    11.000000
2016    19.000000
2017    20.666667
2018    32.000000
Name: a, dtype: float64
2015    11.0
2016    19.0
2017    19.0
2018    32.0
Name: a, dtype: float64


In [5]:
# Convert column 'a' to strings.
# NOTE: in the recorded run this turns the missing value into the literal
# string 'nan'; such a string is no longer detected by isnull().
df['a'] = df['a'].astype('str')
print(df.dtypes)
df['a'].str.strip()         # strip surrounding whitespace via the vectorized .str accessor
# Convert to upper case: df['a'].str.upper()

b    float64
a     object
c      int64
dtype: object


2015    11.0
2016    19.0
2017     nan
2018    32.0
Name: a, dtype: object

In [6]:
# Rename columns
column_renames = {'a': 'aa'}                    # old name -> new name
renamed = df.rename(columns=column_renames)     # returns a new frame; add inplace=True to modify df itself
print(renamed)
# Set the index:   df.set_index('a')    uses column 'a' as the index
# Reset the index: df.reset_index()

         b    aa   c
2015  24.0  11.0  11
2016   NaN  19.0  19
2017  17.0   nan  18
2018  33.0  32.0  22


In [7]:
# dup = df.duplicated(subset=['a', 'b'], keep='first')  # boolean mask flagging rows that repeat an earlier (a, b) pair
# dup.value_counts()    # number of duplicated vs. non-duplicated rows
# df[df.duplicated(subset=['a', 'b'], keep='first')]   # select only the duplicated rows
# df.drop_duplicates(subset=['a', 'b'], keep='first')   # drop duplicates, keeping the first occurrence

In [8]:
# Replace values in column 'b'
b_replacements = {24: 42, 17: 71}       # old value -> new value
df['b'] = df['b'].replace(b_replacements)
print(df)

         b     a   c
2015  42.0  11.0  11
2016   NaN  19.0  19
2017  71.0   nan  18
2018  33.0  32.0  22
