# Pandas 数据操作

In [3]:
import pandas as pd

## 索引

### Series 索引

In [5]:
ser_obj = pd.Series(range(5),index = ['a', 'b', 'c', 'd', 'e'])
ser_obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [6]:
# 行索引
print(ser_obj['a'])
print(ser_obj[0])

0
0


In [8]:
# 切片索引
print(ser_obj[1:3])
print(ser_obj['b':'d'])

b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32


In [11]:
# 不连续索引
print(ser_obj[[0,2,4]])
print(ser_obj[['a','e']])

a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32


In [14]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32


### DataFrame索引

In [2]:
import numpy as np

df_obj = pd.DataFrame(np.random.randn(5,4),columns=['a', 'b', 'c', 'd'])
df_obj

Unnamed: 0,a,b,c,d
0,0.589972,1.364998,-0.530578,1.53094
1,-0.65286,0.198565,2.434259,-0.952695
2,0.138454,-0.72037,1.766686,-0.073915
3,0.781826,-0.093549,-0.99688,-0.326443
4,-0.166126,1.85063,0.485794,-0.312889


In [10]:
# 列索引
print('列索引')
print(df_obj['a']) 
print(type(df_obj['a'])) # 返回Series类型
print(type(df_obj[['a']])) # 返回DataFrame类型,多列即为 DataFrame 类型

# 不连续索引
print('不连续索引')
print(df_obj[['a','c']])

列索引
0   -2.019668
1   -0.296249
2    1.174541
3   -0.392927
4   -0.465501
Name: a, dtype: float64
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
不连续索引
          a         c
0 -2.019668  0.129727
1 -0.296249  1.208930
2  1.174541 -0.020294
3 -0.392927 -0.471966
4 -0.465501  0.587892


### 三种索引方式
- 标签索引
- 位置索引
- 混合索引（过时）

In [6]:
# 标签索引 loc
# Series
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])

b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32
0    0.589972
1   -0.652860
2    0.138454
3    0.781826
4   -0.166126
Name: a, dtype: float64
0    0.589972
1   -0.652860
2    0.138454
Name: a, dtype: float64


In [10]:
# 整型位置索引 iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2,0])

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0    0.589972
1   -0.652860
Name: a, dtype: float64


In [11]:
# 混合索引 ix (已过时，不推荐使用)
print(ser_obj.ix[1:3])
print(ser_obj.ix['b':'c'])

# DataFrame
print(df_obj.ix[0:2, 0]) # 先按标签索引尝试操作，然后再按位置索引尝试操作

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0    0.589972
1   -0.652860
2    0.138454
Name: a, dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


## 运算与对齐

In [13]:
s1 = pd.Series(range(10,20),index = range(10))
s2 = pd.Series(range(20,25),index = range(5))

print('s1: ' )
print(s1)

print('')

print('s2: ' )
print(s2)

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32


In [14]:
# Series 对齐运算
s1 + s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [18]:
df1 = pd.DataFrame(np.ones((2,2)),columns=['a','b'])
df2 = pd.DataFrame(np.ones((3,3)),columns=['a','b','c'])

print('df1: ')
print(df1)

print('') 
print('df2: ')
print(df2)

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [19]:
# DataFrame对齐操作
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [20]:
# 填充未对齐的数据进行运算
print(s1)
print(s2)

s1.add(s2,fill_value=-1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [22]:
print(df1)
print(df2)
df1.sub(df2,fill_value=2.)

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [23]:
# 填充NaN
s3 = s1 + s2
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [25]:
s3_filled = s3.fillna(-1)
s3_filled

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64

In [29]:
df3 = df1 + df2
print(df3)
df3.fillna(100, inplace = True)
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


## 函数应用
- Numpy ufunc 函数
- pandas apply 应用某一行或列
- pandas applymap 应用所有数据

In [30]:
# Numpy ufunc 函数
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df))

          0         1         2         3
0 -3.016803 -1.694230 -0.940270 -0.343594
1  0.844013 -1.085120 -0.358548  0.077132
2 -1.127538 -1.704492 -0.526820  0.242160
3 -0.114008 -1.194735 -1.578213 -0.483163
4 -2.184957  1.041388 -4.103370 -1.259968
          0         1         2         3
0  3.016803  1.694230  0.940270  0.343594
1  0.844013  1.085120  0.358548  0.077132
2  1.127538  1.704492  0.526820  0.242160
3  0.114008  1.194735  1.578213  0.483163
4  2.184957  1.041388  4.103370  1.259968


In [31]:
# 使用apply应用行或列数据
print(df.apply(lambda x : x.max()))

0    0.844013
1    1.041388
2   -0.358548
3    0.242160
dtype: float64


In [32]:
# 指定轴方向
print(df.apply(lambda x : x.max(),axis=1))

0   -0.343594
1    0.844013
2    0.242160
3   -0.114008
4    1.041388
dtype: float64


In [35]:
# 使用applymap应用到每个数据  applymap
f2 = lambda x : '%.2f' %x
print(df.applymap(f2))

       0      1      2      3
0  -3.02  -1.69  -0.94  -0.34
1   0.84  -1.09  -0.36   0.08
2  -1.13  -1.70  -0.53   0.24
3  -0.11  -1.19  -1.58  -0.48
4  -2.18   1.04  -4.10  -1.26


## 排序

In [37]:
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
print(s4)

0    10
2    11
2    12
4    13
3    14
dtype: int32


In [38]:
# 索引排序
s4.sort_index()

0    10
2    11
2    12
3    14
4    13
dtype: int32

In [45]:
df4 = pd.DataFrame(np.random.randn(3, 4), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(4, size=4))
print(df4)

          1         1         1         2
1  0.053642  0.947312 -1.472311 -2.266253
1 -0.886146 -1.388884  0.669723  0.087012
0  0.475687  0.136118  1.858407  2.081164


In [46]:
df4.sort_index(axis=1)

Unnamed: 0,1,1.1,1.2,2
1,0.053642,0.947312,-1.472311,-2.266253
1,-0.886146,-1.388884,0.669723,0.087012
0,0.475687,0.136118,1.858407,2.081164


In [47]:
# 按值排序
df4.sort_values(by=2)

Unnamed: 0,1,1.1,1.2,2
1,0.053642,0.947312,-1.472311,-2.266253
1,-0.886146,-1.388884,0.669723,0.087012
0,0.475687,0.136118,1.858407,2.081164


## 处理缺失数据
- 判断是否为空  isnull
- 填充 fillna

In [6]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()

Unnamed: 0,0,1,2
0,0.080861,1.347422,0.454648
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [7]:
# isnull
df_data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [8]:
# dropna
df_data.dropna()

Unnamed: 0,0,1,2
0,0.080861,1.347422,0.454648


In [9]:
df_data.dropna(axis=1)

Unnamed: 0,0
0,0.080861
1,1.0
2,4.0
3,1.0


In [10]:
# fillna
df_data.fillna(-100.)

Unnamed: 0,0,1,2
0,0.080861,1.347422,0.454648
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0


## 统计和描述


### 常用的统计计算

In [11]:
df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
df_obj

Unnamed: 0,a,b,c,d
0,1.346363,0.194264,0.039576,-1.298689
1,0.89432,-0.072229,1.489604,0.015699
2,0.810275,-0.244092,1.506325,-0.246808
3,-1.137869,0.307916,-0.367691,-0.562391
4,0.747632,0.892491,-0.214023,-0.950077


In [12]:
df_obj.sum()

a    2.660720
b    1.078349
c    2.453792
d   -3.042266
dtype: float64

In [13]:
df_obj.max()

a    1.346363
b    0.892491
c    1.506325
d    0.015699
dtype: float64

In [14]:
df_obj.min(axis=1)

0   -1.298689
1   -0.072229
2   -0.246808
3   -1.137869
4   -0.950077
dtype: float64

### 统计描述

In [16]:
df_obj.describe()

Unnamed: 0,a,b,c,d
count,5.0,5.0,5.0,5.0
mean,0.532144,0.21567,0.490758,-0.608453
std,0.96266,0.436269,0.930898,0.528112
min,-1.137869,-0.244092,-0.367691,-1.298689
25%,0.747632,-0.072229,-0.214023,-0.950077
50%,0.810275,0.194264,0.039576,-0.562391
75%,0.89432,0.307916,1.489604,-0.246808
max,1.346363,0.892491,1.506325,0.015699
