# Pandas数据操作

In [154]:
import numpy as np
import pandas as pd

## Series索引：行索引/切片索引/不连续索引/布尔索引

In [155]:
# A five-element Series holding 0..4, labelled 'a' through 'e'.
ser_obj = pd.Series(range(5), index=list('abcde'))
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [156]:
# Row indexing
# Label lookup: the index holds strings, so 'a' is matched as a label.
print(ser_obj['a'])
# Positional lookup goes through .iloc. Plain ser_obj[0] on a non-integer
# index depends on an integer-as-position fallback that pandas 2.x deprecates.
print(ser_obj.iloc[0])

0
0


In [157]:
# Slice indexing
# An integer slice selects by position and, like a Python slice, leaves out
# the stop; a label slice keeps both endpoints.
print(ser_obj[1:3]) # ending index not included
print(ser_obj['b':'d']) # ending index included

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [158]:
# Non-contiguous indexing
# A list of positions goes through .iloc. ser_obj[[0, 2, 4]] on a string index
# depends on the same deprecated integer-as-position fallback as ser_obj[0].
print(ser_obj.iloc[[0, 2, 4]])
# A list of labels can be passed to [] directly.
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [159]:
# Boolean indexing
# An element-wise comparison produces a boolean mask sharing ser_obj's index.
ser_bool = ser_obj.gt(2)
print(ser_bool)
# Indexing with the mask keeps only the entries where it is True.
print(ser_obj[ser_bool])

# The mask can also be written inline with the comparison operator.
print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## DataFrame索引 : 列索引/不连续索引


In [160]:
# Five rows of standard-normal samples in four labelled columns.
# numpy is imported once at the top of the notebook rather than in this cell.
# The samples are drawn without a seed, so the printed numbers differ per run.
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0  0.471919 -0.068049  1.644507 -0.232924
1 -0.812417  2.232788  0.787807  0.069922
2 -1.601271  2.323332  0.909282 -1.065243
3 -1.983324  0.382434 -2.562075 -1.209466
4 -1.570560 -0.866112 -0.679967 -1.463845


In [161]:
# Column indexing: a single column label
print(df_obj['a']) # returns a Series

# Non-contiguous indexing: a list of column labels yields a DataFrame
print(df_obj[['a','c']])

0    0.471919
1   -0.812417
2   -1.601271
3   -1.983324
4   -1.570560
Name: a, dtype: float64
          a         c
0  0.471919  1.644507
1 -0.812417  0.787807
2 -1.601271  0.909282
3 -1.983324 -2.562075
4 -1.570560 -0.679967


* 索引方式

 .loc/.iloc 

----
 
 In pandas, `.loc` and `.iloc` are the two explicit indexers for selecting rows and columns: `.loc` selects by label(s) or a Boolean array, while `.iloc` selects by integer position.

.loc[]: selects by **label**. Rows are addressed by their **index** labels (these can be integers or strings, depending on what the index holds — for example names or numbers), and columns are addressed by column name (an integer is treated as a label here, not as a column position).

.iloc[] : you can only use *integers* to do **position-based** indexing.

In [162]:
# Re-print both objects so the .loc/.iloc results below can be compared
# against the full data.
print(ser_obj)
print(df_obj)

a    0
b    1
c    2
d    3
e    4
dtype: int64
          a         b         c         d
0  0.471919 -0.068049  1.644507 -0.232924
1 -0.812417  2.232788  0.787807  0.069922
2 -1.601271  2.323332  0.909282 -1.065243
3 -1.983324  0.382434 -2.562075 -1.209466
4 -1.570560 -0.866112 -0.679967 -1.463845


In [163]:
# Label-based indexing with .loc

# Series: a label slice, both endpoints included
print(ser_obj.loc['b':'d']) # same as ser_obj['b':'d']

# DataFrame: rows are addressed by index label, columns by column name
print(df_obj.loc[:,'a'])# same as df_obj['a']
print(df_obj.loc[:,['a','b']])  # select first 2 columns by names using .loc[]
print(df_obj.loc[1:2]) # select the 2nd and the 3rd rows using index by .loc[]
print(df_obj.loc[0:2, 'a']) # row labels 0 through 2 (inclusive) of column 'a'

b    1
c    2
d    3
dtype: int64
0    0.471919
1   -0.812417
2   -1.601271
3   -1.983324
4   -1.570560
Name: a, dtype: float64
          a         b
0  0.471919 -0.068049
1 -0.812417  2.232788
2 -1.601271  2.323332
3 -1.983324  0.382434
4 -1.570560 -0.866112
          a         b         c         d
1 -0.812417  2.232788  0.787807  0.069922
2 -1.601271  2.323332  0.909282 -1.065243
0    0.471919
1   -0.812417
2   -1.601271
Name: a, dtype: float64


In [164]:
# Integer position-based indexing with .iloc (the stop position is excluded)
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[:,:2]) #select first 2 columns by using iloc[]
print(df_obj.iloc[1:3])# select the 2nd and the 3rd rows using index by .iloc[]
print(df_obj.iloc[:3, :1]) # first three rows of the first column, kept as a DataFrame

b    1
c    2
dtype: int64
          a         b
0  0.471919 -0.068049
1 -0.812417  2.232788
2 -1.601271  2.323332
3 -1.983324  0.382434
4 -1.570560 -0.866112
          a         b         c         d
1 -0.812417  2.232788  0.787807  0.069922
2 -1.601271  2.323332  0.909282 -1.065243
          a
0  0.471919
1 -0.812417
2 -1.601271


## 运算与对齐

* 按索引对齐运算，没对齐位置补NaN

In [165]:
# Two integer Series whose indexes only partly overlap:
# s1 is labelled 0-9, s2 only 0-4.
s1 = pd.Series(range(10, 20), index=range(10))
s2 = pd.Series(range(20, 25), index=range(5))

# Print each Series under its heading, separated by one blank line.
print('s1: ', s1, '', 's2: ', s2, sep='\n')

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int64


In [166]:
# Series arithmetic aligns on index labels: labels 5-9 exist only in s1, so
# the sum is NaN there and the result becomes float.
s1 + s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [167]:
# Two all-ones frames of different shapes: df1 is 2x2 (columns a, b) and
# df2 is 3x3 (columns a, b, c). numpy is already imported earlier in the
# notebook, so it is not imported again in this cell.
df1 = pd.DataFrame(np.ones((2, 2)), columns=['a', 'b'])
df2 = pd.DataFrame(np.ones((3, 3)), columns=['a', 'b', 'c'])

print('df1: ')
print(df1)

print('')
print('df2: ')
print(df2)

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [168]:
# DataFrame arithmetic aligns on both row and column labels; any cell that is
# missing from either operand comes out as NaN.
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


* 填充未对齐的数据进行运算

add,sub,mul,div

fill_value: 指定填充值

In [169]:
# Show both operands again before combining them.
print(s1)
print(s2)

# fill_value stands in for a label that is missing from one operand: labels
# 5-9 exist only in s1, so the result there is s1 + (-1) instead of NaN.
s1.add(s2, fill_value = -1) 

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
0    20
1    21
2    22
3    23
4    24
dtype: int64


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [170]:
# Cells that df1 lacks (row 2 and column c) are treated as 2.0 before df2 is
# subtracted, so they come out as 2.0 - 1.0 = 1.0 rather than NaN.
df1.sub(df2, fill_value = 2.)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


填充NaN

In [171]:
# Keep the aligned sum; its NaN entries (labels 5-9) are filled in the next cell.
s3 = s1 + s2
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [172]:
# fillna returns a new Series with every NaN replaced by -1.
s3_filled = s3.fillna(-1)
print(s3_filled)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [173]:
# Keep the aligned DataFrame sum; its unmatched cells are NaN.
df3 = df1 + df2
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [174]:
# inplace=True replaces the NaN cells inside df3 itself instead of returning a
# new frame, so df3 no longer contains any NaN after this cell has run.
df3.fillna(100, inplace = True)
print(df3)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


## 函数应用

In [175]:
# NumPy ufuncs
# Standard-normal samples shifted down by 1 (unseeded, so values differ per run).
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df)) # element-wise absolute value

          0         1         2         3
0 -2.067711 -0.581137 -1.263728 -0.320676
1 -0.331852 -1.441290 -1.841251 -1.110321
2  0.214902  0.839446 -2.815138 -1.478594
3 -0.837377 -1.799119 -1.885945 -1.385439
4 -1.593498 -1.879602 -0.446649  0.528542
          0         1         2         3
0  2.067711  0.581137  1.263728  0.320676
1  0.331852  1.441290  1.841251  1.110321
2  0.214902  0.839446  2.815138  1.478594
3  0.837377  1.799119  1.885945  1.385439
4  1.593498  1.879602  0.446649  0.528542


**使用apply应用行或列数据**

In [176]:
# Equivalent one-liner: f = lambda x: x.max()
def f(x):
    """Return the maximum of ``x`` by delegating to its ``max()`` method."""
    largest = x.max()
    return largest

# With the default axis=0, apply hands each column to f, giving one maximum
# per column.
print(df.apply(f))

0    0.214902
1    0.839446
2   -0.446649
3    0.528542
dtype: float64


In [177]:
# Choose the axis explicitly (the default, axis=0, works column by column);
# axis=1 hands each row to the function instead.
print(df.apply(lambda x : x.max(), axis=1))

0   -0.320676
1   -0.331852
2    0.839446
3   -0.837377
4    0.528542
dtype: float64


**使用applymap应用到每个数据**

In [178]:
# Apply a function to every individual cell.
# f2 renders one value as a string with two decimal places.
f2 = lambda x : '%.2f' % x
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated, so use map when it exists and fall back to applymap on
# older installations.
if hasattr(pd.DataFrame, 'map'):
    print(df.map(f2))
else:
    print(df.applymap(f2))

       0      1      2      3
0  -2.07  -0.58  -1.26  -0.32
1  -0.33  -1.44  -1.84  -1.11
2   0.21   0.84  -2.82  -1.48
3  -0.84  -1.80  -1.89  -1.39
4  -1.59  -1.88  -0.45   0.53


## 排序

In [179]:
# Values 10..14 under five random integer labels drawn from [0, 5); labels
# may repeat, which gives the index sort below something to reorder.
s4 = pd.Series(
    range(10, 15),
    index=np.random.randint(0, 5, size=5),
)
print(s4)

1    10
2    11
2    12
2    13
1    14
dtype: int64


In [180]:
# Sort by index label, largest label first; the values travel with their labels.
s4.sort_index(ascending=False)

2    11
2    12
2    13
1    10
1    14
dtype: int64

In [181]:
# A 3x4 frame of standard-normal samples with shuffled integer labels, so the
# sorting cells below have something to reorder.
# Row labels are independent random draws and may repeat. Column labels are a
# random permutation of 0..3 instead: the later sort_values(by=0) needs a
# column labelled 0 to exist exactly once, which independent draws from
# randint do not guarantee (0 could be absent or duplicated).
df4 = pd.DataFrame(np.random.randn(3, 4),
                   index=np.random.randint(3, size=3),
                   columns=np.random.permutation(4))
print(df4)

          3         0         2         1
2  0.985574  0.159803 -0.089989 -2.923175
0 -0.930998 -0.770621 -1.252531 -0.682244
0  1.115093  0.594152 -0.604414 -0.330863


In [182]:
# Alternative: df4.sort_index(ascending=False) sorts the rows by index label,
# largest first.
# axis=1 sorts the columns by their labels instead of the rows.
df4.sort_index(axis=1)

Unnamed: 0,0,1,2,3
2,0.159803,-2.923175,-0.089989,0.985574
0,-0.770621,-0.682244,-1.252531,-0.930998
0,0.594152,-0.330863,-0.604414,1.115093


In [183]:
# Sort by value: order the rows by the values in the column labelled 0.
df4.sort_values(by=0)

Unnamed: 0,3,0,2,1
0,-0.930998,-0.770621,-1.252531,-0.682244
2,0.985574,0.159803,-0.089989,-2.923175
0,1.115093,0.594152,-0.604414,-0.330863


## 处理缺失数据

In [184]:
# Four rows by three columns: one fully populated random row followed by
# three rows with NaN gaps, used by the missing-data cells below.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1.0, np.nan, np.nan],
    [4.0, np.nan, np.nan],
    [1.0, np.nan, 2.0],
])
df_data.head()

Unnamed: 0,0,1,2
0,-1.45959,-0.576546,-1.206868
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [185]:
# isnull: a same-shaped boolean frame that is True wherever a value is missing
df_data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [186]:
# dropna: with default arguments, drops every row that contains a NaN
df_data.dropna()
# Alternative: df_data.dropna(axis=1) drops the columns containing NaN instead

Unnamed: 0,0,1,2
0,-1.45959,-0.576546,-1.206868


In [187]:
# fillna: returns a copy in which every NaN is replaced by -100.0
df_data.fillna(-100.)

Unnamed: 0,0,1,2
0,-1.45959,-0.576546,-1.206868
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0
