# 3 Pandas的索引操作

In [1]:
import pandas as pd
import numpy as np

In [2]:
dict_data = {'A': 1,
             'B': pd.Timestamp('20190926'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([1,2,3,4],dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'wangdao' }
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.index)

Index([0, 1, 2, 3], dtype='int64')


In [3]:
# 索引对象的值不可变（上面代码增加）
# df_obj2.index[0] = 2

## 常见的Index种类
- Index，索引  可以是各种类型
- Int64Index，整数索引
- MultiIndex，层级索引，难度较大
- DatetimeIndex，时间戳类型

In [4]:
ser_obj = pd.Series(range(5), index = list("abcde"))
print(ser_obj)
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='str')

In [5]:
# 行索引，不仅可以用索引名，可以用索引位置或来取, 不规范, 已过时
print(ser_obj['b']) #索引名
print(ser_obj[2]) #位置索引

1


KeyError: 2

In [6]:
print(ser_obj.loc['b']) #索引名
print(ser_obj.iloc[2]) #位置索引

1
2


In [8]:
# 切片索引
print(ser_obj.iloc[1:3])  #索引位置取数据，左闭右开
print(ser_obj.loc['b':'d'])  #记住索引名  左闭右闭

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [8]:
# 不连续索引
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj.loc[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [9]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_obj)
print(ser_bool)


a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool


In [10]:
print('-'*50)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2]) #取出大于2的元素

--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## DataFrame索引

In [11]:
import numpy as np
df_obj = pd.DataFrame(np.random.randn(5,4),columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0 -0.968590  0.830292  0.034278 -1.164383
1  0.992921 -0.241490 -0.742293 -0.621847
2  1.483033 -0.479704  1.092763 -0.200764
3  0.162693  0.815777 -0.016545  0.111612
4 -1.207179  0.317434 -0.727643  1.277512


In [12]:
# 列索引
print(df_obj['a']) # 返回Series类型
print('-'*50)
print(df_obj[['a']]) # 返回DataFrame类型
print('-'*50)
print(type(df_obj[['a']])) # 返回DataFrame类型

0   -0.968590
1    0.992921
2    1.483033
3    0.162693
4   -1.207179
Name: a, dtype: float64
--------------------------------------------------
          a
0 -0.968590
1  0.992921
2  1.483033
3  0.162693
4 -1.207179
--------------------------------------------------
<class 'pandas.DataFrame'>


1. loc 标签索引(通过索引标签值获取数据)

In [13]:
# 标签索引 loc，建议使用loc，效率更高
# Series
print(ser_obj)
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d']) #前闭后闭
print('-'*50)


a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [14]:
# DataFrame
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = list('abcd'),
                      index=list('abcde'))
print(df_obj)
print('-'*50)
print(df_obj['a'])  #建议不用,拿的是列
print('-'*50)
print(df_obj.loc['a'])  #拿的是行
print('-'*50)


          a         b         c         d
a  0.367400 -1.489073 -1.130567  2.462103
b  0.184785  0.402973  0.752028  0.587060
c -0.534158  0.390530  0.613794 -1.117852
d  0.301598  0.587865 -1.370987 -0.293873
e  1.833872 -0.932808 -1.726186 -0.358940
--------------------------------------------------
a    0.367400
b    0.184785
c   -0.534158
d    0.301598
e    1.833872
Name: a, dtype: float64
--------------------------------------------------
a    0.367400
b   -1.489073
c   -1.130567
d    2.462103
Name: a, dtype: float64
--------------------------------------------------


In [15]:
# 第一个参数索引行，第二个参数是列,loc或者iloc效率高于直接用取下标的方式，前闭后闭
print(df_obj.loc['a':'c', 'b':'d']) #连续索引
print(df_obj.loc[['a','c'], ['b','d']]) #不连续索引
print(df_obj.loc[['c'],['b']]) #取一个值,返回的是DataFrame类型
print(df_obj.loc['c','b'])  #取一个值

          b         c         d
a -1.489073 -1.130567  2.462103
b  0.402973  0.752028  0.587060
c  0.390530  0.613794 -1.117852
          b         d
a -1.489073  2.462103
c  0.390530 -1.117852
         b
c  0.39053
0.3905296209529429


## iloc 位置索引(推荐使用)

In [16]:
ser_obj
print('-'*50)
# Series
print(ser_obj[1:3])
print('-'*50)
print(ser_obj.iloc[1:3]) # 前闭后开[)，效率高


--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
dtype: int64


In [17]:
df_obj

Unnamed: 0,a,b,c,d
a,0.3674,-1.489073,-1.130567,2.462103
b,0.184785,0.402973,0.752028,0.58706
c,-0.534158,0.39053,0.613794,-1.117852
d,0.301598,0.587865,-1.370987,-0.293873
e,1.833872,-0.932808,-1.726186,-0.35894


In [18]:

# DataFrame，iloc是前闭后开[)
print(df_obj)
print('-'*50)
print(df_obj.iloc[0:2, 0:2]) 
print('-'*50)
print(df_obj.iloc[[0,2], [0,2]]) # 不连续索引
print('-'*50)
print(df_obj.iloc[0,0]) # 取一个值

          a         b         c         d
a  0.367400 -1.489073 -1.130567  2.462103
b  0.184785  0.402973  0.752028  0.587060
c -0.534158  0.390530  0.613794 -1.117852
d  0.301598  0.587865 -1.370987 -0.293873
e  1.833872 -0.932808 -1.726186 -0.358940
--------------------------------------------------
          a         b
a  0.367400 -1.489073
b  0.184785  0.402973
--------------------------------------------------
          a         c
a  0.367400 -1.130567
c -0.534158  0.613794
--------------------------------------------------
0.3673995790282642


In [19]:
#没有设置行和列索引的DataFrame，iloc和loc的区别
df_obj2 = pd.DataFrame(np.random.randn(5,4))
print(df_obj2)
print('-'*50)
print(df_obj2.iloc[0:2]) #左闭右开 2行
print('-'*50)
print(df_obj2.loc[0:2]) #左闭右闭 3行

          0         1         2         3
0 -1.334041 -0.397595 -0.266575  0.382490
1  0.981961 -0.912542  0.577852  0.731562
2  0.043123 -0.041562  1.876835  0.211598
3  0.646437  2.112772 -0.368660 -1.232499
4  0.251516 -1.054162  0.695022  0.284609
--------------------------------------------------
          0         1         2         3
0 -1.334041 -0.397595 -0.266575  0.382490
1  0.981961 -0.912542  0.577852  0.731562
--------------------------------------------------
          0         1         2         3
0 -1.334041 -0.397595 -0.266575  0.382490
1  0.981961 -0.912542  0.577852  0.731562
2  0.043123 -0.041562  1.876835  0.211598
