In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [2]:
# Float Series with an explicit, deliberately unsorted label index.
obj = pd.Series({'d': 4.5, 'b': 7.2, 'a': -5.3, 'c': 3.6})
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
# ---------------- 1. Reindexing ----------------------

# Calling reindex on a Series arranges the values to follow the new index;
# a label with no existing value ('e' here) is introduced as missing data (NaN).
obj2 = obj.reindex(index=['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [4]:
# For a DataFrame, reindex can change the row index or the column index.
# Start from a 3x3 integer frame whose row labels skip 'b'.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
# Reindex the rows: 'b' has no data in `frame`, so that row is filled with NaN
# (which is why the integer values are displayed as floats in the result).
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [11]:
# Reindex the columns instead: 'Utah' does not exist in `frame` and comes back
# empty, while 'Ohio' is not requested and is left out of the result.
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [12]:
# Rows and columns can be reindexed together in a single call.
# NOTE: the label-based shortcut frame.loc[rows, cols] is not a substitute here:
# since pandas 1.0, .loc raises KeyError when a requested label ('b', 'Utah')
# is missing, whereas reindex inserts the missing label filled with NaN.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [13]:
# ---------------------2 Dropping Entries from an Axis (按轴删除记录)----------------------------

In [14]:

# Five consecutive floats labelled 'a' through 'e'.
obj = pd.Series(np.arange(5, dtype=float), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [16]:
# For a Series, drop returns a new object with the given labels removed;
# the original Series is left untouched.
obj1 = obj.drop(labels=['d', 'c'])
obj1

a    0.0
b    1.0
e    4.0
dtype: float64

In [17]:
# 4x4 integer frame: state names as row labels, ordinal words as column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [18]:
# For a DataFrame, entries can be dropped from either axis.
# Naming the labels via index= removes rows, which is also what a bare list of
# labels does by default (axis=0 / axis='index').
data.drop(index=['Colorado', 'Ohio'])


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [19]:
# Column handling: to drop a column, pass axis=1 or, equivalently, axis='columns'.
data.drop('two', axis='columns')

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [20]:
# drop can also modify the Series or DataFrame in place instead of returning a
# new object.
# NOTE: this mutates `obj` itself, so any earlier display of `obj` no longer
# reflects its current contents ('c' is gone after this cell).
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [21]:
# -----------------------3 Indexing, Selection, and Filtering(索引，选择，过滤)------------------------

In [22]:
# Series indexing (obj[...]) works like NumPy array indexing, except that the
# Series' index labels can be used in addition to integers.
obj = pd.Series(np.arange(4, dtype=float), index=list('abcd'))
# IMPORTANT: unlike ordinary Python slicing, slicing by label includes the end
# point, so 'c' is part of the result.
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [23]:
# Rebuild the 4x4 example frame used by the indexing examples that follow.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [24]:
# An integer slice on a DataFrame selects rows by position: here the first two.
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [25]:
# Keep only the rows whose 'three' value is greater than 5.
data[data['three'] > 5] # selection with a boolean array

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [29]:
# ---------------- 4. Selecting row/column subsets with loc (by label) and iloc (by integer position) -----------------
print(data)
# One row label plus a list of column labels yields a Series named after the row.
print(data.loc['Colorado', ['two', 'three']])
# iloc achieves the same result, selecting by integer position instead of label.
data.iloc[1, [1, 2]]

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
two      5
three    6
Name: Colorado, dtype: int32


two      5
three    6
Name: Colorado, dtype: int32

In [30]:
# ----------------5 Arithmetic and Data Alignment (运算和数据对齐)----------------

In [32]:
# Data alignment: two Series whose indexes only partly overlap.
s1 = pd.Series({'a': 7.3, 'c': -2.5, 'd': 3.4, 'e': 1.5})
print(s1)
s2 = pd.Series({'a': 2.1, 'c': 3.6, 'e': -1.5, 'f': 4, 'g': 3.1})
s2

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64


a    2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [33]:
# Addition aligns the operands on their index labels; a label present in only
# one of the two Series ('d', 'f', 'g') produces NaN in the result.
s1 + s2

a    9.4
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [35]:
# With fill_value, a label present in only one Series uses 0 for the missing
# side instead of producing NaN.
s1.add(s2, fill_value=0)

a    9.4
c    1.1
d    3.4
e    0.0
f    4.0
g    3.1
dtype: float64

In [36]:
# Broadcasting: a 4x3 float frame and its first row taken out as a Series.
frame = pd.DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(frame)
series = frame.iloc[0]
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [37]:
# The Series' index is matched against the DataFrame's columns and the
# subtraction is then broadcast down the rows.
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [39]:
# To broadcast over the columns instead (matching on the rows), one of the
# arithmetic methods has to be used. Take column 'd' as the operand.
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [40]:
# The axis argument names the axis to match on. Here the Series is matched
# against the DataFrame's row index (axis='index' or axis=0) and then broadcast.
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [41]:
# ---------------------6 Function Application and Mapping (函数应用和映射)-----------------------

In [42]:
# Random 4x3 frame of standard-normal draws. A locally seeded RandomState is
# used so that Restart & Run All reproduces the same values, without touching
# NumPy's global random state.
frame = pd.DataFrame(np.random.RandomState(42).randn(4, 3),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-2.113508,2.59552,-0.035685
Ohio,0.712662,0.951341,-0.123423
Texas,-0.918045,-1.120308,-0.205849
Oregon,1.78375,-0.3667,-1.140221


In [43]:
# NumPy ufuncs (element-wise array functions) also work on pandas objects.
np.abs(frame)

Unnamed: 0,b,d,e
Utah,2.113508,2.59552,0.035685
Ohio,0.712662,0.951341,0.123423
Texas,0.918045,1.120308,0.205849
Oregon,1.78375,0.3667,1.140221


In [44]:
# Another common operation is to run a function that works on a 1-D array over
# every column or row; DataFrame.apply does this.
def f(x):
    """Return the spread (maximum minus minimum) of the values in ``x``."""
    return x.max() - x.min()


frame.apply(f)  # f is invoked once for each column of frame

b    3.897258
d    3.715828
e    1.104536
dtype: float64

In [47]:
# Passing axis='columns' to apply makes the function run once per row instead.
print(frame.apply(f, axis='columns'))  # equivalent to axis=1
print(frame.apply(f, axis=1))

Utah      4.709028
Ohio      1.074764
Texas     0.914459
Oregon    2.923971
dtype: float64
Utah      4.709028
Ohio      1.074764
Texas     0.914459
Oregon    2.923971
dtype: float64


In [48]:
print(frame)


def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min'/'max'."""
    return pd.Series({'min': x.min(), 'max': x.max()})


# The function passed to apply may return a Series; the displayed result then
# has one row per returned label ('min', 'max') and one column per input column.
frame.apply(f)

               b         d         e
Utah   -2.113508  2.595520 -0.035685
Ohio    0.712662  0.951341 -0.123423
Texas  -0.918045 -1.120308 -0.205849
Oregon  1.783750 -0.366700 -1.140221


Unnamed: 0,b,d,e
min,-2.113508,-1.120308,-1.140221
max,1.78375,2.59552,-0.035685


In [49]:
# Plain element-wise Python functions can be used as well.

# NOTE(review): this name shadows the builtin format(); it is kept because a
# later cell refers to the formatter by this name.
def format(x):
    """Render ``x`` as a string with two decimal places."""
    return '%.2f' % x


frame['e'].map(format)

Utah      -0.04
Ohio      -0.12
Texas     -0.21
Oregon    -1.14
Name: e, dtype: object

In [50]:
# Apply the formatter to every element of the frame. Series.map is the
# element-wise method on a Series, so mapping each column in turn covers the
# whole DataFrame. This replaces DataFrame.applymap, which is deprecated since
# pandas 2.1 (renamed to DataFrame.map) and emits a FutureWarning there; the
# column-wise form behaves the same on old and new pandas versions.
frame.apply(lambda column: column.map(format))

Unnamed: 0,b,d,e
Utah,-2.11,2.6,-0.04
Ohio,0.71,0.95,-0.12
Texas,-0.92,-1.12,-0.21
Oregon,1.78,-0.37,-1.14


In [51]:
# --------------------7 Sorting and Ranking （排序）----------------------

In [52]:
# To sort by row or column labels use sort_index, which returns a new object.
obj = pd.Series(range(4), index=list('dabc'))
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [53]:
# A DataFrame can be sorted by the labels of either axis; rows are the default.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
print(frame)
frame.sort_index(axis=0)

       d  a  b  c
three  0  1  2  3
one    4  5  6  7


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [55]:
# Sorting is ascending by default; ascending=False sorts the column labels in
# descending order.
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [56]:
# sort_values orders a Series by its values.
# Missing values (NaN) are placed at the end.
obj = pd.Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(na_position='last')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [57]:
# A DataFrame can be sorted using one or more columns as sort keys: pass the
# column name(s) to the `by` argument of sort_values.
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [58]:
# Sort by column 'a' first, then by column 'b' within equal values of 'a'.
frame.sort_values(by=['a', 'b'])


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [59]:
# rank() reports the position each value would take when the Series is sorted.
# Equal values are tied; by default each member of a tie gets the average of
# the ranks the tie occupies.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [60]:
# Show the values in ascending order so the ranks below are easy to verify.
obj.sort_values()

1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64

In [61]:
# In obj the two 4s occupy ranks 4 and 5, which average to 4.5; the two 7s
# occupy ranks 6 and 7, which average to 6.5.
obj.rank(method='average')

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [63]:
# Ranks can also be assigned according to the order in which the values appear.
print(obj)
obj.rank(method='first') # the first 7 seen (label 0) gets rank 6, the second 7 (label 2) gets rank 7

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64


0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [64]:
# Rank in descending order. method='max' gives every member of a tie group the
# largest rank in that group (other methods: 'average' = mean rank, 'min',
# 'first' = order of appearance).
obj.rank(ascending=False, method='max')  # the two 7s occupy ranks 1 and 2, so both receive the larger value, 2

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [65]:
# A DataFrame can compute ranks over its rows or over its columns.
frame = pd.DataFrame(dict(
    b=[4.3, 7, -3, 2],
    a=[0, 1, 0, 1],
    c=[-2, 5, 8, -2.5],
))
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [68]:
# axis='columns' ranks across the columns, i.e. the values within each row are
# ranked against one another.
frame.rank(axis='columns')

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [69]:
# ------------------8.有重复label的轴索引----------------------

In [70]:
# Series whose index contains repeated labels ('a' and 'b' each appear twice).
obj = pd.Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [71]:
# The index's is_unique property tells whether its labels are all unique.
obj.index.is_unique

False

In [72]:
# Selection behaves a little differently with duplicate labels: a label that
# maps to several values returns a Series.
obj['a']


a    0
a    1
dtype: int32

In [73]:
# A label that maps to exactly one value returns a scalar instead.
obj['c']

4