In [1]:
import pandas as pd
import numpy as np

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b','a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
# 用该Series的reindex将会根据新索引进行重排。如果某个索引值当前不存在，就引
# 入缺失值

obj2 = obj.reindex(['a','b','c','d','e'])
obj2


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [4]:
# 对于时间序列这样的有序数据，重新索引时可能需要做一些插值处理。method选项
# 即可达到此目的，例如，使用ffill可以实现前向值填充：

obj3 = pd.Series(['blue', 'purple', 'yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [7]:
obj3.reindex(range(6), method='ffill')
obj3

0      blue
2    purple
4    yellow
dtype: object

In [2]:
# 今天无心学习，改日继续。顶不住了。太无聊 了。
# 继续学习
obj['b':'c'] 


b    7.2
a   -5.3
c    3.6
dtype: float64

In [4]:
obj['b':'c']  = 5
obj

d    4.5
b    5.0
a    5.0
c    5.0
dtype: float64

In [5]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah',
'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [7]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [9]:
data[['two','one']]

Unnamed: 0,two,one
Ohio,1,0
Colorado,5,4
Utah,9,8
New York,13,12


In [10]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [11]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [15]:
data[data<5]  = 0 
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [16]:
# 重要,选取loc与iloc 进行选取
# 对于DataFrame的行的标签索引，我引入了特殊的标签运算符loc和iloc。它们可以
# 让你用类似NumPy的标记，使用轴标签（loc）或整数索引（iloc），从DataFrame
# 选择行和列的子集。
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [19]:
# 上面的值选取的时候公式换算.
# data.iloc[2,[3,0,1]] = data.loc['Utah', ['four','one', 'two']]
data.iloc[2,[3,0,1]]
# data.loc['Utah', ['four','one', 'two']]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [20]:
# 选取第二行
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [21]:
data.iloc[[1, 2], [0, 1,2]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10


In [22]:
# 选取行列
data.loc [:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [24]:
# iloc 前面是行,后面是列
data.iloc[:, :3]
# [data.three > 5]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [32]:
# 整数索引,选取ser[-1] 会出错
ser = pd.Series(np.arange(3.))
ser
ser[0]

0.0

In [33]:
ser = pd.Series(np.arange(3.),index=['a', 'b', 'c'])
ser[-1]

2.0

In [35]:
# pandas 最重要的一个功能就是，它可以对不同索引对象进行计算。
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c','d', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])

In [36]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [37]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [39]:
# 自动数据对齐，再不重叠的区域引入了NA值
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [40]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [41]:
df1


Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [42]:
df2


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [43]:
# 把它们相加后将会返回一个新的DataFrame，其索引和列为原来那两个DataFrame
# 的并集：

df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [44]:
# 如果DataFrame对象相加，没有共用的列或行标签，结果都会是空
df1  = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

In [45]:
df1

Unnamed: 0,A
0,1
1,2


In [46]:
df2

Unnamed: 0,B
0,3
1,4


In [47]:
df1-df2

Unnamed: 0,A,B
0,,
1,,


In [48]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),columns=list('abcde'))

In [49]:
df2.loc[1,'b'] = np.nan

In [50]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [51]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [54]:
df3 = df1+df2
# df3.fillna(0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,0.0
1,9.0,0.0,13.0,15.0,0.0
2,18.0,20.0,22.0,24.0,0.0
3,0.0,0.0,0.0,0.0,0.0


In [53]:

# 空值就变为零了
df1.add(df2,fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [55]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [56]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [57]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [58]:
# 跟不同维度的NumPy数组一样，DataFrame和Series之间算术运算也是有明确规定
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [59]:
arr[0]

array([0., 1., 2., 3.])

In [60]:
# 当我们从arr减去arr[0]，每一行都会执行这个操作。这就叫做广播
# （broadcasting）
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [70]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [71]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [72]:
# 有意思，它这也会广播哦
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [74]:
# 如果某个索引值在DataFrame的列或Series的索引中找不到，则参与运算的两个对
# 象就会被重新索引以形成并集：
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [75]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [76]:
# 如果你希望匹配行且在列上广播，则必须使用算术运算方法。例如
series3 = frame['d']
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [77]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [78]:
frame.sub(series3,axis = 'index')
# 传入的轴号就是希望匹配的轴。在本例中，我们的目的是匹配DataFrame的行索引
# （axis='index' or axis=0）并进行广播

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [None]:
# 下章学习函数应用与映射