### Reindexing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series with an unsorted label index (d, b, c, a), used by the reindex examples
obj = pd.Series(data=[4.5, 3.6, 7.2, 8.4], index=list('dbca'))

In [3]:
obj

d    4.5
b    3.6
c    7.2
a    8.4
dtype: float64

In [4]:
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a    8.4
b    3.6
c    7.2
d    4.5
e    NaN
dtype: float64

In [5]:
# with method ffill
# Series whose integer index has gaps (only 1, 3 and 5 are present); it is
# reindexed over range(6) with method='ffill' in a later cell.

obj3 = pd.Series(['blue', 'purple', 'black'], index=[1,3,5])

In [6]:
obj3

1      blue
3    purple
5     black
dtype: object

In [7]:
obj3.reindex(range(6), method='ffill')

0       NaN
1      blue
2      blue
3    purple
4    purple
5     black
dtype: object

In [8]:
# 3x3 frame holding 0..8; the row labels are 'a', 'c', 'd' (there is no 'b')
frame = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns='ohio texas california'.split(),
)

In [9]:
frame

Unnamed: 0,ohio,texas,california
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
frame.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,ohio,texas,california
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [12]:
# reindex columns
# Of the requested labels only 'ohio' is an existing column of `frame`;
# the other three are new, so they come back filled with NaN.

states = ['chongqing', 'shanghai', 'beijing', 'ohio']
frame.reindex(columns=states)

Unnamed: 0,chongqing,shanghai,beijing,ohio
a,,,,0
c,,,,3
d,,,,6


In [14]:
# reindex index and columns in a single call
# NOTE: frame.loc[[...], states] raises KeyError on current pandas because
# 'b' and three of the labels in `states` do not exist in `frame`; reindex is
# the supported way to introduce new labels (their cells are filled with NaN).
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,chongqing,shanghai,beijing,ohio
a,,,,0.0
b,,,,
c,,,,3.0
d,,,,6.0


### Dropping Entries from an Axis

In [15]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [16]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [17]:
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [18]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [19]:
obj.drop(['c', 'd'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [20]:
# drop returns a new object; it does not alter obj in place
# drop accepts either a single label (str) or a list of labels

In [21]:
# drop for dataframe
# 3x3 frame of 0..8 with the default integer row labels and one column per city
data = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    columns='chongqing shanghai beijing'.split(),
)

In [22]:
data

Unnamed: 0,chongqing,shanghai,beijing
0,0,1,2
1,3,4,5
2,6,7,8


In [23]:
data.drop(1)

Unnamed: 0,chongqing,shanghai,beijing
0,0,1,2
2,6,7,8


In [24]:
data.drop('shanghai', axis=1)

Unnamed: 0,chongqing,beijing
0,0,2
1,3,5
2,6,8


In [25]:
data.drop?

In [26]:
# manipulate drop in-place
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [27]:
obj.drop('a', inplace=True)

In [28]:
obj

b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

### Indexing, Selection, and Filtering

In [29]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [30]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [31]:
obj['b']

1.0

In [32]:
# Positional access: use iloc explicitly. A bare integer key on a
# label-indexed Series is deprecated (newer pandas treats it as a label).
obj.iloc[1]

1.0

In [33]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [34]:
# Select several positions at once; iloc makes the positional intent explicit
# instead of relying on the deprecated integer fallback of obj[[...]].
obj.iloc[[1, 2, 3]]

b    1.0
c    2.0
d    3.0
dtype: float64

In [35]:
obj[['a', 'c', 'd']]

a    0.0
c    2.0
d    3.0
dtype: float64

In [36]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [37]:
# slicing with labels
# note that 'c' endpoint is inclusive
obj['a': 'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [38]:
# assign values

obj['a': 'b'] = 5

In [39]:
obj

a    5.0
b    5.0
c    2.0
d    3.0
dtype: float64

In [40]:
# indexing for dataframe
data

Unnamed: 0,chongqing,shanghai,beijing
0,0,1,2
1,3,4,5
2,6,7,8


In [41]:
data[:2]

Unnamed: 0,chongqing,shanghai,beijing
0,0,1,2
1,3,4,5


In [43]:
data[data['chongqing']>1]

Unnamed: 0,chongqing,shanghai,beijing
1,3,4,5
2,6,7,8


In [44]:
data < 5

Unnamed: 0,chongqing,shanghai,beijing
0,True,True,True
1,True,True,False
2,False,False,False


In [45]:
data[data>5]

Unnamed: 0,chongqing,shanghai,beijing
0,,,
1,,,
2,6.0,7.0,8.0


In [46]:
data

Unnamed: 0,chongqing,shanghai,beijing
0,0,1,2
1,3,4,5
2,6,7,8


In [47]:
# assign 

data[data < 5] = 5

In [48]:
data

Unnamed: 0,chongqing,shanghai,beijing
0,5,5,5
1,5,5,5
2,6,7,8


### Selection with loc and iloc

In [49]:
data

Unnamed: 0,chongqing,shanghai,beijing
0,5,5,5
1,5,5,5
2,6,7,8


In [51]:
data.loc[1, ['chongqing', 'beijing']]

chongqing    5
beijing      5
Name: 1, dtype: int64

In [52]:
# loc arguments: the row (index) labels come first, then the column labels
data.loc?

In [53]:
# iloc
data

Unnamed: 0,chongqing,shanghai,beijing
0,5,5,5
1,5,5,5
2,6,7,8


In [55]:
data.iloc[2]

chongqing    6
shanghai     7
beijing      8
Name: 2, dtype: int64

In [56]:
data.iloc[2, [2,1,0]]

beijing      8
shanghai     7
chongqing    6
Name: 2, dtype: int64

In [58]:
data.iloc[2, [2,1]]

beijing     8
shanghai    7
Name: 2, dtype: int64

In [59]:
data.iloc[[1,2], [1,0,2]]

Unnamed: 0,shanghai,chongqing,beijing
1,5,5,5
2,7,6,8


In [60]:
# slicing

data.loc[:2, 'shanghai']

0    5
1    5
2    7
Name: shanghai, dtype: int64

In [62]:
data.iloc[:2, :2]

Unnamed: 0,chongqing,shanghai
0,5,5
1,5,5


In [75]:
# Difference between loc and iloc:
# iloc = integer-location based selection (positions)
# loc  = label based selection

In [65]:
# re - slicing
# Filter the 2x2 slice with a mask computed from the slice itself. A mask built
# from the full `data` has one more row than the slice, so pandas has to reindex
# it and warns "Boolean Series key will be reindexed to match DataFrame index".
data.iloc[:2, :2].loc[lambda part: part['chongqing'] >= 5]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,chongqing,shanghai
0,5,5
1,5,5


### Integer Indexes

In [66]:
ser = pd.Series(np.arange(3.))

In [67]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [68]:
ser[1]

1.0

In [70]:
# this raises KeyError: ser has an integer index, so -1 is looked up as a label,
# and there is no label -1
# ser[-1]

In [71]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [72]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [73]:
# ser2 has string labels, so fetch by position explicitly with iloc
# (bare ser2[1] relies on a deprecated positional fallback)
ser2.iloc[1]

1.0

In [74]:
# last element by position; iloc keeps this working regardless of index type
# (bare ser2[-1] relies on a deprecated positional fallback)
ser2.iloc[-1]

2.0

In [76]:
ser[:1]

0    0.0
dtype: float64

In [77]:
ser[:-1]

0    0.0
1    1.0
dtype: float64

In [78]:
ser2[:1]

a    0.0
dtype: float64

In [79]:
ser[:-1]

0    0.0
1    1.0
dtype: float64

In [80]:
# So, to be on the safe side with integer indexes,
# slicing with `:` seems safer than a single integer key


### Arithmetic and Data Alignment

In [81]:
s1 = pd.Series([7.3, -2.5, 3.4, 2.5], index=['a', 'b', 'c', 'd'])

In [82]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'b', 'f', 'd', 'e'])

In [83]:
s1

a    7.3
b   -2.5
c    3.4
d    2.5
dtype: float64

In [84]:
s2

a   -2.1
b    3.6
f   -1.5
d    4.0
e    3.1
dtype: float64

In [85]:
s1 + s2

a    5.2
b    1.1
c    NaN
d    6.5
e    NaN
f    NaN
dtype: float64

In [87]:
s1 * s2

a   -15.33
b    -9.00
c      NaN
d    10.00
e      NaN
f      NaN
dtype: float64

In [88]:
# for dataframe
# 3x3 float frame of 0.0..8.0: columns b/c/d, one row per city
df1 = pd.DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    index=['chongqing', 'beijing', 'shanghai'],
    columns=['b', 'c', 'd'],
)

In [89]:
df1

Unnamed: 0,b,c,d
chongqing,0.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0


In [91]:
# 4x3 integer frame of 0..11: columns b/d/e, rows include the extra city 'chengdu'
df2 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['chongqing', 'chengdu', 'beijing', 'shanghai'],
    columns=['b', 'd', 'e'],
)

In [92]:
df2

Unnamed: 0,b,d,e
chongqing,0,1,2
chengdu,3,4,5
beijing,6,7,8
shanghai,9,10,11


In [93]:
df1 + df2

Unnamed: 0,b,c,d,e
beijing,9.0,,12.0,
chengdu,,,,
chongqing,0.0,,3.0,
shanghai,15.0,,18.0,


### Arithmetic methods with fill values

In [94]:
df1

Unnamed: 0,b,c,d
chongqing,0.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0


In [98]:
df2 = pd.DataFrame(np.arange(12).reshape((3,4)), columns=list('bcde'), 
                   index='chongqing beijing shanghai'.split())

In [99]:
df2

Unnamed: 0,b,c,d,e
chongqing,0,1,2,3
beijing,4,5,6,7
shanghai,8,9,10,11


In [96]:
df1.loc['chongqing', 'b'] = 1

In [97]:
df1

Unnamed: 0,b,c,d
chongqing,1.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0


In [100]:
df1 + df2

Unnamed: 0,b,c,d,e
chongqing,1.0,2.0,4.0,
beijing,7.0,9.0,11.0,
shanghai,14.0,16.0,18.0,


In [101]:
# in order to avoid NaN

df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
chongqing,1.0,2.0,4.0,3.0
beijing,7.0,9.0,11.0,7.0
shanghai,14.0,16.0,18.0,11.0


In [102]:
# arithmetic 

1 / df1

Unnamed: 0,b,c,d
chongqing,1.0,1.0,0.5
beijing,0.333333,0.25,0.2
shanghai,0.166667,0.142857,0.125


In [103]:
df1.rdiv(1)

Unnamed: 0,b,c,d
chongqing,1.0,1.0,0.5
beijing,0.333333,0.25,0.2
shanghai,0.166667,0.142857,0.125


In [105]:
df1

Unnamed: 0,b,c,d
chongqing,1.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0


In [106]:
df2

Unnamed: 0,b,c,d,e
chongqing,0,1,2,3
beijing,4,5,6,7
shanghai,8,9,10,11


In [104]:
 fill value when re-index
df2.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,b,c,d,e
chongqing,0,1,2,3
beijing,4,5,6,7
shanghai,8,9,10,11


### Operations between DataFrame and Series

In [107]:
arr = np.arange(12).reshape(3,4)

In [108]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [109]:
arr[0]

array([0, 1, 2, 3])

In [110]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [111]:
# subtract between dataframe and series
# 4x3 frame of 0..11: one row per state, columns b/d/e
frame = pd.DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=['ohio', 'texas', 'oregon', 'utah'],
    columns=['b', 'd', 'e'],
)

In [112]:
frame

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [114]:
series = frame.iloc[0]

In [115]:
series

b    0
d    1
e    2
Name: ohio, dtype: int64

In [116]:
frame - series

Unnamed: 0,b,d,e
ohio,0,0,0
texas,3,3,3
oregon,6,6,6
utah,9,9,9


In [118]:
# if columns not found
series2 = pd.Series([1,2,3], index=list('bcd'))

In [119]:
series2

b    1
c    2
d    3
dtype: int64

In [120]:
frame - series2

Unnamed: 0,b,c,d,e
ohio,-1.0,,-2.0,
texas,2.0,,1.0,
oregon,5.0,,4.0,
utah,8.0,,7.0,


In [121]:
frame

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [122]:
frame + series2

Unnamed: 0,b,c,d,e
ohio,1.0,,4.0,
texas,4.0,,7.0,
oregon,7.0,,10.0,
utah,10.0,,13.0,


### Function Application and Mapping

In [123]:
frame

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [124]:
frame * (-1)

Unnamed: 0,b,d,e
ohio,0,-1,-2
texas,-3,-4,-5
oregon,-6,-7,-8
utah,-9,-10,-11


In [125]:
np.abs(frame)

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [126]:
# apply a user-defined function through DataFrame.apply
def f(x):
    """Return x multiplied by -1."""
    return x * (-1)

frame.apply(f)

Unnamed: 0,b,d,e
ohio,0,-1,-2
texas,-3,-4,-5
oregon,-6,-7,-8
utah,-9,-10,-11


In [127]:
def f2(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()

In [128]:
frame.apply(f2)

b    9
d    9
e    9
dtype: int64

In [129]:
frame.apply(f2, axis=1)

ohio      2
texas     2
oregon    2
utah      2
dtype: int64

In [130]:
# element-wise formatting (the job DataFrame.applymap was used for)
def format_(x):
    """Format a single value as a string with two decimal places."""
    return '%.2f' % x

# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map).
# Mapping every column with Series.map gives the same element-wise result
# and works on both old and new pandas versions.
frame.apply(lambda col: col.map(format_))

Unnamed: 0,b,d,e
ohio,0.0,1.0,2.0
texas,3.0,4.0,5.0
oregon,6.0,7.0,8.0
utah,9.0,10.0,11.0


### Sorting and Ranking

In [133]:
obj = pd.Series(range(4), index='a c d b'.split())

In [134]:
obj

a    0
c    1
d    2
b    3
dtype: int64

In [135]:
obj.sort_index()

a    0
b    3
c    1
d    2
dtype: int64

In [141]:
# for dataframe
# 3x3 frame of 0..8 whose row labels are in descending order (2, 1, 0)
data = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=[2, 1, 0],
    columns=['a', 'c', 'd'],
)

In [142]:
data

Unnamed: 0,a,c,d
2,0,1,2
1,3,4,5
0,6,7,8


In [143]:
data.sort_index()

Unnamed: 0,a,c,d
0,6,7,8
1,3,4,5
2,0,1,2


In [144]:
data.sort_index(axis=1)

Unnamed: 0,a,c,d
2,0,1,2
1,3,4,5
0,6,7,8


In [145]:
data.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,a
2,2,1,0
1,5,4,3
0,8,7,6


In [146]:
# sort_values 

obj

a    0
c    1
d    2
b    3
dtype: int64

In [147]:
obj = pd.Series([1, 5, 2, 4], index='a c b d'.split())

In [148]:
obj

a    1
c    5
b    2
d    4
dtype: int64

In [149]:
obj.sort_values()

a    1
b    2
d    4
c    5
dtype: int64

In [150]:
obj.sort_values(ascending=False)

c    5
d    4
b    2
a    1
dtype: int64

In [151]:
# sort_value for dataframe
frame

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [152]:
frame.sort_values?

In [153]:
frame.sort_values('b')

Unnamed: 0,b,d,e
ohio,0,1,2
texas,3,4,5
oregon,6,7,8
utah,9,10,11


In [155]:
frame.sort_values('b', ascending=False)

Unnamed: 0,b,d,e
utah,9,10,11
oregon,6,7,8
texas,3,4,5
ohio,0,1,2


In [156]:
# rank

In [162]:
obj = pd.Series([7, -5, 3.6, 4.8, 2.9])

In [165]:
obj

0    7.0
1   -5.0
2    3.6
3    4.8
4    2.9
dtype: float64

In [166]:
# Rank the values in ascending order, starting from 1 (the smallest, -5.0, gets rank 1.0).
# With the default method='average', tied values would share the mean of their ranks;
# obj has no ties, so the ranks are simply 1..5.
obj.rank()

0    5.0
1    1.0
2    3.0
3    4.0
4    2.0
dtype: float64

In [167]:
obj.rank(method='first')

0    5.0
1    1.0
2    3.0
3    4.0
4    2.0
dtype: float64

### Axis Indexes with Duplicate Labels

In [173]:
# Series whose index repeats the labels 'a' and 'b'
obj = pd.Series(range(5), index=list('abcab'))

In [174]:
obj

a    0
b    1
c    2
a    3
b    4
dtype: int64

In [175]:
# is_unique
obj.index.is_unique

False

In [176]:
obj['a']

a    0
a    3
dtype: int64

In [177]:
obj['b']

b    1
b    4
dtype: int64