In [1]:
import pandas as pd
import numpy as np

In [2]:
# reindex, which means to create a new object with the data conformed to a new index
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [4]:
# use reindex to do interpolation or filling
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method="ffill")  # ffill: forward-fills, bfill
print(obj3)

0      blue
2    purple
4    yellow
dtype: object
0      blue
2    purple
4    yellow
dtype: object


In [6]:
# with DataFrame, reindex can alter either the (row)index or columns or both
# when passed only a sequence, it reindexes the rows
import numpy as np
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
print(frame)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [8]:
# Drop entries from an axis
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
obj2 = obj.drop('c')
print(obj2)
obj3 = obj.drop(['d', 'c'])
print(obj3)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64


In [11]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print(data.drop(['Ohio']))
# drop values from columns by passing axis=1 or axis='columns'
print(data.drop('two', axis=1))
# set inplace=True, no returning a new object
obj.drop('c', inplace=True)
obj

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [15]:
# Series indexing(obj[...]) works analogously to NumPy array indexing
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
print(obj['b'])
print(obj[1:2])
print(obj[['b', 'a', 'c']])
print(obj[[1, 3]])
print(obj[obj < 2])
print(obj['b':'c'])     # slicing with labels: end-point is inclusive
obj['b':'c'] = 5
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1.0
b    1.0
dtype: float64
b    1.0
a    0.0
c    2.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64
b    1.0
c    2.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [17]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print(data['two'])
print(data[['three', 'one']])
print(data[:2])
print(data[data['three'] > 5])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [22]:
print(data < 5)
data[data < 5] = 0
data

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
# loc, iloc enable you to select a subset of the rows and columns from DataFrame
print(data.loc['Utah', ['two', 'one']])
print(data.iloc[2, [3, 0, 1]])

two    9
one    8
Name: Utah, dtype: int32
four    11
one      8
two      9
Name: Utah, dtype: int32


In [3]:
"""
When you are adding together objects, if any
index pairs are not the same, the respective index in the result will be the union of the
index pairs
"""
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [6]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [11]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
print(df1)
print(df2)
df2.loc[1, 'b'] = np.nan
print(df2)
print(df1 + df2)
df1.add(df2, fill_value=0)  # use add method
# add, radd, sub, rsub, div, rdiv, floordiv, rfloordiv(//), mul, rmul, pow, rpow

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [3]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
print(frame)
print(series)
print(frame - series)   # broadcasting down the rows

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [4]:
# use arithmetic methods to match on the row
series3 = frame['d']
print(frame)
print(frame.sub(series3, axis='index'))

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0


In [6]:
# apply a function on one-dimensional arrays to each column or row
f = lambda x: x.max() - x.min()
print(frame.apply(f))   # return a Series
print(frame.apply(f, axis="columns"))

b    9.0
d    9.0
e    9.0
dtype: float64
Utah      2.0
Ohio      2.0
Texas     2.0
Oregon    2.0
dtype: float64


In [7]:
# compute a formatted string from each floating-point value in frame, use applymap
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [11]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame)
print(frame.sort_index())
print(frame.sort_index(axis='columns'))
print(frame.sort_index(axis=1, ascending=False))    # ascend by default, set False to sort descending

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [14]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1


In [15]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())   # rank() break ties by assigning each group the mean rank
print(obj.rank(method='first'))  # according to the order in which they're observed
print(obj.rank(ascending=False, method='max'))  # assign tie values the maximum rank in the group
print(obj.rank(ascending=False, method='min'))  # assign tie values the minimum rank in the group

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
0    1.0
1    7.0
2    1.0
3    3.0
4    5.0
5    6.0
6    3.0
dtype: float64


In [19]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis='columns'))
print(frame.rank(axis='index', method='dense'))

     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0
     b    a    c
0  3.0  1.0  2.0
1  4.0  2.0  3.0
2  1.0  1.0  4.0
3  2.0  2.0  1.0


In [2]:
# unique labels is not mandatory