# Pandas----Essential Functions

## 1 Reindexing 

In [1]:
import pandas as pd 

In [2]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

### use \reindex to change indexes, if no corresponding data, it will appears as NaN

In [3]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

### We may need to modify values when using reindex, which can be done by using 'method' option

In [4]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [5]:
#  ffill: propagate last valid observation forward to next valid backfill 
# bfill: use next valid observation to fill gap.

obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

### \reindex can also change column index and row index.

In [6]:
import numpy as np

frame = pd.DataFrame(np.arange(9).reshape(3, 3), 
                     index = ['a', 'c', 'd'],
                     columns = ['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [7]:
# reindex the rows 
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [8]:
# reindex the columns
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [9]:
# also we can use \loc to reindex
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## 2 Drop entries from axis

### For Series, using \drop command to delete the axis specified and return a new object

In [10]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [11]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

### For DataStructures, we can delete indexes and columns by specifying the axis

In [12]:
data = pd.DataFrame(np.arange(16).reshape(4, 4), 
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['One', 'Two', 'Three', 'Four'])
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Drop the rows (index); axis = 0

In [13]:
data.drop(['Colorado', 'Utah'])

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
New York,12,13,14,15


### Drop the columns; axis = 1

In [14]:
data.drop('Two', axis = 1)

Unnamed: 0,One,Three,Four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [15]:
data.drop(['Two', 'Four'], axis = 'columns')

Unnamed: 0,One,Three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


### We can see from above examples \drop(）returns a new object
### we can also adjust \inplace parameter to change the original object

In [16]:
obj.drop('c', inplace = True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

## 3 Indexing, slicing and filtering

### For Series, indexing is similar to array indexing in numpy, also we can use index in Series

In [17]:
obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [18]:
obj['b']

1.0

In [19]:
obj[1]

1.0

In [20]:
obj[1:4]

b    1.0
c    2.0
d    3.0
dtype: float64

In [21]:
# select particular rows
obj[['a', 'd', 'b']]

a    0.0
d    3.0
b    1.0
dtype: float64

In [22]:
obj[[0, 3, 1]]

a    0.0
d    3.0
b    1.0
dtype: float64

In [23]:
# also we can use  condition statements
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

### When slicing using labels, it is a little bit different from Python, it will include the end points

In [24]:
# 'd' is also included
obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [25]:
# we can change values by select labels
obj['b':'c'] = 10
obj

a     0.0
b    10.0
c    10.0
d     3.0
dtype: float64

### For Dataframes

In [26]:
data = pd.DataFrame(np.arange(16).reshape(4, 4), 
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['One', 'Two', 'Three', 'Four'])
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [27]:
data['Two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: Two, dtype: int32

In [28]:
data[['Four', 'One']]

Unnamed: 0,Four,One
Ohio,3,0
Colorado,7,4
Utah,11,8
New York,15,12


### Some interesting ways to index

In [29]:
# only this expression is valid!

data[:2]

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [30]:
data[data['Three'] > 5]
# Note that the other columns are displayed together

Unnamed: 0,One,Two,Three,Four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Boolean dataframe

In [31]:
data < 5

Unnamed: 0,One,Two,Three,Four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [32]:
data[data < 5] = 0
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Selection by \loc and \iloc

### label-based indexing (loc)

In [33]:
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
data.loc['Utah', ['Two', 'Four']]

Two      9
Four    11
Name: Utah, dtype: int32

###  integer-based indexing

In [35]:
data.iloc[2, [1, 3]]

Two      9
Four    11
Name: Utah, dtype: int32

In [36]:
# choose one line
data.iloc[2]

One       8
Two       9
Three    10
Four     11
Name: Utah, dtype: int32

In [37]:
# choose clusters 
data.iloc[[1, 2], [1, 2, 3]]

Unnamed: 0,Two,Three,Four
Colorado,5,6,7
Utah,9,10,11


### Also we can use indexing functions for slicing

In [38]:
data.loc[:'Utah', 'Two']  # Note it includes the end 

Ohio        0
Colorado    5
Utah        9
Name: Two, dtype: int32

In [39]:
data1 = data.iloc[:, :2]
data1

Unnamed: 0,One,Two
Ohio,0,0
Colorado,0,5
Utah,8,9
New York,12,13


In [40]:
data1[data1 > 0] 

Unnamed: 0,One,Two
Ohio,,
Colorado,,5.0
Utah,8.0,9.0
New York,12.0,13.0


In [41]:
data.iloc[:, :2][data.Two > 0]

Unnamed: 0,One,Two
Colorado,0,5
Utah,8,9
New York,12,13


## 4 Integer indexes

### Some errors might occur when the index are integers

In [42]:
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [43]:
ser[-1]

KeyError: -1

### This causes some problem because the indexes are also integers, it is confusing to use -1 to index

In [None]:
ser2 = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])
ser2

In [None]:
ser2[-1]

### It is Ok now because it causes no confusion. Generally we will usr loc and iloc

## 5 Arithmetic and Data Alignment

### When we add two objects together, if they differ in indexes, then pandas will place 'NaN' on these different places and keep all the indexes in both objects

In [None]:
s1 = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
s2 = pd.Series(np.arange(5.), index = ['a', 'c', 'd', 'f', 'g'])
s1

In [None]:
s2

In [None]:
s1 + s2

### s1 and s2 only have index a, c, d in common, so they have values on these three places, where others all appears 'NaN'

### In DataFrames,  these internal alignments appear both in rows and columns

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape(3, 3), columns = list('bcd'),
                   index = ['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape(4, 3), columns = list('bde'),
                   index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [None]:
df1

In [None]:
df2

In [None]:
df1 + df2  # must match both columns and index

### Arithmetic methods with filled values

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape(3, 4),
                   columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape(4, 5),
                   columns = list('abcde'))

In [None]:
df1

In [None]:
df2

In [None]:
df2.loc[1, 'b'] = np.nan
df2

In [None]:
# with no filled values 
df1 + df2

In [None]:
# with filled values
df1.add(df2, fill_value = 0)

In [None]:
# division
1 / df1

In [None]:
# use rdiv function
df1.rdiv(1) # 1 / df1

In [None]:
# we can also use fill_values when we reindex
df1.reindex(columns = df2.columns, fill_value = 0)

### Operations between DataFrames and Series

In [None]:
# it follows broadcasting, similar to numpy 
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]

In [None]:
frame

In [None]:
series

In [None]:
# broadcasting down the rows
frame - series

In [None]:
# if adding a series sharing a different index, appearing 'NaN'
series2 = pd.Series(range(3), index = list('bef'))
frame + series2

In [None]:
# if want to broadcast along columns
series3 = frame['d']
series3

In [None]:
frame

In [None]:
frame.sub(series3, axis = 0) #  or axis = 'index'

## Function applications and mapping

In [44]:
# ufuncs in numpy can also be used on objects in pandas
frame = pd.DataFrame(np.random.randn(4, 3), columns = list('bde'),
                     index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.088481,-1.640046,0.472951
Ohio,-0.961135,1.099837,0.62614
Texas,-0.932949,1.153194,-1.055892
Oregon,0.520256,0.064097,-0.215671


In [45]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.088481,1.640046,0.472951
Ohio,0.961135,1.099837,0.62614
Texas,0.932949,1.153194,1.055892
Oregon,0.520256,0.064097,0.215671


In [47]:
# use functions for one row or column
f = lambda x: x.max() - x.min()
frame.apply(f)  # use apply function


b    1.481391
d    2.793240
e    1.682032
dtype: float64

In [48]:
# if not specified, apply works on every columns
# we can  specify axis to work on rows
frame.apply(f, axis = 'columns')

Utah      2.112997
Ohio      2.060972
Texas     2.209086
Oregon    0.735928
dtype: float64

In [49]:
# for functions like sum(), it is well-defined,  no need to use apply()
frame.sum(axis = 0)

b   -1.462309
d    0.677082
e   -0.172473
dtype: float64

In [50]:
frame.sum(axis = 1)

Utah     -1.255576
Ohio      0.764842
Texas    -0.835647
Oregon    0.368682
dtype: float64

In [51]:
# apply() does not return scalars, it returns a Series
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])

In [52]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.961135,-1.640046,-1.055892
max,0.520256,1.153194,0.62614


In [53]:
# To standardize the format of numbers in DataFrame, use applymap()
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.09,-1.64,0.47
Ohio,-0.96,1.1,0.63
Texas,-0.93,1.15,-1.06
Oregon,0.52,0.06,-0.22


In [54]:
# For Series, we use map() function
frame['e'].map(format)

Utah       0.47
Ohio       0.63
Texas     -1.06
Oregon    -0.22
Name: e, dtype: object

## 7 Sorting and Ranking

In [55]:
# Use sort_index() function to sort, it will return a new object
obj = pd.Series(range(4), index = list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [56]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [57]:
# For DataFrames, we use index/axis to sort
frame = pd.DataFrame(np.arange(8).reshape(2, 4),
                     columns = list('dabc'), index = ['three', 'one'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [58]:
frame.sort_index()  # sort index

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [59]:
# sort columns
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [60]:
# default is increasing order, we can change to decreasing order
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


### Sort accoring to values

In [61]:
# For Series, use sort_values() function
obj = pd.Series([4, 7, -5, 2])
obj.sort_values()

2   -5
3    2
0    4
1    7
dtype: int64

In [62]:
# NaN values will be placed at last
obj = pd.Series([4, 7, np.nan, -5, 2, np.nan])
obj.sort_values()

3   -5.0
4    2.0
0    4.0
1    7.0
2    NaN
5    NaN
dtype: float64

In [67]:
# For DataFrames, we can use several columns as sort keys
frame = pd.DataFrame({'a':[0, 1, 0, 1], 'b':[8, 5, 4, 9]})
frame

Unnamed: 0,a,b
0,0,8
1,1,5
2,0,4
3,1,9


In [68]:
frame.sort_values(by = 'b')

Unnamed: 0,a,b
2,0,4
1,1,5
0,0,8
3,1,9


In [69]:
# sort by several columns, first sort a, then sort b
frame.sort_values(by = ['a', 'b'])

Unnamed: 0,a,b
2,0,4
0,0,8
1,1,5
3,1,9


### rank() used to give ranks to object with valid data from the smallest to largest, for same values, it gives an average of the rank

In [70]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [71]:
obj.sort_values()

1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64

In [72]:
# The value represents their ranks in the original series
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [73]:
# we can also set the rank method as 'first see, first rank'
# not give the average rank to same values
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [74]:
# assign the max value in the rank group for same values
obj.rank(method = 'max')

0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [75]:
# descending order
obj.rank(ascending = False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [76]:
# For DataFrames, we can rank according to rows or columns
frame = pd.DataFrame({'b': [4.3, 7, -3, 2],
                      'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [77]:
# rank along the rows(列与列之间的排序，即每一行的排序)
frame.rank(axis = 'columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## 8 Axes index with duplicate labels

In [78]:
# all we have seen before, the object all has unique indexes, 
#  but it is not mandatory for some cases
obj = pd.Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])
obj


a    0
a    1
b    2
b    3
c    4
dtype: int64

In [79]:
# is_unique afflix tells us whether indexes unique or not
obj.index.is_unique

False

In [80]:
# selection is a little bit different, for duplicate labels, it returns a series
# for unique labels, it returns a scalar
obj['a']

a    0
a    1
dtype: int64

In [81]:
obj['c']

4

In [84]:
# same for DataFrames
df = pd.DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'c'])
df

Unnamed: 0,0,1,2
a,-1.093984,0.543395,0.509106
a,1.180933,-1.52405,0.302432
b,0.220363,-2.797005,0.238232
c,1.234888,0.78224,-0.78271


In [85]:
df.loc['a']

Unnamed: 0,0,1,2
a,-1.093984,0.543395,0.509106
a,1.180933,-1.52405,0.302432


In [86]:
df.loc['c']

0    1.234888
1    0.782240
2   -0.782710
Name: c, dtype: float64