In [1]:
import pandas as pd
import numpy as np

In [2]:
# Synthetic sales data: 36 monthly rows (3 years * 12 months) for 3 products,
# i.e. 108 integer figures drawn from the half-open range [0, 100).

g = np.random.default_rng(0)  # fixed seed so the figures are reproducible
df = pd.DataFrame(
    g.integers(low=0, high=100, size=(36, 3)),
    columns=['A', 'B', 'C'],
)


In [3]:
# Label the 36 rows in chronological order: each of the three years is
# repeated twelve times, and the twelve month names cycle three times.
df['year'] = [year for year in (2018, 2019, 2020) for _ in range(12)]
df['month'] = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 3

In [6]:
# Preview the leading rows; 'year' and 'month' are still ordinary columns here.
df.head(n=15)

Unnamed: 0,A,B,C,year,month
0,85,63,51,2018,Jan
1,26,30,4,2018,Feb
2,7,1,17,2018,Mar
3,81,64,91,2018,Apr
4,50,60,97,2018,May
5,72,63,54,2018,Jun
6,55,93,27,2018,Jul
7,81,67,0,2018,Aug
8,39,85,55,2018,Sep
9,3,76,72,2018,Oct


In [7]:
# Promote the 'year' column to a single-level row index.
df = df.set_index(keys='year')

In [9]:
# Preview the leading rows now that 'year' is the index; 'month' is still a column.
df.head(n=15)

Unnamed: 0_level_0,A,B,C,month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,85,63,51,Jan
2018,26,30,4,Feb
2018,7,1,17,Mar
2018,81,64,91,Apr
2018,50,60,97,May
2018,72,63,54,Jun
2018,55,93,27,Jul
2018,81,67,0,Aug
2018,39,85,55,Sep
2018,3,76,72,Oct


But that wouldn’t give us any special access to the month data, which we would like to have as part of our index. We can create a multi-index by passing a list of columns to set_index:

In [11]:
# Undo the single-level index: 'year' goes back to being a regular column
# (drop=False is the default, written out to show the column is kept).
df = df.reset_index(drop=False)

In [12]:
# Confirm that 'year' is a column again and the rows carry a plain integer index.
df.head(n=5)

Unnamed: 0,year,A,B,C,month
0,2018,85,63,51,Jan
1,2018,26,30,4,Feb
2,2018,7,1,17,Mar
3,2018,81,64,91,Apr
4,2018,50,60,97,May


In [13]:
# Build a two-level index from a list of columns:
# 'year' becomes the outer level and 'month' the inner level.
df = df.set_index(keys=['year', 'month'])
df.head(n=15)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jan,85,63,51
2018,Feb,26,30,4
2018,Mar,7,1,17
2018,Apr,81,64,91
2018,May,50,60,97
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2018,Sep,39,85,55
2018,Oct,3,76,72


Remember that when creating a multi-index, we want the most general part to be on the outside and thus be mentioned first. If you create a multi-index with dates, you use year, month, and day, in that order.

With this in place, we can retrieve one or more parts of the data frame in a variety of different ways. For example, we can get the sales data for all products in 2018:

In [14]:
# A single label is matched against the outer index level (year);
# the rows that come back are indexed by month only.
df.loc[2018]

Unnamed: 0_level_0,A,B,C
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,85,63,51
Feb,26,30,4
Mar,7,1,17
Apr,81,64,91
May,50,60,97
Jun,72,63,54
Jul,55,93,27
Aug,81,67,0
Sep,39,85,55
Oct,3,76,72


We can get all sales data for just products A and B in 2018:

In [16]:
# Row selector: year 2018. Column selector: products A and B.
df.loc[2018, ['A', 'B']]

Unnamed: 0_level_0,A,B
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,85,63
Feb,26,30
Mar,7,1
Apr,81,64
May,50,60
Jun,72,63
Jul,55,93
Aug,81,67
Sep,39,85
Oct,3,76


We have a multi-index on this data frame, which means we can break the data down not just by year but also by month. For example, what did it look like for all three products in June 2018?

In [18]:
# A tuple key supplies one value per index level: (year, month).
# It matches exactly one row here, so the result is a Series of the three products.
df.loc[(2018, 'Jun')] # the outermost level and the inner level

A    72
B    63
C    54
Name: (2018, Jun), dtype: int64

We’re still invoking loc with square brackets. However, the first (and only) argument is a tuple (i.e., round parentheses). ``Tuples are typically used in a multi-index situation when we want to specify a specific combination of index levels and values.``

In [19]:
# Another example: the same (year, month) row key, restricted to columns A and B.
df.loc[(2018, 'Jun'), ['A', 'B']]

A    72
B    63
Name: (2018, Jun), dtype: int64

What if we want to see more than one year at a time? For example, let’s say we want all data for 2018 and 2019:

In [20]:
# A list of labels selects several outer-level (year) values at once: 2018 and 2019.
# Both index levels are kept in the result.
df.loc[[2018,2019]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jan,85,63,51
2018,Feb,26,30,4
2018,Mar,7,1,17
2018,Apr,81,64,91
2018,May,50,60,97
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2018,Sep,39,85,55
2018,Oct,3,76,72


And if we want all data for 2018 and 2019, but only products A and B?

In [21]:
# Rows for the years 2018 and 2019; columns limited to products A and B.
df.loc[[2018, 2019], ['A','B']]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,Jan,85,63
2018,Feb,26,30
2018,Mar,7,1
2018,Apr,81,64
2018,May,50,60
2018,Jun,72,63
2018,Jul,55,93
2018,Aug,81,67
2018,Sep,39,85
2018,Oct,3,76


What if we want to get all the data from June in both 2018 and 2019? It’s a little complicated:
- We use square brackets with loc.
- The first argument in the square brackets describes the rows we want (i.e., a row selector).
- We want all columns, so there isn’t a second argument to loc.
- We want to select multiple combinations from the multi-index, so we need a list.
- Each year-month combination is a separate tuple in the list.

In [22]:
# One (year, month) tuple per wanted row, collected in a list:
# June 2018 and June 2019.
df.loc[[(year, 'Jun') for year in (2018, 2019)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jun,72,63,54
2019,Jun,38,46,99


What if we want to look at all values from June, July, or August across all three years? We could, of course, do it manually:

In [25]:
# Spell out every (year, month) key by hand:
# 3 years x 3 months (Jun, Jul, Aug) = 9 tuples.
df.loc[
    [
        (2018, 'Jun'),
        (2018, 'Jul'),
        (2018, 'Aug'),
        (2019, 'Jun'),
        (2019, 'Jul'),
        (2019, 'Aug'),
        (2020, 'Jun'),
        (2020, 'Jul'),
        (2020, 'Aug'),
    ]
]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2019,Jun,38,46,99
2019,Jul,80,98,37
2019,Aug,68,95,65
2020,Jun,33,76,39
2020,Jul,32,89,26
2020,Aug,22,71,62


This works well, but it seems wordy. Is there another, shorter way? 

In [26]:
# Inside the row tuple, each position holds a list of values for one index level:
# three years for the outer level, three months for the inner level.
# The second argument to loc names the columns explicitly.
df.loc[([2018,2019,2020], ['Jun', 'Jul', 'Aug']), ['A', 'B', 'C']]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2019,Jun,38,46,99
2019,Jul,80,98,37
2019,Aug,68,95,65
2020,Jun,33,76,39
2020,Jul,32,89,26
2020,Aug,22,71,62


Although the second argument (our column selector) is generally optional when using loc, ``here it isn’t: we need to indicate which column, or columns, we want, along with the rows.``

In [27]:
# Same row selector as before; the columns are given as the label slice 'A':'B'
# (both A and B appear in the result).
df.loc[([2018, 2019, 2020], ['Jun','Jul', 'Aug']), 'A':'B']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,Jun,72,63
2018,Jul,55,93
2018,Aug,81,67
2019,Jun,38,46
2019,Jul,80,98
2019,Aug,68,95
2020,Jun,33,76
2020,Jul,32,89
2020,Aug,22,71


In [28]:
# Same row selector as before; a bare ':' as the column selector keeps every column.
df.loc[([2018, 2019, 2020], ['Jun', 'Jul', 'Aug']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2019,Jun,38,46,99
2019,Jul,80,98,37
2019,Aug,68,95,65
2020,Jun,33,76,39
2020,Jul,32,89,26
2020,Aug,22,71,62


Assuming the index is sorted, we can even select the years using a slice; here `slice(None)` selects every year:

In [30]:
# slice(None) in the outer position places no restriction on the year level,
# so all three years are returned; the months are still given as a list.
df.loc[(slice(None), ['Jun', 'Jul', 'Aug']), 'A':'B']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,Jun,72,63
2019,Jun,38,46
2020,Jun,33,76
2018,Jul,55,93
2019,Jul,80,98
2020,Jul,32,89
2018,Aug,81,67
2019,Aug,68,95
2020,Aug,22,71
