In [1]:
# Pandas Web User Guide Follow along
# MultiIndex / advanced indexing
# Source: https://pandas.pydata.org/docs/user_guide/advanced.html
# Created 11/28/20

%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# Which expressions in a cell get displayed is controlled by ast_node_interactivity; see:
# https://ipython.readthedocs.io/en/stable/config/options/terminal.html
# Options are: 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'
# Default is: 'last_expr'

from IPython.core.interactiveshell import InteractiveShell
# Note: "last_expr_or_assign" is the only ast_node_interactivity setting I could get to work.
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [2]:
def diag(*args):
    """Print quick diagnostics for each argument, then render it.

    A DataFrame gets its ``info()`` summary followed by a rich ``display``.
    Any other object gets a separator line, its type, its length (when it
    has one), an ``info()`` summary (when it offers one) and finally a rich
    ``display`` that falls back to a plain ``print``.

    Parameters
    ----------
    *args
        Objects to inspect; they are handled in the order given.

    Notes
    -----
    ``display`` is the name IPython injects into notebook sessions; it is
    not imported here.
    """
    for obj in args:
        if isinstance(obj, pd.DataFrame):
            # info() writes its report to stdout itself and returns None, so
            # wrapping it in print() would only add a stray "None" line.
            obj.info()
            display(obj)
            continue

        print("-" * 40)
        print(f'Type: {type(obj)}')

        # Every probe below is best-effort: an object that cannot answer is
        # skipped. Catching Exception (not a bare except) keeps Ctrl-C and
        # interpreter shutdown working while a probe is running.
        try:
            print(f'Length: {len(obj)}')
        except Exception:
            pass

        try:
            obj.info()
        except Exception:
            pass

        try:
            display(obj)
        except Exception:
            # No rich display available for this object/session.
            print(obj)
                
z = diag
d = display

<function IPython.core.display.display(*objs, include=None, exclude=None, metadata=None, transient=None, display_id=None, **kwargs)>

In [3]:
def read_df(text):
    """Build a DataFrame from a whitespace-aligned table pasted as text.

    The first non-blank line supplies the column labels. On every following
    non-blank line, the first whitespace-separated token becomes the row
    label and the remaining tokens become that row's values. Values are kept
    as strings; no dtype conversion is attempted.

    Blank lines (for example the leading or trailing newline of a
    triple-quoted string) are ignored rather than raising ``IndexError``.

    Parameters
    ----------
    text : str
        The table text, one row per line.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the first token of each data line.
    """
    # Tokenise once, dropping lines that contain only whitespace.
    rows = [line.split() for line in text.splitlines() if line.strip()]
    cols = rows[0] if rows else []
    body = rows[1:]
    index = [vals[0] for vals in body]
    array = [vals[1:] for vals in body]
    return pd.DataFrame(array, index=index, columns=cols)

In [4]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [8]:
tuples = list(zip(arrays[0], arrays[1]))
tuples2 = list(zip(arrays[1], arrays[0]))

[('one', 'bar'),
 ('two', 'bar'),
 ('one', 'baz'),
 ('two', 'baz'),
 ('one', 'foo'),
 ('two', 'foo'),
 ('one', 'qux'),
 ('two', 'qux')]

In [9]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index2 = pd.MultiIndex.from_tuples(tuples2, names=['first', 'second'])

MultiIndex([('one', 'bar'),
            ('two', 'bar'),
            ('one', 'baz'),
            ('two', 'baz'),
            ('one', 'foo'),
            ('two', 'foo'),
            ('one', 'qux'),
            ('two', 'qux')],
           names=['first', 'second'])

In [7]:
s = pd.Series(np.random.randn(8), index=index)

first  second
bar    one       1.241012
       two       1.247764
baz    one       0.178791
       two      -0.230680
foo    one      -1.288829
       two       2.018137
qux    one       0.767679
       two       0.789902
dtype: float64

In [10]:
s2 = pd.Series(np.random.randn(8), index=index2)

first  second
one    bar      -0.233188
two    bar       0.145134
one    baz       0.043937
two    baz      -1.425380
one    foo      -2.308013
two    foo      -0.511141
one    qux       1.318730
two    qux       0.002822
dtype: float64

In [11]:
z(index, index2)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 8


MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 8


MultiIndex([('one', 'bar'),
            ('two', 'bar'),
            ('one', 'baz'),
            ('two', 'baz'),
            ('one', 'foo'),
            ('two', 'foo'),
            ('one', 'qux'),
            ('two', 'qux')],
           names=['first', 'second'])

In [14]:
index3 = index2.sortlevel()[0]

MultiIndex([('one', 'bar'),
            ('one', 'baz'),
            ('one', 'foo'),
            ('one', 'qux'),
            ('two', 'bar'),
            ('two', 'baz'),
            ('two', 'foo'),
            ('two', 'qux')],
           names=['first', 'second'])

In [15]:
s3 = pd.Series(np.random.randn(8), index=index3)

first  second
one    bar       0.216424
       baz      -0.357371
       foo       0.434301
       qux      -0.914742
two    bar       1.942854
       baz       0.692249
       foo       0.042710
       qux      -0.814046
dtype: float64

In [16]:
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
          np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]


[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
       dtype='<U3'),
 array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
       dtype='<U3')]

In [17]:
s = pd.Series(np.random.randn(8), index=arrays)

bar  one   -1.661993
     two    0.359927
baz  one   -0.144447
     two   -0.179669
foo  one   -0.462734
     two    0.732535
qux  one   -0.518339
     two    0.519349
dtype: float64

In [18]:
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.115379,-0.95459,0.260215,-1.455257
bar,two,2.118252,0.933411,0.125037,0.332224
baz,one,-1.362134,0.722191,-0.725573,-0.705449
baz,two,-1.977232,-1.657447,-0.524156,1.568251
foo,one,0.670188,1.403562,0.848911,0.33665
foo,two,-1.397599,0.806706,-0.543242,2.144464
qux,one,-0.508533,1.405071,1.494578,-1.373326
qux,two,-0.470918,-0.860176,-0.431508,-0.985807


In [19]:
df.index.names

FrozenList([None, None])

In [20]:
# Name the object explicitly instead of relying on IPython's `_` output cache,
# which only holds the right value when the previous cell was just run.
type(df.index.names)

pandas.core.indexes.frozen.FrozenList

In [21]:
df = pd.DataFrame(np.random.randn(3, 8), 
                  index=['A', 'B', 'C'], columns=index)


first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153,0.708879,-0.323803,0.368654,0.811713
B,-0.810641,0.367707,0.492949,0.288856,0.803745,0.490196,0.421637,0.76648
C,-0.072366,-0.24933,-3.506486,-0.072703,0.478583,1.650921,0.587227,0.140359


In [22]:
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,1.827531,-0.485999,1.552349,-0.948817,1.077188,-0.996762
bar,two,1.065523,-0.046297,0.6245,-0.291,1.28507,-1.527565
baz,one,0.345756,-0.660479,-1.645245,0.298978,0.045767,-1.833355
baz,two,-0.573586,1.662789,-1.770571,1.854419,-0.449356,-0.540327
foo,one,-0.019874,-0.409977,1.259413,-1.384061,1.505377,-0.719011
foo,two,0.747988,-1.346936,-0.42689,-0.696416,-0.386669,0.152563


In [23]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [24]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [25]:
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [26]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [27]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153,0.708879,-0.323803,0.368654,0.811713
B,-0.810641,0.367707,0.492949,0.288856,0.803745,0.490196,0.421637,0.76648
C,-0.072366,-0.24933,-3.506486,-0.072703,0.478583,1.650921,0.587227,0.140359


In [28]:
df['bar']

second,one,two
A,0.023382,0.57712
B,-0.810641,0.367707
C,-0.072366,-0.24933


In [29]:
df['bar', 'one']

A    0.023382
B   -0.810641
C   -0.072366
Name: (bar, one), dtype: float64

In [34]:
df[['bar', 'baz']]

first,bar,bar,baz,baz
second,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153
B,-0.810641,0.367707,0.492949,0.288856
C,-0.072366,-0.24933,-3.506486,-0.072703


In [38]:
df.loc[:, ['bar', 'baz']]

first,bar,bar,baz,baz
second,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153
B,-0.810641,0.367707,0.492949,0.288856
C,-0.072366,-0.24933,-3.506486,-0.072703


In [39]:
df['bar']['one']

A    0.023382
B   -0.810641
C   -0.072366
Name: one, dtype: float64

In [40]:
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [41]:
df[['foo','qux']]

first,foo,foo,qux,qux
second,one,two,one,two
A,0.708879,-0.323803,0.368654,0.811713
B,0.803745,0.490196,0.421637,0.76648
C,0.478583,1.650921,0.587227,0.140359


In [42]:
df[['foo','qux']].columns

MultiIndex([('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [43]:
df[['foo','qux']].columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [47]:
df[[('foo', 'one'),('qux', 'two')]]

first,foo,qux
second,one,two
A,0.708879,0.811713
B,0.803745,0.76648
C,0.478583,0.140359


In [49]:
df.loc[:, [('foo', 'one'),('qux', 'two')]]

first,foo,qux
second,one,two
A,0.708879,0.811713
B,0.803745,0.76648
C,0.478583,0.140359


In [50]:
df2 = pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,0.301773,-0.62567,-0.534902,-1.490927,0.203442,0.083134
bar,two,-1.061645,0.155512,0.653107,-0.21843,0.956777,-0.380241
baz,one,1.44701,1.608197,0.363882,-0.735997,2.582787,1.996678
baz,two,-2.623456,1.035516,0.746133,0.925367,-0.40035,-1.554836
foo,one,-1.579503,-1.788064,-0.613173,-0.33603,0.321949,1.207348
foo,two,0.32194,-0.256062,-0.601433,-0.966037,0.512884,-1.878031


In [57]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153,0.708879,-0.323803,0.368654,0.811713
B,-0.810641,0.367707,0.492949,0.288856,0.803745,0.490196,0.421637,0.76648
C,-0.072366,-0.24933,-3.506486,-0.072703,0.478583,1.650921,0.587227,0.140359


In [58]:
df[['foo', 'qux']].columns.to_numpy()

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [59]:
df[['foo', 'qux']].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [60]:
new_mi = df[['foo', 'qux']].columns.remove_unused_levels()

MultiIndex([('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [61]:
s

bar  one   -1.661993
     two    0.359927
baz  one   -0.144447
     two   -0.179669
foo  one   -0.462734
     two    0.732535
qux  one   -0.518339
     two    0.519349
dtype: float64

In [62]:
s[:-2]

bar  one   -1.661993
     two    0.359927
baz  one   -0.144447
     two   -0.179669
foo  one   -0.462734
     two    0.732535
dtype: float64

In [63]:
s + s[:-2]

bar  one   -3.323986
     two    0.719854
baz  one   -0.288894
     two   -0.359338
foo  one   -0.925469
     two    1.465070
qux  one         NaN
     two         NaN
dtype: float64

In [64]:
s + s[::2]

bar  one   -3.323986
     two         NaN
baz  one   -0.288894
     two         NaN
foo  one   -0.925469
     two         NaN
qux  one   -1.036679
     two         NaN
dtype: float64

In [65]:
index[:3]

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one')],
           names=['first', 'second'])

In [66]:
s.reindex(index[:3])

first  second
bar    one      -1.661993
       two       0.359927
baz    one      -0.144447
dtype: float64

In [67]:
df = df.T

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.023382,-0.810641,-0.072366
bar,two,0.57712,0.367707,-0.24933
baz,one,0.771476,0.492949,-3.506486
baz,two,0.085153,0.288856,-0.072703
foo,one,0.708879,0.803745,0.478583
foo,two,-0.323803,0.490196,1.650921
qux,one,0.368654,0.421637,0.587227
qux,two,0.811713,0.76648,0.140359


In [68]:
df.loc[('bar', 'two')]

A    0.577120
B    0.367707
C   -0.249330
Name: (bar, two), dtype: float64

In [69]:
df.loc['bar']

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.023382,-0.810641,-0.072366
two,0.57712,0.367707,-0.24933


In [73]:
s = pd.Series([1, 2, 3, 4, 5, 6],
       index=pd.MultiIndex.from_product(
           [["A", "B"], ["c", "d", "e"]]))
z(s.index, s)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 6


MultiIndex([('A', 'c'),
            ('A', 'd'),
            ('A', 'e'),
            ('B', 'c'),
            ('B', 'd'),
            ('B', 'e')],
           )

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

In [74]:
s.loc[[("A", "c"), ("B", "d")]]

A  c    1
B  d    5
dtype: int64

In [81]:
s.loc[(["A", "B"], ["c", "d", "e"])]

A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

In [78]:
s2 = pd.Series(range(12),
       index=pd.MultiIndex.from_product(
           [["A", "B", "C"], ["c", "d", "e", "f"]]))
z(s2.index, s2)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 12


MultiIndex([('A', 'c'),
            ('A', 'd'),
            ('A', 'e'),
            ('A', 'f'),
            ('B', 'c'),
            ('B', 'd'),
            ('B', 'e'),
            ('B', 'f'),
            ('C', 'c'),
            ('C', 'd'),
            ('C', 'e'),
            ('C', 'f')],
           )

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 12


A  c     0
   d     1
   e     2
   f     3
B  c     4
   d     5
   e     6
   f     7
C  c     8
   d     9
   e    10
   f    11
dtype: int64

In [85]:
s2.loc[[("A", "c"), ("B", "d")]]

A  c    0
B  d    5
dtype: int64

In [86]:
s2.loc[(["A", "C"], ["c", "d", "e"])]

A  c     0
   d     1
   e     2
C  c     8
   d     9
   e    10
dtype: int64

In [84]:
s2.loc[(["A", "C"], ["c", "d", "e"])]

A  c     0
   d     1
   e     2
C  c     8
   d     9
   e    10
dtype: int64

In [87]:
def mklbl(prefix, n):
    """Return ``n`` labels made of ``prefix`` followed by 0, 1, ..., n-1."""
    labels = []
    for number in range(n):
        labels.append(f"{prefix}{number}")
    return labels

In [89]:
miindex = pd.MultiIndex.from_product([mklbl('A', 4),
                                      mklbl('B', 2),
                                      mklbl('C', 4),
                                      mklbl('D', 2)])
z(miindex)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 64


MultiIndex([('A0', 'B0', 'C0', 'D0'),
            ('A0', 'B0', 'C0', 'D1'),
            ('A0', 'B0', 'C1', 'D0'),
            ('A0', 'B0', 'C1', 'D1'),
            ('A0', 'B0', 'C2', 'D0'),
            ('A0', 'B0', 'C2', 'D1'),
            ('A0', 'B0', 'C3', 'D0'),
            ('A0', 'B0', 'C3', 'D1'),
            ('A0', 'B1', 'C0', 'D0'),
            ('A0', 'B1', 'C0', 'D1'),
            ('A0', 'B1', 'C1', 'D0'),
            ('A0', 'B1', 'C1', 'D1'),
            ('A0', 'B1', 'C2', 'D0'),
            ('A0', 'B1', 'C2', 'D1'),
            ('A0', 'B1', 'C3', 'D0'),
            ('A0', 'B1', 'C3', 'D1'),
            ('A1', 'B0', 'C0', 'D0'),
            ('A1', 'B0', 'C0', 'D1'),
            ('A1', 'B0', 'C1', 'D0'),
            ('A1', 'B0', 'C1', 'D1'),
            ('A1', 'B0', 'C2', 'D0'),
            ('A1', 'B0', 'C2', 'D1'),
            ('A1', 'B0', 'C3', 'D0'),
            ('A1', 'B0', 'C3', 'D1'),
            ('A1', 'B1', 'C0', 'D0'),
            ('A1', 'B1', 'C0', 'D1'),
            

In [90]:
micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                       ('b', 'foo'), ('b', 'bah')],
                                      names=['lvl0', 'lvl1'])
z(micolumns)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 4


MultiIndex([('a', 'foo'),
            ('a', 'bar'),
            ('b', 'foo'),
            ('b', 'bah')],
           names=['lvl0', 'lvl1'])

In [92]:
dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns))
                      .reshape((len(miindex), len(micolumns))),
                    index=miindex,
                    columns=micolumns).sort_index().sort_index(axis=1)
z(dfmi)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 64 entries, ('A0', 'B0', 'C0', 'D0') to ('A3', 'B1', 'C3', 'D1')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   (a, bar)  64 non-null     int32
 1   (a, foo)  64 non-null     int32
 2   (b, bah)  64 non-null     int32
 3   (b, foo)  64 non-null     int32
dtypes: int32(4)
memory usage: 1.6+ KB
None


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [93]:
dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [108]:
dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']),  slice(None)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [122]:
# pd.IndexSlice builds the tuple-of-slices selectors that .loc expects for a
# MultiIndex; define it here so the cell does not depend on a missing cell.
idx = pd.IndexSlice
a = idx[:, :, ['C1', 'C3']]
b = idx[:, 'foo']
z(a, a[0], b)

----------------------------------------
Type: <class 'tuple'>
Length: 3


(slice(None, None, None), slice(None, None, None), ['C1', 'C3'])

----------------------------------------
Type: <class 'slice'>


slice(None, None, None)

----------------------------------------
Type: <class 'tuple'>
Length: 2


(slice(None, None, None), 'foo')

In [124]:
z(dfmi.loc[a, b])

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 32 entries, ('A0', 'B0', 'C1', 'D0') to ('A3', 'B1', 'C3', 'D1')
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   (a, foo)  32 non-null     int32
 1   (b, foo)  32 non-null     int32
dtypes: int32(2)
memory usage: 752.0+ bytes
None


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [135]:
a = dfmi[('a', 'foo')]
b = a > 200
z(a, b)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 64


A0  B0  C0  D0      0
            D1      4
        C1  D0      8
            D1     12
        C2  D0     16
                 ... 
A3  B1  C1  D1    236
        C2  D0    240
            D1    244
        C3  D0    248
            D1    252
Name: (a, foo), Length: 64, dtype: int32

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 64


A0  B0  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
                  ...  
A3  B1  C1  D1     True
        C2  D0     True
            D1     True
        C3  D0     True
            D1     True
Name: (a, foo), Length: 64, dtype: bool

In [132]:
# Selecting with a one-element list yields a one-column DataFrame;
# squeeze() collapses it to the underlying Series.
b = dfmi[[('a', 'foo')]].squeeze()
z(b)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 64


A0  B0  C0  D0      0
            D1      4
        C1  D0      8
            D1     12
        C2  D0     16
                 ... 
A3  B1  C1  D1    236
        C2  D0    240
            D1    244
        C3  D0    248
            D1    252
Name: (a, foo), Length: 64, dtype: int32

In [133]:
all(a == b)

True

In [134]:
a == b

A0  B0  C0  D0    True
            D1    True
        C1  D0    True
            D1    True
        C2  D0    True
                  ... 
A3  B1  C1  D1    True
        C2  D0    True
            D1    True
        C3  D0    True
            D1    True
Name: (a, foo), Length: 64, dtype: bool

In [136]:
mask = dfmi[('a', 'foo')] > 200
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A3,B0,C1,D1,204,206
A3,B0,C3,D0,216,218
A3,B0,C3,D1,220,222
A3,B1,C1,D0,232,234
A3,B1,C1,D1,236,238
A3,B1,C3,D0,248,250
A3,B1,C3,D1,252,254


In [137]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.023382,-0.810641,-0.072366
bar,two,0.57712,0.367707,-0.24933
baz,one,0.771476,0.492949,-3.506486
baz,two,0.085153,0.288856,-0.072703
foo,one,0.708879,0.803745,0.478583
foo,two,-0.323803,0.490196,1.650921
qux,one,0.368654,0.421637,0.587227
qux,two,0.811713,0.76648,0.140359


In [138]:
df.xs('one', level='second')

Unnamed: 0_level_0,A,B,C
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.023382,-0.810641,-0.072366
baz,0.771476,0.492949,-3.506486
foo,0.708879,0.803745,0.478583
qux,0.368654,0.421637,0.587227


In [139]:
df.loc[(slice(None), 'one'), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.023382,-0.810641,-0.072366
baz,one,0.771476,0.492949,-3.506486
foo,one,0.708879,0.803745,0.478583
qux,one,0.368654,0.421637,0.587227


In [140]:
 df = df.T

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.023382,0.57712,0.771476,0.085153,0.708879,-0.323803,0.368654,0.811713
B,-0.810641,0.367707,0.492949,0.288856,0.803745,0.490196,0.421637,0.76648
C,-0.072366,-0.24933,-3.506486,-0.072703,0.478583,1.650921,0.587227,0.140359


In [141]:
df.xs('one', level='second', axis=1)

first,bar,baz,foo,qux
A,0.023382,0.771476,0.708879,0.368654
B,-0.810641,0.492949,0.803745,0.421637
C,-0.072366,-3.506486,0.478583,0.587227


In [142]:
df.loc[:, (slice(None), 'one')]

first,bar,baz,foo,qux
second,one,one,one,one
A,0.023382,0.771476,0.708879,0.368654
B,-0.810641,0.492949,0.803745,0.421637
C,-0.072366,-3.506486,0.478583,0.587227


In [143]:
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)

first,bar
second,one
A,0.023382
B,-0.810641
C,-0.072366


In [144]:
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0], [1, 0, 1, 0]])

MultiIndex([( 'one', 'y'),
            ( 'one', 'x'),
            ('zero', 'y'),
            ('zero', 'x')],
           )

In [145]:
df = pd.DataFrame(np.random.randn(4, 2), index=midx)

Unnamed: 0,Unnamed: 1,0,1
one,y,-1.16293,0.506697
one,x,0.397774,-0.544504
zero,y,-0.047016,-1.503108
zero,x,0.825291,1.199333


In [146]:
# The level= argument of DataFrame.mean was deprecated in pandas 1.3 and
# removed in 2.0; grouping on the index level is the supported spelling.
df2 = df.groupby(level=0).mean()

Unnamed: 0,0,1
one,-0.382578,-0.018903
zero,0.389137,-0.151887


In [147]:
mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['x', 'y'])

In [148]:
 mi.names

FrozenList(['x', 'y'])

In [149]:
 mi2 = mi.rename("new name", level=0)

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['new name', 'y'])

In [150]:
import random

In [151]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [152]:
random.shuffle(tuples)

In [153]:
tuples

[('foo', 'two'),
 ('qux', 'two'),
 ('foo', 'one'),
 ('qux', 'one'),
 ('baz', 'two'),
 ('bar', 'one'),
 ('baz', 'one'),
 ('bar', 'two')]

In [154]:
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))

foo  two    0.472688
qux  two   -0.010784
foo  one    0.211145
qux  one   -1.197655
baz  two   -1.093613
bar  one   -0.563297
baz  one   -0.320092
bar  two    1.049932
dtype: float64

In [155]:
s.sort_index()

bar  one   -0.563297
     two    1.049932
baz  one   -0.320092
     two   -1.093613
foo  one    0.211145
     two    0.472688
qux  one   -1.197655
     two   -0.010784
dtype: float64

In [156]:
s.sort_index(level=0)

bar  one   -0.563297
     two    1.049932
baz  one   -0.320092
     two   -1.093613
foo  one    0.211145
     two    0.472688
qux  one   -1.197655
     two   -0.010784
dtype: float64

In [160]:
s.sort_index(level=[1,0], ascending=[False, True])

bar  two    1.049932
baz  two   -1.093613
foo  two    0.472688
qux  two   -0.010784
bar  one   -0.563297
baz  one   -0.320092
foo  one    0.211145
qux  one   -1.197655
dtype: float64

In [161]:
s.index.set_names(['L1', 'L2'], inplace=True)

In [162]:
s

L1   L2 
foo  two    0.472688
qux  two   -0.010784
foo  one    0.211145
qux  one   -1.197655
baz  two   -1.093613
bar  one   -0.563297
baz  one   -0.320092
bar  two    1.049932
dtype: float64

In [163]:
s.sort_index(level='L1')

L1   L2 
bar  one   -0.563297
     two    1.049932
baz  one   -0.320092
     two   -1.093613
foo  one    0.211145
     two    0.472688
qux  one   -1.197655
     two   -0.010784
dtype: float64

In [165]:
a = df.T

Unnamed: 0_level_0,one,one,zero,zero
Unnamed: 0_level_1,y,x,y,x
0,-1.16293,0.397774,-0.047016,0.825291
1,0.506697,-0.544504,-1.503108,1.199333


In [166]:
a.sort_index(level=1, axis=1)

Unnamed: 0_level_0,one,zero,one,zero
Unnamed: 0_level_1,x,x,y,y
0,0.397774,0.825291,-1.16293,-0.047016
1,-0.544504,1.199333,0.506697,-1.503108


In [167]:
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
                    'joe': ['x', 'x', 'z', 'y'],
                    'jolie': np.random.rand(4)})

Unnamed: 0,jim,joe,jolie
0,0,x,0.244042
1,0,x,0.365851
2,1,z,0.648331
3,1,y,0.399936


In [168]:
dfm = dfm.set_index(['jim', 'joe'])

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.244042
0,x,0.365851
1,z,0.648331
1,y,0.399936


In [169]:
dfm.loc[(1, 'z')]

  dfm.loc[(1, 'z')]


Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,z,0.648331


In [170]:
# MultiIndex.is_lexsorted() was deprecated in pandas 1.3 and later removed;
# is_monotonic_increasing is the supported sortedness check.
dfm.index.is_monotonic_increasing

False

In [171]:
dfm.index.lexsort_depth

1

In [172]:
dfm = dfm.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.244042
0,x,0.365851
1,y,0.399936
1,z,0.648331


In [173]:
# MultiIndex.is_lexsorted() was deprecated in pandas 1.3 and later removed;
# is_monotonic_increasing is the supported sortedness check.
dfm.index.is_monotonic_increasing

True

In [174]:
dfm.index.lexsort_depth

2

In [175]:
dfm.loc[(0, 'y'):(1, 'z')]

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,y,0.399936
1,z,0.648331


In [176]:
index = pd.Index(np.random.randint(0, 1000, 10))

Int64Index([934, 811, 766, 753, 535, 417, 86, 656, 940, 801], dtype='int64')

In [177]:
positions = [0, 9, 3]

[0, 9, 3]

In [178]:
index[positions]

Int64Index([934, 801, 753], dtype='int64')

In [179]:
index.take(positions)

Int64Index([934, 801, 753], dtype='int64')

In [180]:
frm = pd.DataFrame(np.random.randn(5, 3))

Unnamed: 0,0,1,2
0,-0.950708,-0.314415,-0.140119
1,-0.038139,-0.590419,2.342
2,-1.476272,-0.299201,-0.665961
3,1.112365,1.331336,-0.980658
4,1.168855,-0.229925,-0.141804


In [181]:
frm.take([1, 4, 3])

Unnamed: 0,0,1,2
1,-0.038139,-0.590419,2.342
4,1.168855,-0.229925,-0.141804
3,1.112365,1.331336,-0.980658


In [182]:
frm.take([0, 2], axis=1)

Unnamed: 0,0,2
0,-0.950708,-0.140119
1,-0.038139,2.342
2,-1.476272,-0.665961
3,1.112365,-0.980658
4,1.168855,-0.141804


In [183]:
arr = np.random.randn(10)

array([ 0.82566232,  0.80699007,  0.66652483,  0.18614892, -0.03970134,
       -0.03824737, -0.07984457, -0.09438214, -1.63450426, -0.18912748])

In [185]:
arr.take([False, False, True, True])

array([0.82566232, 0.82566232, 0.80699007, 0.80699007])

In [189]:
def print_header(str, char='*'):
    """Print a title with an underline, followed by a blank line.

    Parameters
    ----------
    str:
        Text to print as the title line.
    char:
        Character repeated once per title character to form the underline.

    Examples
    --------
    print_header("More Human Than Human")
    print_header("Do You Like Our Owl?", "~")

    prints:

    More Human Than Human
    *********************

    Do You Like Our Owl?
    ~~~~~~~~~~~~~~~~~~~~

    """
    underline = char * len(str)
    # The empty third item produces the blank separator line after the underline.
    print(str, underline, "", sep="\n")

In [190]:
print_header("More Human Than Human")
print_header("Do You Like Our Owl?", "~")

More Human Than Human
*********************

Do You Like Our Owl?
~~~~~~~~~~~~~~~~~~~~



In [191]:
from pandas.api.types import CategoricalDtype

In [192]:
df = pd.DataFrame({'A': np.arange(6),
                   'B': list('aabbca')})

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [193]:
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
z(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A       6 non-null      int32   
 1   B       6 non-null      category
dtypes: category(1), int32(1)
memory usage: 154.0 bytes
None


Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [194]:
df['B'].cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x154F0520>

In [195]:
df['B'].cat.categories

Index(['c', 'a', 'b'], dtype='object')

In [197]:
df2 = df.set_index('B')

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [198]:
df2.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [199]:
df2.loc['a']

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
a,5


In [200]:
df2.loc['a']

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
a,5


In [201]:
df2.loc['a'].index

CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [202]:
df2.groupby(level=0).sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,6
b,5


In [203]:
df2

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [204]:
df2.groupby(level=0).sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,6
b,5


In [206]:
df3 = pd.DataFrame({'A': np.arange(3),
                    'B': pd.Series(list('abc')).astype('category')})
z(df3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A       3 non-null      int32   
 1   B       3 non-null      category
dtypes: category(1), int32(1)
memory usage: 139.0 bytes
None


Unnamed: 0,A,B
0,0,a
1,1,b
2,2,c


In [208]:
vars(df3['B'].cat)

{'_parent': ['a', 'b', 'c']
 Categories (3, object): ['a', 'b', 'c'],
 '_index': RangeIndex(start=0, stop=3, step=1),
 '_name': 'B',
 '__frozen': True}

In [209]:
df3 = df3.set_index('B')

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
b,1
c,2


In [210]:
z(df3)

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 3 entries, a to c
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int32
dtypes: int32(1)
memory usage: 75.0 bytes
None


Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
b,1
c,2


In [211]:
df3.reindex(['a', 'e'])

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
e,


In [212]:
indexf = pd.Index([1.5, 2, 3, 4.5, 5])
z(indexf)

----------------------------------------
Type: <class 'pandas.core.indexes.numeric.Float64Index'>
Length: 5


Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')

In [213]:
sf = pd.Series(range(5), index=indexf)

1.5    0
2.0    1
3.0    2
4.5    3
5.0    4
dtype: int64

In [214]:
dfir = pd.concat([pd.DataFrame(np.random.randn(5, 2),
                               index=np.arange(5) * 250.0,
                               columns=list('AB')),
                  pd.DataFrame(np.random.randn(6, 2),
                               index=np.arange(4, 10) * 250.1,
                               columns=list('AB'))])

Unnamed: 0,A,B
0.0,0.367439,1.447807
250.0,0.333523,-0.740649
500.0,1.380795,0.023908
750.0,0.393868,-0.70053
1000.0,0.2527,0.155149
1000.4,-0.85176,0.356377
1250.5,0.817695,-1.154813
1500.6,-0.101661,-0.286456
1750.7,-1.200635,0.404662
2000.8,0.701354,-1.249979


In [215]:
df = pd.DataFrame({'A': [1, 2, 3, 4]},
                 index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))

Unnamed: 0,A
"(0, 1]",1
"(1, 2]",2
"(2, 3]",3
"(3, 4]",4


In [216]:
s = pd.Series(range(5))

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [218]:
# s[-1]

In [219]:
df = pd.DataFrame(np.random.randn(5, 4))

Unnamed: 0,0,1,2,3
0,0.726317,0.111251,-0.315819,0.431697
1,1.728001,0.923755,-0.565549,-1.614144
2,2.699573,-0.305452,2.302054,-0.222656
3,-1.264054,-1.793232,-0.085918,-1.034967
4,0.687106,-0.126255,-1.878987,-0.032691


In [220]:
df.loc[-2:]

Unnamed: 0,0,1,2,3
0,0.726317,0.111251,-0.315819,0.431697
1,1.728001,0.923755,-0.565549,-1.614144
2,2.699573,-0.305452,2.302054,-0.222656
3,-1.264054,-1.793232,-0.085918,-1.034967
4,0.687106,-0.126255,-1.878987,-0.032691


In [221]:
df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=['data'], data=list(range(5)))

Unnamed: 0,data
2,0
3,1
3,2
4,3
5,4


In [222]:
df.index.is_monotonic_increasing

True

In [223]:
df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5],
                  columns=['data'], data=list(range(6)))

Unnamed: 0,data
2,0
3,1
1,2
4,3
3,4
5,5


In [224]:
df.index.is_monotonic_increasing

False

In [227]:
df.loc[2:, :]

Unnamed: 0,data
2,0
3,1
1,2
4,3
3,4
5,5


In [228]:
series1 = pd.Series([1, 2, 3])
z(series1)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


0    1
1    2
2    3
dtype: int64

In [229]:
res = series1.reindex([0, 4])

0    1.0
4    NaN
dtype: float64

In [231]:
series2 = pd.Series([True])
z(series2)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 1


0    True
dtype: bool

In [232]:
res = series2.reindex_like(series1)
z(res)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


0    True
1     NaN
2     NaN
dtype: object