In [None]:
# Pandas Summary of Key Skills & Notes

# Based on User Guide at:
# https://pandas.pydata.org/docs/user_guide/index.html

# Created 12/05/20

In [None]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import datetime

# How interactive you want is discussed:
# https://ipython.readthedocs.io/en/stable/config/options/terminal.html
# Options are: 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'
# Default is: 'last_expr'

from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity I can only get last_expr_or_assign to work
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [2]:
def diag(*args):
    """Pandas diagnostics"""
    
    for i in args:
        
        if isinstance(i, pd.core.frame.DataFrame):
            print(i.info())
            display(i)
        else:
            print(f'{"-"*40}')
            print(f'Type: {type(i)}')

            try:
                print(f'Length: {len(i)}')
            except:
                pass

            try:
                print(i.info())
            except:
                pass

            try:
                display(i)
            except:
                print(i)
                
z = diag
d = display;

In [3]:
def read_df(text):
    """Create a pandas dataframe from a string of a dataframe
    copied from the pandas website tutorial."""
    lines = text.split('\n')
    cols = lines[0].split()
    index, array = [], []
    for line in lines[1:]:
        vals = line.split()
        index.append(vals[0])
        array.append(vals[1:])
#     print(cols)
#     print(index)
#     print(array)
    df = pd.DataFrame(array, index=index, columns=cols)
    return df

In [4]:
np.random.seed(123456)

In [6]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804
2013-01-04,0.721555,-0.706771,-1.039575,0.27186
2013-01-05,-0.424972,0.56702,0.276232,-1.087401
2013-01-06,-0.67369,0.113648,-1.478427,0.524988


In [None]:
# Selection by Getting
# it can work, but production code should use .loc, .iloc

In [7]:
# Single Column
df['A']

2013-01-01    0.469112
2013-01-02    1.212112
2013-01-03   -0.861849
2013-01-04    0.721555
2013-01-05   -0.424972
2013-01-06   -0.673690
Freq: D, Name: A, dtype: float64

In [8]:
# Multi Column
df[['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.469112,-0.282863
2013-01-02,1.212112,-0.173215
2013-01-03,-0.861849,-2.104569
2013-01-04,0.721555,-0.706771
2013-01-05,-0.424972,0.56702
2013-01-06,-0.67369,0.113648


In [10]:
# Slicing rows by index (exclusive on the last element)
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804


In [11]:
# Slicing rows by name (inclusive on the last element)
df['2013-01-01':'2013-01-03']

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804


In [None]:
# Selection by Label

In [21]:
# Exact Matches (end exclusive)
# All columns is implied, but can be specified

a = df.loc[dates[0:2]]
b = df.loc[dates[0:2], :]
c = (a==b)
d(a, c)

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236


Unnamed: 0,A,B,C,D
2013-01-01,True,True,True,True
2013-01-02,True,True,True,True


In [14]:
# Sliced matches (end inclusive)
df.loc['2013-01-01':'2013-01-03']

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804


In [24]:
# Selecting Columns
# df.loc['A'] - will cause error, you must specify rows + cols
df.loc[:, 'A':'B']

Unnamed: 0,A,B
2013-01-01,0.469112,-0.282863
2013-01-02,1.212112,-0.173215
2013-01-03,-0.861849,-2.104569
2013-01-04,0.721555,-0.706771
2013-01-05,-0.424972,0.56702
2013-01-06,-0.67369,0.113648


In [25]:
# 'at' for faster access to scalar
df.at[dates[0], 'A']

0.4691122999071863

In [26]:
%%timeit
df.loc[dates[0], 'A']

33 µs ± 318 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
%%timeit
df.at[dates[0], 'A']

28 µs ± 68 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
# Selection by position
df.iloc[3]

A    0.721555
B   -0.706771
C   -1.039575
D    0.271860
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.721555,-0.706771
2013-01-05,-0.424972,0.56702


In [30]:
# if you only have one list, it is assumed to be on the rows
df.iloc[[1, 2, 4]]

Unnamed: 0,A,B,C,D
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804
2013-01-05,-0.424972,0.56702,0.276232,-1.087401


In [31]:
# you must selecta rows explicitly when selecting columns
df.iloc[:, [1, 3]]

Unnamed: 0,B,D
2013-01-01,-0.282863,-1.135632
2013-01-02,-0.173215,-1.044236
2013-01-03,-2.104569,1.071804
2013-01-04,-0.706771,0.27186
2013-01-05,0.56702,-1.087401
2013-01-06,0.113648,0.524988


In [33]:
%%timeit
df.iloc[1, 1]  # iloc single value

32.5 µs ± 681 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [34]:
%%timeit
df.iat[1, 1]  # iat single value

29.3 µs ± 160 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [36]:
# Boolean indexing
a = df['A'] > 0

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04     True
2013-01-05    False
2013-01-06    False
Freq: D, Name: A, dtype: bool

In [37]:
df[a]

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-04,0.721555,-0.706771,-1.039575,0.27186


In [38]:
# isin to determine df values in a list
df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
                  index=['falcon', 'dog'])

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0


In [39]:
df.isin([0, 2])

Unnamed: 0,num_legs,num_wings
falcon,True,True
dog,False,True


In [43]:
# Setting - aligns by index
index1 = range(8)
index2 = index1[::-1]
df1 = pd.DataFrame(np.random.randint(0, 100, 8), index=index1)
s1 = pd.Series(index1, index=index2)
df2 = df1.copy()
df2['s1'] = s1
d(s1, df1, df2)

7    0
6    1
5    2
4    3
3    4
2    5
1    6
0    7
dtype: int64

Unnamed: 0,0
0,28
1,42
2,25
3,62
4,51
5,2
6,99
7,73


Unnamed: 0,0,s1
0,28,7
1,42,6
2,25,5
3,62,4
4,51,3
5,2,2
6,99,1
7,73,0


In [44]:
df

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0


In [54]:
# -----------------------------------
# Renaming index for rows or columns
# -----------------------------------

In [72]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2013-01-01,-1.247823,1.710348,-0.466489,-0.590065
2013-01-02,-0.245043,-0.550553,0.93529,-0.187715
2013-01-03,-0.767531,0.179984,0.073128,-0.154263
2013-01-04,-0.552058,1.52566,-1.600893,1.615303
2013-01-05,-0.566974,1.065327,-0.513375,-0.346705
2013-01-06,-0.550365,0.048707,-0.100712,1.185067


In [76]:
shape = df.shape

(6, 4)

In [86]:
# Set name of levels for index and column
df.index.name = 'row label'
z(df)

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, aa to ff
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   www     6 non-null      float64
 1   yyy     6 non-null      float64
 2   xxx     6 non-null      float64
 3   zzz     6 non-null      float64
dtypes: float64(4)
memory usage: 216.0+ bytes
None


Unnamed: 0_level_0,www,yyy,xxx,zzz
row label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa,-1.247823,1.710348,-0.466489,-0.590065
bb,-0.245043,-0.550553,0.93529,-0.187715
cc,-0.767531,0.179984,0.073128,-0.154263
dd,-0.552058,1.52566,-1.600893,1.615303
ee,-0.566974,1.065327,-0.513375,-0.346705
ff,-0.550365,0.048707,-0.100712,1.185067


In [87]:
a = df.index
z(a)

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 6


Index(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], dtype='object', name='row label')

In [90]:
df.columns.name = 'col_names'
df

col_names,www,yyy,xxx,zzz
row label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa,-1.247823,1.710348,-0.466489,-0.590065
bb,-0.245043,-0.550553,0.93529,-0.187715
cc,-0.767531,0.179984,0.073128,-0.154263
dd,-0.552058,1.52566,-1.600893,1.615303
ee,-0.566974,1.065327,-0.513375,-0.346705
ff,-0.550365,0.048707,-0.100712,1.185067


In [77]:
i_row = np.arange(shape[0], dtype=int);
i_col = np.arange(shape[1], dtype=int);
letters = list('abcdefghijklmnopqrstuvwyxz');

In [78]:
# Rename all row indices
ind = [i*2 for i in letters[:shape[0]]]
df.index = ind
df

Unnamed: 0,A,B,C,D
aa,-1.247823,1.710348,-0.466489,-0.590065
bb,-0.245043,-0.550553,0.93529,-0.187715
cc,-0.767531,0.179984,0.073128,-0.154263
dd,-0.552058,1.52566,-1.600893,1.615303
ee,-0.566974,1.065327,-0.513375,-0.346705
ff,-0.550365,0.048707,-0.100712,1.185067


In [82]:
# Rename all col indices
ind = [i*3 for i in letters[-shape[1]:]]
df.columns = ind
df

Unnamed: 0,www,yyy,xxx,zzz
aa,-1.247823,1.710348,-0.466489,-0.590065
bb,-0.245043,-0.550553,0.93529,-0.187715
cc,-0.767531,0.179984,0.073128,-0.154263
dd,-0.552058,1.52566,-1.600893,1.615303
ee,-0.566974,1.065327,-0.513375,-0.346705
ff,-0.550365,0.048707,-0.100712,1.185067


In [92]:
# rename select row indices
rename1 = {'aa': 'bob', 'bb': 'bebe'}
rename2 = {'yyy': 'yyz', 'zzz': 'zz top'}

{'yyy': 'yyz', 'zzz': 'zz top'}

In [93]:
df.rename(index=rename1, columns=rename2, inplace=True)
df

col_names,www,yyz,xxx,zz top
row label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bob,-1.247823,1.710348,-0.466489,-0.590065
bebe,-0.245043,-0.550553,0.93529,-0.187715
cc,-0.767531,0.179984,0.073128,-0.154263
dd,-0.552058,1.52566,-1.600893,1.615303
ee,-0.566974,1.065327,-0.513375,-0.346705
ff,-0.550365,0.048707,-0.100712,1.185067


In [47]:
# Reset row indexaa
df2 = df.reset_index()

Unnamed: 0,index,A,B,C,D
0,2013-01-01,-1.269741,1.053999,0.651981,0.280668
1,2013-01-02,0.642481,0.109001,-0.533294,0.243981
2,2013-01-03,-1.037831,-1.150016,-0.871461,-0.687693
3,2013-01-04,1.921056,-0.121113,-0.258742,-0.706329
4,2013-01-05,0.402547,1.171652,1.387952,0.009025
5,2013-01-06,0.613766,-0.031653,0.543156,-0.983195


In [48]:
# Reset col index
shape = df2.shape

(6, 5)

In [49]:
col_index = np.arange(shape[1], dtype=int)

array([0, 1, 2, 3, 4])

In [53]:
df2.columns = col_index
df2

Unnamed: 0,0,1,2,3,4
0,2013-01-01,-1.269741,1.053999,0.651981,0.280668
1,2013-01-02,0.642481,0.109001,-0.533294,0.243981
2,2013-01-03,-1.037831,-1.150016,-0.871461,-0.687693
3,2013-01-04,1.921056,-0.121113,-0.258742,-0.706329
4,2013-01-05,0.402547,1.171652,1.387952,0.009025
5,2013-01-06,0.613766,-0.031653,0.543156,-0.983195


In [95]:
# What happens when row labels and columns are the same?
df.index = letters[:shape[0]]
df.columns = letters[:shape[1]]
df

Unnamed: 0,a,b,c,d
a,-1.247823,1.710348,-0.466489,-0.590065
b,-0.245043,-0.550553,0.93529,-0.187715
c,-0.767531,0.179984,0.073128,-0.154263
d,-0.552058,1.52566,-1.600893,1.615303
e,-0.566974,1.065327,-0.513375,-0.346705
f,-0.550365,0.048707,-0.100712,1.185067


In [99]:
df['a']   # single value, takes a column

a   -1.247823
b   -0.245043
c   -0.767531
d   -0.552058
e   -0.566974
f   -0.550365
Name: a, dtype: float64

In [100]:
df[['a', 'b', 'c']] # lis of values value, takes columns

Unnamed: 0,a,b,c
a,-1.247823,1.710348,-0.466489
b,-0.245043,-0.550553,0.93529
c,-0.767531,0.179984,0.073128
d,-0.552058,1.52566,-1.600893
e,-0.566974,1.065327,-0.513375
f,-0.550365,0.048707,-0.100712


In [113]:
# Indexing Series
y = pd.Series([1, 10, 20, 30], index=[0, 1, 2, 3])

0     1
1    10
2    20
3    30
dtype: int64

In [115]:
y[[1, 2]]   # for a series, a list takes the row (since you don't have columns)

1    10
2    20
dtype: int64

In [124]:
d(df['a':'c'])   # slice, takes rows (not columns)
d(df['a':'a'])   # make slice one long for a single row

d(df[0:2])       # can used inter slices that excludes last member
d(df[4:])       # can used inter slices that excludes last member


Unnamed: 0,a,b,c,d
a,-1.247823,1.710348,-0.466489,-0.590065
b,-0.245043,-0.550553,0.93529,-0.187715
c,-0.767531,0.179984,0.073128,-0.154263


Unnamed: 0,a,b,c,d
a,-1.247823,1.710348,-0.466489,-0.590065


Unnamed: 0,a,b,c,d
a,-1.247823,1.710348,-0.466489,-0.590065
b,-0.245043,-0.550553,0.93529,-0.187715


Unnamed: 0,a,b,c,d
e,-0.566974,1.065327,-0.513375,-0.346705
f,-0.550365,0.048707,-0.100712,1.185067


In [109]:
# df[['a':'c']]   # error, can't have a slice in a list

In [110]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

Unnamed: 0,x,y
0,1,3
1,2,4
2,3,5


In [119]:
y = x.copy()
y.index = ['a', 'b', 'c']
d(y)

Unnamed: 0,x,y
a,1,3
b,a,b
c,3,5


In [122]:
y['a':'a']

Unnamed: 0,x,y
a,1,3


In [111]:
x.iloc[1] = ['a', 'b']

In [112]:
x

Unnamed: 0,x,y
0,1,3
1,a,b
2,3,5


In [125]:
df2

Unnamed: 0,0,1,2,3,4
0,2013-01-01,-1.269741,1.053999,0.651981,0.280668
1,2013-01-02,0.642481,0.109001,-0.533294,0.243981
2,2013-01-03,-1.037831,-1.150016,-0.871461,-0.687693
3,2013-01-04,1.921056,-0.121113,-0.258742,-0.706329
4,2013-01-05,0.402547,1.171652,1.387952,0.009025
5,2013-01-06,0.613766,-0.031653,0.543156,-0.983195


In [127]:
df3 = df2.copy()
df3.index = letters[:df3.shape[0]]
df3

Unnamed: 0,0,1,2,3,4
a,2013-01-01,-1.269741,1.053999,0.651981,0.280668
b,2013-01-02,0.642481,0.109001,-0.533294,0.243981
c,2013-01-03,-1.037831,-1.150016,-0.871461,-0.687693
d,2013-01-04,1.921056,-0.121113,-0.258742,-0.706329
e,2013-01-05,0.402547,1.171652,1.387952,0.009025
f,2013-01-06,0.613766,-0.031653,0.543156,-0.983195


In [128]:
df3['e':'z']

Unnamed: 0,0,1,2,3,4
e,2013-01-05,0.402547,1.171652,1.387952,0.009025
f,2013-01-06,0.613766,-0.031653,0.543156,-0.983195


In [129]:
df3['e':'a']

Unnamed: 0,0,1,2,3,4


In [130]:
df3['e':'a':-1]

Unnamed: 0,0,1,2,3,4
e,2013-01-05,0.402547,1.171652,1.387952,0.009025
d,2013-01-04,1.921056,-0.121113,-0.258742,-0.706329
c,2013-01-03,-1.037831,-1.150016,-0.871461,-0.687693
b,2013-01-02,0.642481,0.109001,-0.533294,0.243981
a,2013-01-01,-1.269741,1.053999,0.651981,0.280668


In [131]:
df1 = pd.DataFrame(np.random.randn(6, 4),
   ....:                    index=list('abcdef'),
   ....:                    columns=list('ABCD'))

Unnamed: 0,A,B,C,D
a,-0.552555,1.003447,-2.282879,-0.608409
b,1.674262,0.937132,1.210837,1.519617
c,1.216678,-0.182945,1.667972,0.814961
d,0.281521,0.080089,-1.698678,-0.196986
e,0.794232,-0.362068,-1.647257,-0.178221
f,-0.347161,0.899204,1.018543,0.917471


In [132]:
df1.loc['a']

A   -0.552555
B    1.003447
C   -2.282879
D   -0.608409
Name: a, dtype: float64

In [133]:
df1.loc['a'] > 0

A    False
B     True
C    False
D    False
Name: a, dtype: bool

In [134]:
df1

Unnamed: 0,A,B,C,D
a,-0.552555,1.003447,-2.282879,-0.608409
b,1.674262,0.937132,1.210837,1.519617
c,1.216678,-0.182945,1.667972,0.814961
d,0.281521,0.080089,-1.698678,-0.196986
e,0.794232,-0.362068,-1.647257,-0.178221
f,-0.347161,0.899204,1.018543,0.917471


In [135]:
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,B
a,1.003447
b,0.937132
c,-0.182945
d,0.080089
e,-0.362068
f,0.899204


In [136]:
 s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])

0    a
3    b
2    c
5    d
4    e
dtype: object

In [138]:
# s.loc[1:6] - gives a key error

In [139]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [140]:
df

Unnamed: 0,a,b,c,d
a,-1.247823,1.710348,-0.466489,-0.590065
b,-0.245043,-0.550553,0.93529,-0.187715
c,-0.767531,0.179984,0.073128,-0.154263
d,-0.552058,1.52566,-1.600893,1.615303
e,-0.566974,1.065327,-0.513375,-0.346705
f,-0.550365,0.048707,-0.100712,1.185067


In [141]:
df.sort_index(ascending=False)

Unnamed: 0,a,b,c,d
f,-0.550365,0.048707,-0.100712,1.185067
e,-0.566974,1.065327,-0.513375,-0.346705
d,-0.552058,1.52566,-1.600893,1.615303
c,-0.767531,0.179984,0.073128,-0.154263
b,-0.245043,-0.550553,0.93529,-0.187715
a,-1.247823,1.710348,-0.466489,-0.590065


In [142]:
df1 = pd.DataFrame(np.random.randn(6, 4),
   ....:                    index=list('abcdef'),
   ....:                    columns=list('ABCD'))

Unnamed: 0,A,B,C,D
a,-0.954274,1.539172,-1.11686,1.211037
b,1.479106,-1.077757,1.625449,0.817495
c,2.152066,-0.602909,-0.039819,-0.461883
d,-0.10598,-0.941095,-3.583488,-0.300827
e,-1.514301,-0.176802,0.301346,1.037847
f,0.229497,1.226664,1.467619,-0.602621


In [147]:
d(df1['A'] > 0)
d( (df1['A'] > 0) & (df1['B'] > 0))


a    False
b     True
c     True
d    False
e    False
f     True
Name: A, dtype: bool

a    False
b    False
c    False
d    False
e    False
f     True
dtype: bool

In [148]:
df1.loc[lambda df: (df['A'] > 0) & (df['B'] > 0)]

Unnamed: 0,A,B,C,D
f,0.229497,1.226664,1.467619,-0.602621


In [149]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.954274,1.539172
b,1.479106,-1.077757
c,2.152066,-0.602909
d,-0.10598,-0.941095
e,-1.514301,-0.176802
f,0.229497,1.226664


In [158]:
def df_func1(df):
    return ['A', 'B']

In [159]:
df1.loc[:, df_func1]

Unnamed: 0,A,B
a,-0.954274,1.539172
b,1.479106,-1.077757
c,2.152066,-0.602909
d,-0.10598,-0.941095
e,-1.514301,-0.176802
f,0.229497,1.226664


In [160]:
df1[lambda df: df.columns[0]]

a   -0.954274
b    1.479106
c    2.152066
d   -0.105980
e   -1.514301
f    0.229497
Name: A, dtype: float64

In [164]:
a = df1['A'] > 0
b = df1[a]
c = df1['A'].loc[a]
d(a, b, c)

a    False
b     True
c     True
d    False
e    False
f     True
Name: A, dtype: bool

Unnamed: 0,A,B,C,D
b,1.479106,-1.077757,1.625449,0.817495
c,2.152066,-0.602909,-0.039819,-0.461883
f,0.229497,1.226664,1.467619,-0.602621


b    1.479106
c    2.152066
f    0.229497
Name: A, dtype: float64

In [165]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [166]:
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [167]:
s_mi = pd.Series(np.arange(6),
   .....:                  index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [168]:
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
   .....:                    'ids2': ['a', 'n', 'c', 'n']})

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [169]:
values = ['a', 'b', 1, 3]

['a', 'b', 1, 3]

In [170]:
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [171]:
df.index.isin(values)

array([False,  True, False,  True])

In [172]:
 values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}

{'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}

In [173]:
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,True
3,False,False,False


In [174]:
df.isin(values).all(0)

vals    False
ids     False
ids2    False
dtype: bool

In [175]:
df.isin(values).all(1)

0     True
1    False
2    False
3    False
dtype: bool

In [176]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [177]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [178]:
df

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [179]:
np.random.seed(123456)
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-0.282863,-1.509059,-1.135632
2013-01-02,1.212112,-0.173215,0.119209,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,1.071804
2013-01-04,0.721555,-0.706771,-1.039575,0.27186
2013-01-05,-0.424972,0.56702,0.276232,-1.087401
2013-01-06,-0.67369,0.113648,-1.478427,0.524988


In [180]:
s2 = s.copy()

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [181]:
s2 < 0

4    False
3    False
2    False
1    False
0    False
dtype: bool

In [182]:
s2[s2 < 0] = 0

In [183]:
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [185]:
df[df < 0]

Unnamed: 0,A,B,C,D
2013-01-01,,-0.282863,-1.509059,-1.135632
2013-01-02,,-0.173215,,-1.044236
2013-01-03,-0.861849,-2.104569,-0.494929,
2013-01-04,,-0.706771,-1.039575,
2013-01-05,-0.424972,,,-1.087401
2013-01-06,-0.67369,,-1.478427,


In [186]:
df[df < 0] = -99

In [187]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,1.212112,-99.0,0.119209,-99.0
2013-01-03,-99.0,-99.0,-99.0,1.071804
2013-01-04,0.721555,-99.0,-99.0,0.27186
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [188]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,1.212112,-99.0,0.119209,-99.0
2013-01-03,-99.0,-99.0,-99.0,1.071804
2013-01-04,0.721555,-99.0,-99.0,0.27186
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [189]:
df.where(df<0)

Unnamed: 0,A,B,C,D
2013-01-01,,-99.0,-99.0,-99.0
2013-01-02,,-99.0,,-99.0
2013-01-03,-99.0,-99.0,-99.0,
2013-01-04,,-99.0,-99.0,
2013-01-05,-99.0,,,-99.0
2013-01-06,-99.0,,-99.0,


In [192]:
df[df < 0]

Unnamed: 0,A,B,C,D
2013-01-01,,-99.0,-99.0,-99.0
2013-01-02,,-99.0,,-99.0
2013-01-03,-99.0,-99.0,-99.0,
2013-01-04,,-99.0,-99.0,
2013-01-05,-99.0,,,-99.0
2013-01-06,-99.0,,-99.0,


In [193]:
df.where(df<0, 100)

Unnamed: 0,A,B,C,D
2013-01-01,100.0,-99.0,-99.0,-99.0
2013-01-02,100.0,-99.0,100.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,100.0
2013-01-04,100.0,-99.0,-99.0,100.0
2013-01-05,-99.0,100.0,100.0,-99.0
2013-01-06,-99.0,100.0,-99.0,100.0


In [194]:
d(df)
df[1:4] > 0

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,1.212112,-99.0,0.119209,-99.0
2013-01-03,-99.0,-99.0,-99.0,1.071804
2013-01-04,0.721555,-99.0,-99.0,0.27186
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


Unnamed: 0,A,B,C,D
2013-01-02,True,False,True,False
2013-01-03,False,False,False,True
2013-01-04,True,False,False,True


In [197]:
df[df[1:4] > 0] = 3
df

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,3.0,-99.0,3.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,-99.0,-99.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [198]:
df2 = df.copy()

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,3.0,-99.0,3.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,-99.0,-99.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [199]:
df2 > 0

Unnamed: 0,A,B,C,D
2013-01-01,True,False,False,False
2013-01-02,True,False,True,False
2013-01-03,False,False,False,True
2013-01-04,True,False,False,True
2013-01-05,False,True,True,False
2013-01-06,False,True,False,True


In [200]:
df2['A']

2013-01-01     0.469112
2013-01-02     3.000000
2013-01-03   -99.000000
2013-01-04     3.000000
2013-01-05   -99.000000
2013-01-06   -99.000000
Freq: D, Name: A, dtype: float64

In [201]:
df2.where(df2 > 0, df2['A'], axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,0.469112,0.469112,0.469112
2013-01-02,3.0,3.0,3.0,3.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,3.0,3.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [204]:
a = df2.where(df2>0)

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,,,
2013-01-02,3.0,,3.0,
2013-01-03,,,,3.0
2013-01-04,3.0,,,3.0
2013-01-05,,0.56702,0.276232,
2013-01-06,,0.113648,,0.524988


In [205]:
b = df2.mask(df2>0)

Unnamed: 0,A,B,C,D
2013-01-01,,-99.0,-99.0,-99.0
2013-01-02,,-99.0,,-99.0
2013-01-03,-99.0,-99.0,-99.0,
2013-01-04,,-99.0,-99.0,
2013-01-05,-99.0,,,-99.0
2013-01-06,-99.0,,-99.0,


In [206]:
a != b

Unnamed: 0,A,B,C,D
2013-01-01,True,True,True,True
2013-01-02,True,True,True,True
2013-01-03,True,True,True,True
2013-01-04,True,True,True,True
2013-01-05,True,True,True,True
2013-01-06,True,True,True,True


In [None]:
# Restart with the section
# The query() Method


In [207]:
# Selection by callable

In [217]:
df1 = pd.DataFrame(np.random.randn(6, 4),
   ....:                    index=list('abcdef'),
   ....:                    columns=list('ABCD'))

Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
d,-0.076467,-1.187678,1.130127,-1.436737
e,-1.413681,1.60792,1.02418,0.569605
f,0.875906,-2.211372,0.974466,-2.006747


In [218]:
def call_func1(df):
    z(df)
    return [0, 1]

In [219]:
df1.iloc[call_func1, :]

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, a to f
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 216.0+ bytes
None


Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
d,-0.076467,-1.187678,1.130127,-1.436737
e,-1.413681,1.60792,1.02418,0.569605
f,0.875906,-2.211372,0.974466,-2.006747


Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169


In [230]:
def call_func2(df):
    b = df['A'] > 0
    c = b.to_numpy()
    z(b, c)
    return b

In [232]:
df1.loc[call_func2, :]

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


a     True
b     True
c     True
d    False
e    False
f     True
Name: A, dtype: bool

----------------------------------------
Type: <class 'numpy.ndarray'>
Length: 6


array([ True,  True,  True, False, False,  True])

Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
f,0.875906,-2.211372,0.974466,-2.006747


In [233]:
def call_func3(df):
    return ['A', 'B']

In [234]:
df1.loc[:, call_func3]

Unnamed: 0,A,B
a,0.895717,0.805244
b,1.431256,1.340309
c,0.410835,0.81385
d,-0.076467,-1.187678
e,-1.413681,1.60792
f,0.875906,-2.211372


In [235]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,0.895717,0.805244
b,1.431256,1.340309
c,0.410835,0.81385
d,-0.076467,-1.187678
e,-1.413681,1.60792
f,0.875906,-2.211372


In [237]:
df1.loc[lambda df: df['A']>0, 'B':'D']

Unnamed: 0,B,C,D
a,0.805244,-1.206412,2.565646
b,1.340309,-1.170299,-0.226169
c,0.81385,0.132003,-0.827317
f,-2.211372,0.974466,-2.006747


In [238]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [239]:
z(df1['A'])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


a    0.895717
b    1.431256
c    0.410835
d   -0.076467
e   -1.413681
f    0.875906
Name: A, dtype: float64

In [240]:
s>0

4    False
3     True
2     True
1     True
0     True
dtype: bool

In [241]:
df1['A'].loc[lambda s: s > 0]

a    0.895717
b    1.431256
c    0.410835
f    0.875906
Name: A, dtype: float64

In [243]:
df1['A'].loc[[True, True, False]]

IndexError: Boolean index has wrong length: 3 instead of 6

In [245]:
z(df1['A'])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


a    0.895717
b    1.431256
c    0.410835
d   -0.076467
e   -1.413681
f    0.875906
Name: A, dtype: float64

In [246]:
z(s)
z(s>0)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 5


4    0
3    1
2    2
1    3
0    4
dtype: int64

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 5


4    False
3     True
2     True
1     True
0     True
dtype: bool

In [247]:
z(df1['A'].loc[lambda s: s > 0])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


a    0.895717
b    1.431256
c    0.410835
f    0.875906
Name: A, dtype: float64

In [248]:
def func4(s):
    a = s>0
    z(s,a)
    return a

In [249]:
z(df1['A'].loc[func4])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


a    0.895717
b    1.431256
c    0.410835
d   -0.076467
e   -1.413681
f    0.875906
Name: A, dtype: float64

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 6


a     True
b     True
c     True
d    False
e    False
f     True
Name: A, dtype: bool

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


a    0.895717
b    1.431256
c    0.410835
f    0.875906
Name: A, dtype: float64

In [250]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [251]:
df1

Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
d,-0.076467,-1.187678,1.130127,-1.436737
e,-1.413681,1.60792,1.02418,0.569605
f,0.875906,-2.211372,0.974466,-2.006747


In [252]:
df1.sample(n=3)

Unnamed: 0,A,B,C,D
e,-1.413681,1.60792,1.02418,0.569605
c,0.410835,0.81385,0.132003,-0.827317
f,0.875906,-2.211372,0.974466,-2.006747


In [253]:
df1.sample(n=3, axis=1)

Unnamed: 0,D,C,B
a,2.565646,-1.206412,0.805244
b,-0.226169,-1.170299,1.340309
c,-0.827317,0.132003,0.81385
d,-1.436737,1.130127,-1.187678
e,0.569605,1.02418,1.60792
f,-2.006747,0.974466,-2.211372


In [258]:
df1.sample(frac=0.3)

Unnamed: 0,A,B,C,D
b,1.431256,1.340309,-1.170299,-0.226169
e,-1.413681,1.60792,1.02418,0.569605


In [256]:
z(df1)

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, a to f
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 312.0+ bytes
None


Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
d,-0.076467,-1.187678,1.130127,-1.436737
e,-1.413681,1.60792,1.02418,0.569605
f,0.875906,-2.211372,0.974466,-2.006747


In [259]:
dfi = pd.DataFrame(np.arange(6).reshape(3, 2),
   .....:                    columns=['A', 'B'])

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [260]:
dfi.loc[:, 'C'] = dfi.loc[:, 'A']

In [261]:
z(dfi)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int32
 1   B       3 non-null      int32
 2   C       3 non-null      int32
dtypes: int32(3)
memory usage: 100.0 bytes
None


Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [262]:
s = pd.Series(range(-3, 4))

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [263]:
# Where method

In [266]:
z(s)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 7


0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [267]:
z(s[s>0])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


4    1
5    2
6    3
dtype: int64

In [268]:
z(s.where(s > 0))

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 7


0    NaN
1    NaN
2    NaN
3    NaN
4    1.0
5    2.0
6    3.0
dtype: float64

In [269]:
s = pd.Series(range(5))

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [278]:
s1 = s - 10
s2 = s - 100
s2.index = s2.index[::-1]
z(s1, s2)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 5


0   -10
1    -9
2    -8
3    -7
4    -6
dtype: int64

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 5


4   -100
3    -99
2    -98
1    -97
0    -96
dtype: int64

In [273]:
s.where(s>2, s1)

0   -10
1    -9
2    -8
3     3
4     4
dtype: int64

In [279]:
s.where(s>2, s2)

0   -96
1   -97
2   -98
3     3
4     4
dtype: int64

In [276]:
s.where(s>2, s1[:2])

0   -10.0
1    -9.0
2     NaN
3     3.0
4     4.0
dtype: float64

In [275]:
s1[:-1]

0   -10
1    -9
2    -8
3    -7
dtype: int64

In [280]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,3.0,-99.0,3.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,-99.0,-99.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [281]:
df < 0

Unnamed: 0,A,B,C,D
2013-01-01,False,True,True,True
2013-01-02,False,True,False,True
2013-01-03,True,True,True,False
2013-01-04,False,True,True,False
2013-01-05,True,False,False,True
2013-01-06,True,False,True,False


In [282]:
df[df < 0]

Unnamed: 0,A,B,C,D
2013-01-01,,-99.0,-99.0,-99.0
2013-01-02,,-99.0,,-99.0
2013-01-03,-99.0,-99.0,-99.0,
2013-01-04,,-99.0,-99.0,
2013-01-05,-99.0,,,-99.0
2013-01-06,-99.0,,-99.0,


In [283]:
df.where(df < 0)

Unnamed: 0,A,B,C,D
2013-01-01,,-99.0,-99.0,-99.0
2013-01-02,,-99.0,,-99.0
2013-01-03,-99.0,-99.0,-99.0,
2013-01-04,,-99.0,-99.0,
2013-01-05,-99.0,,,-99.0
2013-01-06,-99.0,,-99.0,


In [284]:
df.where(df < 0, df + 1000)

Unnamed: 0,A,B,C,D
2013-01-01,1000.469112,-99.0,-99.0,-99.0
2013-01-02,1003.0,-99.0,1003.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,1003.0
2013-01-04,1003.0,-99.0,-99.0,1003.0
2013-01-05,-99.0,1000.56702,1000.276232,-99.0
2013-01-06,-99.0,1000.113648,-99.0,1000.524988


In [285]:
df2 = df.copy()

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,-99.0,-99.0,-99.0
2013-01-02,3.0,-99.0,3.0,-99.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,-99.0,-99.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [286]:
df2 > 0

Unnamed: 0,A,B,C,D
2013-01-01,True,False,False,False
2013-01-02,True,False,True,False
2013-01-03,False,False,False,True
2013-01-04,True,False,False,True
2013-01-05,False,True,True,False
2013-01-06,False,True,False,True


In [287]:
df2['A']

2013-01-01     0.469112
2013-01-02     3.000000
2013-01-03   -99.000000
2013-01-04     3.000000
2013-01-05   -99.000000
2013-01-06   -99.000000
Freq: D, Name: A, dtype: float64

In [288]:
df2.where(df2 > 0)

Unnamed: 0,A,B,C,D
2013-01-01,0.469112,,,
2013-01-02,3.0,,3.0,
2013-01-03,,,,3.0
2013-01-04,3.0,,,3.0
2013-01-05,,0.56702,0.276232,
2013-01-06,,0.113648,,0.524988


In [294]:
a = df2['A']
b = df2 > 0
c = df2.where(b, a, axis='index')
d(a, b, c)

2013-01-01     0.469112
2013-01-02     3.000000
2013-01-03   -99.000000
2013-01-04     3.000000
2013-01-05   -99.000000
2013-01-06   -99.000000
Freq: D, Name: A, dtype: float64

Unnamed: 0,A,B,C,D
2013-01-01,True,False,False,False
2013-01-02,True,False,True,False
2013-01-03,False,False,False,True
2013-01-04,True,False,False,True
2013-01-05,False,True,True,False
2013-01-06,False,True,False,True


Unnamed: 0,A,B,C,D
2013-01-01,0.469112,0.469112,0.469112,0.469112
2013-01-02,3.0,3.0,3.0,3.0
2013-01-03,-99.0,-99.0,-99.0,3.0
2013-01-04,3.0,3.0,3.0,3.0
2013-01-05,-99.0,0.56702,0.276232,-99.0
2013-01-06,-99.0,0.113648,-99.0,0.524988


In [295]:
n = 10

10

In [296]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))

Unnamed: 0,a,b,c
0,0.23093,0.931657,0.245765
1,0.123548,0.540839,0.038543
2,0.167806,0.120245,0.017291
3,0.279273,0.849806,0.470272
4,0.815492,0.326272,0.468102
5,0.380314,0.59729,0.673816
6,0.652167,0.621247,0.449078
7,0.838656,0.592737,0.689804
8,0.633916,0.970727,0.472274
9,0.260907,0.328569,0.876051


In [297]:
%%timeit
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

596 µs ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [298]:
%%timeit
df.query('(a < b) & (b < c)')

1.74 ms ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [299]:
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))

Unnamed: 0,b,c
0,0,4
1,4,1
2,0,1
3,4,3
4,2,0
5,3,4
6,3,3
7,1,3
8,4,0
9,2,4


In [300]:
df.index.name = 'a'

In [301]:
df.query('a < b')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,1
3,4,3


In [302]:
df.query('a < b and b < c')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1


In [303]:
df1

Unnamed: 0,A,B,C,D
a,0.895717,0.805244,-1.206412,2.565646
b,1.431256,1.340309,-1.170299,-0.226169
c,0.410835,0.81385,0.132003,-0.827317
d,-0.076467,-1.187678,1.130127,-1.436737
e,-1.413681,1.60792,1.02418,0.569605
f,0.875906,-2.211372,0.974466,-2.006747


In [304]:
df1.get('A')

a    0.895717
b    1.431256
c    0.410835
d   -0.076467
e   -1.413681
f    0.875906
Name: A, dtype: float64

In [306]:
z(df1.get('a'))

----------------------------------------
Type: <class 'NoneType'>


None

In [307]:
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

a    1
b    2
c    3
dtype: int64

In [308]:
s.get('b')

2

In [309]:
s.get('x', default=2112)

2112

In [310]:
dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D'])

Unnamed: 0,A,B,C,D
0,0.598197,0.695247,0.943854,0.932091
1,0.461306,0.638118,0.027907,0.459301
2,0.102941,0.455286,0.49835,0.480349
3,0.46757,0.222243,0.858581,0.604116
4,0.832973,0.562296,0.398776,0.746519
5,0.60309,0.253452,0.497421,0.849689
6,0.12121,0.483519,0.468014,0.023033
7,0.763842,0.260521,0.404564,0.76127
8,0.570846,0.28326,0.842127,0.087686
9,0.726714,0.024263,0.906839,0.541954


In [311]:
dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D'])

array([0.69524686, 0.49835034, 0.83297346, 0.48351896, 0.08768553])

In [312]:
index = pd.Index(list(range(5)), name='rows')
columns = pd.Index(['A', 'B', 'C'], name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)

cols,A,B,C
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.846958,-1.222082,0.600705
1,-1.233203,0.669692,-0.605656
2,-1.169184,0.342416,0.868584
3,-0.948458,2.29778,-0.684718
4,-2.670153,-0.114722,0.168904


In [313]:
ind = pd.Index([1, 2, 3])

Int64Index([1, 2, 3], dtype='int64')

In [317]:
ind.rename("apple", inplace=True)

In [318]:
ind.name

'apple'

In [319]:
ind.name = 'orange'

In [320]:
ind

Int64Index([1, 2, 3], dtype='int64', name='orange')

In [321]:
idx = pd.MultiIndex.from_tuples(
    [
        (1, "one"),
        (1, "two"),
        (2, "one"),
        (2, "two"),
        (3, "one"),
        (3, "two")
    ],
    names=["foo", "bar"]
)

MultiIndex([(1, 'one'),
            (1, 'two'),
            (2, 'one'),
            (2, 'two'),
            (3, 'one'),
            (3, 'two')],
           names=['foo', 'bar'])

In [326]:
idx.set_levels([['a', 'b', 'd', 'c'], [5, 3, 1, 2]])

MultiIndex([('a', 5),
            ('a', 3),
            ('b', 5),
            ('b', 3),
            ('d', 5),
            ('d', 3)],
           names=['foo', 'bar'])

In [327]:
df3 = pd.DataFrame(np.random.rand(10, 4), columns = ['A', 'B', 'C', 'D'])

Unnamed: 0,A,B,C,D
0,0.727376,0.630865,0.076462,0.474453
1,0.438921,0.11868,0.86367,0.138138
2,0.577363,0.686602,0.595307,0.564592
3,0.52063,0.913052,0.926075,0.616184
4,0.078718,0.854477,0.898725,0.076404
5,0.523211,0.591538,0.792342,0.216974
6,0.564056,0.39789,0.454131,0.915716
7,0.074315,0.437913,0.019794,0.559209
8,0.502065,0.026437,0.687682,0.615158
9,0.646198,0.41296,0.170408,0.125086


In [329]:
ind3 = pd.Index(list(range(10))[::-1])

Int64Index([9, 8, 7, 6, 5, 4, 3, 2, 1, 0], dtype='int64')

In [330]:
df3.index = ind3

In [331]:
df3

Unnamed: 0,A,B,C,D
9,0.727376,0.630865,0.076462,0.474453
8,0.438921,0.11868,0.86367,0.138138
7,0.577363,0.686602,0.595307,0.564592
6,0.52063,0.913052,0.926075,0.616184
5,0.078718,0.854477,0.898725,0.076404
4,0.523211,0.591538,0.792342,0.216974
3,0.564056,0.39789,0.454131,0.915716
2,0.074315,0.437913,0.019794,0.559209
1,0.502065,0.026437,0.687682,0.615158
0,0.646198,0.41296,0.170408,0.125086


In [332]:
dfmi = pd.DataFrame([list('abcd'),
   .....:                      list('efgh'),
   .....:                      list('ijkl'),
   .....:                      list('mnop')],
   .....:                     columns=pd.MultiIndex.from_product([['one', 'two'],
   .....:                                                         ['first', 'second']]))

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


In [333]:
dfmi.loc[:, ('one', 'first'):('two', 'first')]

Unnamed: 0_level_0,one,one,two
Unnamed: 0_level_1,first,second,first
0,a,b,c
1,e,f,g
2,i,j,k
3,m,n,o


In [334]:
# MultiIndex / advanced indexing

In [335]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
   ...:           ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [336]:
def func(x, y):
    z(x,y)

In [338]:
func(*arrays)

----------------------------------------
Type: <class 'list'>
Length: 8


['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']

----------------------------------------
Type: <class 'list'>
Length: 8


['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']

In [339]:
tuples = list(zip(*arrays))

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [340]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [341]:
s = pd.Series(np.random.randn(8), index=index)

first  second
bar    one      -0.048048
       two      -0.540700
baz    one      -1.950211
       two       0.640559
foo    one      -0.253404
       two      -0.831915
qux    one      -1.745354
       two       0.709249
dtype: float64

In [342]:
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]

[['bar', 'baz', 'foo', 'qux'], ['one', 'two']]

In [343]:
pd.MultiIndex.from_product(iterables, names=['first', 'second'])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [344]:
df = pd.DataFrame([['bar', 'one'], ['bar', 'two'],
   ....:                    ['foo', 'one'], ['foo', 'two']],
   ....:                   columns=['first', 'second'])

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,foo,one
3,foo,two


In [345]:
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
   ....:           np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]

[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
       dtype='<U3'),
 array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
       dtype='<U3')]

In [346]:
s = pd.Series(np.random.randn(8), index=arrays)

bar  one   -0.237364
     two   -0.517707
baz  one    0.521899
     two    0.334682
foo  one    0.484535
     two    0.694028
qux  one    0.177154
     two    0.535700
dtype: float64

In [347]:
 df.index.names

FrozenList([None])

In [348]:
 df = pd.DataFrame(np.random.randn(8, 4), index=arrays)

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.506675,0.421335,-1.289076,-0.178069
bar,two,-0.271841,1.406993,-1.334905,-1.087664
baz,one,-0.883833,-1.554827,-0.118953,-1.460084
baz,two,-0.020351,-0.256125,0.358575,1.112033
foo,one,-0.200521,-0.508784,-0.327758,0.627056
foo,two,0.067058,-1.376511,1.16233,-0.48212
qux,one,-0.455309,-0.985682,1.383438,-0.887807
qux,two,-1.457645,-0.84304,0.723058,0.079959


In [349]:
 df.index.names

FrozenList([None, None])

In [350]:
 df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.930166,-1.133731,0.370222,-0.634823,1.316806,1.386141,0.957984,-0.643576
B,0.405946,-1.955935,0.236658,-1.247545,0.809163,1.283452,1.284907,-1.97624
C,-2.050342,0.153034,0.784836,-2.483927,-0.890199,-0.918028,2.830012,-0.617136


In [351]:
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,0.853363,-0.146959,-1.478586,-1.154548,0.0047,-0.17813
bar,two,0.112243,-0.044445,0.907446,-2.306805,-0.533714,-0.165288
baz,one,1.303514,0.347892,0.421296,2.580954,1.17007,0.149262
baz,two,0.543724,2.693229,0.391228,0.016323,0.284641,1.471671
foo,one,1.234235,-0.857753,-0.230924,1.0289,0.014837,-0.036479
foo,two,-0.422228,1.519969,-0.328178,0.166875,0.948111,0.875586


In [353]:
with pd.option_context('display.multi_sparse', False):
    d(df)

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.930166,-1.133731,0.370222,-0.634823,1.316806,1.386141,0.957984,-0.643576
B,0.405946,-1.955935,0.236658,-1.247545,0.809163,1.283452,1.284907,-1.97624
C,-2.050342,0.153034,0.784836,-2.483927,-0.890199,-0.918028,2.830012,-0.617136


In [354]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.930166,-1.133731,0.370222,-0.634823,1.316806,1.386141,0.957984,-0.643576
B,0.405946,-1.955935,0.236658,-1.247545,0.809163,1.283452,1.284907,-1.97624
C,-2.050342,0.153034,0.784836,-2.483927,-0.890199,-0.918028,2.830012,-0.617136


In [355]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [356]:
df.columns == index

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [357]:
df.columns is index

True

In [358]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [359]:
index.get_level_values('second')

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [None]:
# Restart at: Basic indexing on axis with MultiIndex

In [360]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.930166,-1.133731,0.370222,-0.634823,1.316806,1.386141,0.957984,-0.643576
B,0.405946,-1.955935,0.236658,-1.247545,0.809163,1.283452,1.284907,-1.97624
C,-2.050342,0.153034,0.784836,-2.483927,-0.890199,-0.918028,2.830012,-0.617136


In [361]:
z(df['bar'])

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, A to C
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     3 non-null      float64
dtypes: float64(2)
memory usage: 60.0+ bytes
None


second,one,two
A,0.930166,-1.133731
B,0.405946,-1.955935
C,-2.050342,0.153034


In [362]:
z(df['bar', 'one'])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


A    0.930166
B    0.405946
C   -2.050342
Name: (bar, one), dtype: float64

In [363]:
s

bar  one   -0.237364
     two   -0.517707
baz  one    0.521899
     two    0.334682
foo  one    0.484535
     two    0.694028
qux  one    0.177154
     two    0.535700
dtype: float64

In [364]:
s['qux']

one    0.177154
two    0.535700
dtype: float64

In [365]:
z(df.columns.levels)  # original MultiIndex

----------------------------------------
Type: <class 'pandas.core.indexes.frozen.FrozenList'>
Length: 2


FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [366]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.930166,-1.133731,0.370222,-0.634823,1.316806,1.386141,0.957984,-0.643576
B,0.405946,-1.955935,0.236658,-1.247545,0.809163,1.283452,1.284907,-1.97624
C,-2.050342,0.153034,0.784836,-2.483927,-0.890199,-0.918028,2.830012,-0.617136


In [414]:
d(df.columns.levels)

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [412]:
d(df.columns.get_level_values('first'))
d(df.columns.get_level_values('second'))
d(df.index.get_level_values(0))

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

Index(['A', 'B', 'C'], dtype='object')

In [368]:
df.columns.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [371]:
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [373]:
z(df.columns)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 8


MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [379]:
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [381]:
df.columns.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [382]:
df.columns.nlevels

2

In [383]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [385]:
index = pd.MultiIndex.from_tuples(index)
z(index)

----------------------------------------
Type: <class 'pandas.core.indexes.multi.MultiIndex'>
Length: 6


MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [386]:
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [387]:
pop = pop.reindex(index)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [389]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [390]:
pop_df = pop.unstack()

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [391]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [392]:
pop_df2 = pd.DataFrame({
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})

Unnamed: 0,under18
0,9267089
1,9284094
2,4687374
3,4318033
4,5906301
5,6879014


In [393]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [394]:
f_u18 = pop_df['under18'] / pop_df['total']

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [395]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [397]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,33.0,35.1,46.0,34.6,50.0,36.7
2013,2,47.0,37.6,18.0,36.1,34.0,36.6
2014,1,55.0,35.4,45.0,37.9,41.0,36.8
2014,2,32.0,36.4,16.0,36.6,25.0,38.4


In [407]:
health_data[  ['Guido', 'Sue']  ]

Unnamed: 0_level_0,subject,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,46.0,34.6,50.0,36.7
2013,2,18.0,36.1,34.0,36.6
2014,1,45.0,37.9,41.0,36.8
2014,2,16.0,36.6,25.0,38.4


In [408]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,33.0,35.1
2013,2,47.0,37.6


In [421]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df4 = pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,-0.064587,0.075447,-0.789525,0.006389,1.186447,-0.268145
bar,two,2.186364,-0.578367,-0.770776,-0.08489,0.582773,0.926635
baz,one,2.278802,0.277379,0.170114,-0.693694,0.217613,-0.788841
baz,two,-1.355429,0.40772,-0.10124,-0.429228,0.160664,-1.013968
foo,one,0.453503,1.028611,0.401769,0.381978,-1.112425,0.037163
foo,two,-0.139385,0.480202,1.302333,-0.01013,1.223824,0.185778


In [420]:
df4.loc['bar', 'one']

first  second
bar    one       0.309500
       two      -0.211056
baz    one      -1.842023
       two      -0.390820
foo    one      -1.964475
       two       1.298329
Name: (bar, one), dtype: float64

In [422]:
ind1 = pd.MultiIndex.from_product([['A', 'B', 'C'], ['a', 'b']],
                                  names=['Cap', 'lower'])
ind2 = pd.MultiIndex.from_product([['X', 'Y', 'Z'], ['x', 'y']],
                                  names=['Big', 'small'])

df4 = pd.DataFrame(np.random.randn(6, 6), index=ind1[:6], columns=ind2[:6])

Unnamed: 0_level_0,Big,X,X,Y,Y,Z,Z
Unnamed: 0_level_1,small,x,y,x,y,x,y
Cap,lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,a,0.436259,0.678101,0.311369,-0.528378,-0.674808,-1.103529
A,b,-0.656157,1.889957,2.076651,-1.102192,-1.211795,-0.791746
B,a,0.634724,0.243154,1.130659,0.580708,-0.597035,0.53604
B,b,-0.230984,-0.046489,1.327244,-1.371095,0.276962,-0.902128
C,a,0.974722,-1.528802,0.808541,-0.140657,0.091982,-0.198101
C,b,-0.412734,0.300011,0.043938,-0.123005,-1.385465,0.048273


In [423]:
df4['X']

Unnamed: 0_level_0,small,x,y
Cap,lower,Unnamed: 2_level_1,Unnamed: 3_level_1
A,a,0.436259,0.678101
A,b,-0.656157,1.889957
B,a,0.634724,0.243154
B,b,-0.230984,-0.046489
C,a,0.974722,-1.528802
C,b,-0.412734,0.300011


In [425]:
df4[['X', 'Y']]

Unnamed: 0_level_0,Big,X,X,Y,Y
Unnamed: 0_level_1,small,x,y,x,y
Cap,lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,a,0.436259,0.678101,0.311369,-0.528378
A,b,-0.656157,1.889957,2.076651,-1.102192
B,a,0.634724,0.243154,1.130659,0.580708
B,b,-0.230984,-0.046489,1.327244,-1.371095
C,a,0.974722,-1.528802,0.808541,-0.140657
C,b,-0.412734,0.300011,0.043938,-0.123005


In [427]:
df4['X', 'y']

Cap  lower
A    a        0.678101
     b        1.889957
B    a        0.243154
     b       -0.046489
C    a       -1.528802
     b        0.300011
Name: (X, y), dtype: float64

In [434]:
df4.loc[:, [('X', 'y')]]

Unnamed: 0_level_0,Big,X
Unnamed: 0_level_1,small,y
Cap,lower,Unnamed: 2_level_2
A,a,0.678101
A,b,1.889957
B,a,0.243154
B,b,-0.046489
C,a,-1.528802
C,b,0.300011


In [433]:
df4.loc[:, ('X', 'y')]

Cap  lower
A    a        0.678101
     b        1.889957
B    a        0.243154
     b       -0.046489
C    a       -1.528802
     b        0.300011
Name: (X, y), dtype: float64

In [435]:
def mklbl(prefix, n):
    return ["%s%s" % (prefix, i) for i in range(n)]


miindex = pd.MultiIndex.from_product([mklbl('A', 4),
                                      mklbl('B', 2),
                                      mklbl('C', 4),
                                      mklbl('D', 2)])


micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                       ('b', 'foo'), ('b', 'bah')],
                                      names=['lvl0', 'lvl1'])


dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns))
                      .reshape((len(miindex), len(micolumns))),
                    index=miindex,
                    columns=micolumns).sort_index().sort_index(axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [438]:
d(dfmi.index.nlevels)
dfmi.index.levels

4

FrozenList([['A0', 'A1', 'A2', 'A3'], ['B0', 'B1'], ['C0', 'C1', 'C2', 'C3'], ['D0', 'D1']])

In [442]:
dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C2']), 
         (slice('b', 'b'), ['bah'])]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bah
A1,B0,C1,D0,75
A1,B0,C1,D1,79
A1,B0,C2,D0,83
A1,B0,C2,D1,87
A1,B1,C1,D0,107
A1,B1,C1,D1,111
A1,B1,C2,D0,115
A1,B1,C2,D1,119
A2,B0,C1,D0,139
A2,B0,C1,D1,143


In [444]:
df_old = df.copy()
df = df.T

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.930166,0.405946,-2.050342
bar,two,-1.133731,-1.955935,0.153034
baz,one,0.370222,0.236658,0.784836
baz,two,-0.634823,-1.247545,-2.483927
foo,one,1.316806,0.809163,-0.890199
foo,two,1.386141,1.283452,-0.918028
qux,one,0.957984,1.284907,2.830012
qux,two,-0.643576,-1.97624,-0.617136


In [445]:
df.loc[('bar', 'two')]

A   -1.133731
B   -1.955935
C    0.153034
Name: (bar, two), dtype: float64

In [446]:
df.loc[('bar',), 'A']

second
one    0.930166
two   -1.133731
Name: A, dtype: float64

In [447]:
df.loc[('bar',), ['A']]

Unnamed: 0_level_0,A
second,Unnamed: 1_level_1
one,0.930166
two,-1.133731


In [448]:
df.loc[('bar',), ['A', 'B']]

Unnamed: 0_level_0,A,B
second,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0.930166,0.405946
two,-1.133731,-1.955935


In [449]:
df.loc[('bar',), ['A':'B']]

SyntaxError: invalid syntax (<ipython-input-449-99488fb78b8c>, line 1)

In [451]:
df.loc[('bar',), 'B':'A':-1]

Unnamed: 0_level_0,B,A
second,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0.405946,0.930166
two,-1.955935,-1.133731


In [452]:
df.loc[('bar',),]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.930166,0.405946,-2.050342
two,-1.133731,-1.955935,0.153034


In [453]:
df.loc[('bar',)]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.930166,0.405946,-2.050342
two,-1.133731,-1.955935,0.153034


In [454]:
df.loc[('bar')]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.930166,0.405946,-2.050342
two,-1.133731,-1.955935,0.153034


In [455]:
df.loc[('bar', :), :]

SyntaxError: invalid syntax (<ipython-input-455-fbc36ca232d4>, line 1)

In [456]:
df.loc[('bar', slice(None)), slice(None)]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.930166,0.405946,-2.050342
bar,two,-1.133731,-1.955935,0.153034


In [459]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.930166,0.405946,-2.050342
bar,two,-1.133731,-1.955935,0.153034
baz,one,0.370222,0.236658,0.784836
baz,two,-0.634823,-1.247545,-2.483927
foo,one,1.316806,0.809163,-0.890199
foo,two,1.386141,1.283452,-0.918028
qux,one,0.957984,1.284907,2.830012
qux,two,-0.643576,-1.97624,-0.617136


In [465]:
df.loc[(slice('baz', 'foo'), (['one', 'two'])), slice('B', 'C')]

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
baz,one,0.236658,0.784836
baz,two,-1.247545,-2.483927
foo,one,0.809163,-0.890199
foo,two,1.283452,-0.918028


In [466]:
dfm

NameError: name 'dfm' is not defined

In [467]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [468]:
dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [470]:
dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), (slice(None), 'foo')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78
A1,B0,C3,D0,88,90
A1,B0,C3,D1,92,94
A1,B1,C1,D0,104,106
A1,B1,C1,D1,108,110
A1,B1,C3,D0,120,122
A1,B1,C3,D1,124,126
A2,B0,C1,D0,136,138
A2,B0,C1,D1,140,142


In [471]:
df2 = dfmi.copy()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [472]:
df3 = df2.copy()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [474]:
idx = pd.IndexSlice

<pandas.core.indexing._IndexSlice at 0x102b9520>

In [482]:
%%timeit 
df3.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000

7.86 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [483]:
%%timeit 
df3.loc[idx[:, :, ['C1', 'C3']], :] = df2.loc[idx[:, :, ['C1', 'C3']], :] * 1000

6.16 ms ± 280 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [484]:
# Return at Advanced reindexing and alignment

In [485]:
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
   ....:                      codes=[[1, 1, 0, 0], [1, 0, 1, 0]])

MultiIndex([( 'one', 'y'),
            ( 'one', 'x'),
            ('zero', 'y'),
            ('zero', 'x')],
           )

In [486]:
vars(midx)

{'_cache': {'levels': FrozenList([['zero', 'one'], ['x', 'y']]),
  'dtype': dtype('O'),
  '_engine': <pandas.core.indexes.multi.MultiIndexUIntEngine at 0x12d64ba8>},
 '_levels': FrozenList([['zero', 'one'], ['x', 'y']]),
 '_tuples': array([('one', 'y'), ('one', 'x'), ('zero', 'y'), ('zero', 'x')],
       dtype=object),
 '_codes': FrozenList([[1, 1, 0, 0], [1, 0, 1, 0]]),
 '_names': [None, None],
 'sortorder': None,
 '_id': <object at 0x12db2428>}

In [488]:
d(midx.levels)
d(midx.codes)


FrozenList([['zero', 'one'], ['x', 'y']])

FrozenList([[1, 1, 0, 0], [1, 0, 1, 0]])

In [489]:
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
   ....:                      codes=[[1, 1, 0, 0], [1, 0, 1, 0]])

MultiIndex([( 'one', 'y'),
            ( 'one', 'x'),
            ('zero', 'y'),
            ('zero', 'x')],
           )

In [490]:
df = pd.DataFrame(np.random.randn(4, 2), index=midx)

Unnamed: 0,Unnamed: 1,0,1
one,y,1.086366,-0.743366
one,x,0.012316,-0.633656
zero,y,1.163565,-1.04456
zero,x,-0.386879,-0.244686


In [491]:
df2 = df.mean(level=0)

Unnamed: 0,0,1
one,0.549341,-0.688511
zero,0.388343,-0.644623


In [492]:
df2 = df.mean(level=1)

Unnamed: 0,0,1
y,1.124965,-0.893963
x,-0.187282,-0.439171
