# Pandas tutorial (Advanced)

The following content is the **Advanced** part. Please make sure you have studied the **Basic** part before you start.

In [1]:
import pandas as pd
import numpy as np

## Hierarchical indexing (MultiIndex)

### Creating a MultiIndex

In [2]:
# from_product builds the Cartesian product of the label lists:
# every factor is paired with every method, giving 3 x 3 = 9 entries.
iterables = [
    ["temperature", "rainfall", "runoff"],  # outer level -> "factor"
    ["max", "mean", "min"],  # inner level -> "method"
]
idx = pd.MultiIndex.from_product(iterables=iterables, names=["factor", "method"])
idx

MultiIndex([('temperature',  'max'),
            ('temperature', 'mean'),
            ('temperature',  'min'),
            (   'rainfall',  'max'),
            (   'rainfall', 'mean'),
            (   'rainfall',  'min'),
            (     'runoff',  'max'),
            (     'runoff', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [3]:
# Standard-normal noise, 9 rows x 4 unnamed columns, labelled by the MultiIndex.
df = pd.DataFrame(data=np.random.randn(9, 4), index=idx)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
factor,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
temperature,max,0.462741,0.859426,0.24679,-0.653215
temperature,mean,0.309688,-0.617493,-0.361673,1.042269
temperature,min,-0.916308,0.077147,1.092102,0.836335
rainfall,max,-0.962365,-0.944029,0.203335,-0.132975
rainfall,mean,-0.848509,-1.143164,-0.174119,0.396041
rainfall,min,-0.817613,0.395625,-0.410328,-1.024797
runoff,max,0.30617,-0.938199,-0.918163,1.541053
runoff,mean,2.181643,-0.95299,0.452965,0.169779
runoff,min,0.649132,-0.411436,0.611533,0.505931


In [4]:
# from_arrays zips the lists position by position instead of crossing them,
# so the same `iterables` now give three entries rather than nine.
idx = pd.MultiIndex.from_arrays(arrays=iterables, names=["factor", "method"])
idx

MultiIndex([('temperature',  'max'),
            (   'rainfall', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [5]:
# Standard-normal noise, 3 rows x 4 unnamed columns, one row per index entry.
df = pd.DataFrame(data=np.random.randn(3, 4), index=idx)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
factor,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
temperature,max,0.107408,-1.944158,0.036356,-0.018707
rainfall,mean,-0.3001,-1.173919,0.652389,-0.697953
runoff,min,1.031612,-0.157259,0.797382,-0.296476


Other constructors are also available: ```pd.MultiIndex.from_tuples``` and ```pd.MultiIndex.from_frame```.

### Getting index values from a MultiIndex

In [6]:
# Rebuild the full product index and attach it to a fresh random frame.
iterables = [
    ["temperature", "rainfall", "runoff"],  # outer level -> "factor"
    ["max", "mean", "min"],  # inner level -> "method"
]
idx = pd.MultiIndex.from_product(iterables=iterables, names=["factor", "method"])
df = pd.DataFrame(data=np.random.randn(9, 4), index=idx)
df.index  # the MultiIndex is read back through the frame's .index attribute

MultiIndex([('temperature',  'max'),
            ('temperature', 'mean'),
            ('temperature',  'min'),
            (   'rainfall',  'max'),
            (   'rainfall', 'mean'),
            (   'rainfall',  'min'),
            (     'runoff',  'max'),
            (     'runoff', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [7]:
df.index.get_level_values(level=0)  # labels of the outer level, one per row

Index(['temperature', 'temperature', 'temperature', 'rainfall', 'rainfall',
       'rainfall', 'runoff', 'runoff', 'runoff'],
      dtype='object', name='factor')

In [8]:
df.index.get_level_values(level=1)  # labels of the inner level, one per row

Index(['max', 'mean', 'min', 'max', 'mean', 'min', 'max', 'mean', 'min'], dtype='object', name='method')

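## Apply and Applymap
* Apply: Apply a function along an axis of the DataFrame.
* Applymap: Apply a function to a DataFrame elementwise, so that each element can be handled individually for specific requirements. (Since pandas 2.1, `applymap` is deprecated in favor of the equivalent `DataFrame.map`.)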

In [9]:
# A small 3 x 4 frame of standard-normal noise with default integer labels.
df = pd.DataFrame(data=np.random.randn(3, 4))
df

Unnamed: 0,0,1,2,3
0,-0.033472,0.260952,-1.315109,0.505233
1,0.35746,-1.92657,0.309591,0.834444
2,0.032877,-0.766774,-0.234094,1.567056


In [10]:
df.apply(func=np.abs)  # np.abs receives each column in turn (default axis=0)

Unnamed: 0,0,1,2,3
0,0.033472,0.260952,1.315109,0.505233
1,0.35746,1.92657,0.309591,0.834444
2,0.032877,0.766774,0.234094,1.567056


In [11]:
# lambda function: an anonymous one-expression function that cubes its argument
func_x3 = lambda x: x ** 3
df.apply(func=func_x3)  # each column is passed to func_x3 as a whole Series

Unnamed: 0,0,1,2,3
0,-3.8e-05,0.01777,-2.274494,0.128966
1,0.045675,-7.150792,0.029673,0.58102
2,3.6e-05,-0.450819,-0.012828,3.848167


In [12]:
def func_range(x):
    """Map a single value through an arbitrary piecewise rule.

    The rule has no particular meaning; it only serves as an example of a
    more complex operation applied to each element of a DataFrame.

    Returns 1 when ``x`` is greater than 1, -1 when ``x`` is less than -1,
    and the absolute value of ``x`` in every other case.
    """
    if x > 1:
        return 1
    if x < -1:
        return -1
    return np.abs(x)
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated; use the new name when this pandas has it, else fall back.
(df.map if hasattr(df, "map") else df.applymap)(func_range)

Unnamed: 0,0,1,2,3
0,0.033472,0.260952,-1.0,0.505233
1,0.35746,-1.0,0.309591,0.834444
2,0.032877,0.766774,0.234094,1.0


## Groupby
`groupby()` can be used to split large amounts of data into groups and compute operations on each group.



In [13]:
iterables = [
    ["temperature", "rainfall", "runoff"],
    ["site1", "site2", "site3"],
]
# NOTE(review): the second level holds site labels here but is still named
# "method" as in the earlier cells; consider renaming it when re-running.
idx = pd.MultiIndex.from_product(iterables, names=["factor", "method"])
df = pd.DataFrame(np.random.randn(9, 4), index=idx)
# Pass the single grouping key as a plain string rather than a one-element
# list: with a list, pandas >= 2.0 yields each group key as a 1-tuple such as
# ('rainfall',) instead of the bare label 'rainfall'.
for n, subdf in df.groupby(by="factor"):
    print(n)
    print(subdf)

rainfall
                        0         1         2         3
factor   method                                        
rainfall site1  -0.393359  0.882672 -0.074783  0.604692
         site2  -0.591284  0.565010  1.539993 -0.018503
         site3  -1.409607  1.032555 -0.731094  1.385462
runoff
                      0         1         2         3
factor method                                        
runoff site1   0.556628 -1.300532  0.699864  1.610207
       site2  -0.637928 -0.178179  0.503056 -1.537695
       site3  -0.509241  0.170885 -0.814654 -0.377565
temperature
                           0         1         2         3
factor      method                                        
temperature site1   0.676445 -0.391639 -0.908043 -0.817479
            site2   1.106261 -0.131353 -0.797427 -1.283888
            site3  -1.271802  0.240521  1.596404  0.217446


In [14]:
df.groupby(by="factor").mean()  # column-wise mean within each factor group

Unnamed: 0_level_0,0,1,2,3
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rainfall,-0.798084,0.826746,0.244705,0.657217
runoff,-0.196847,-0.435942,0.129422,-0.101684
temperature,0.170302,-0.094157,-0.036355,-0.627974


## Table Visualization

In [15]:
# Seed NumPy's global generator so the random values below repeat on every run.
np.random.seed(0)
df2 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
def style_negative(v, props=''):
    """Return ``props`` when ``v`` is negative, otherwise ``None``.

    Intended as an element-wise styling callback: ``props`` is the CSS
    string to use for negative cells, and ``None`` means no styling.
    """
    if v < 0:
        return props
    return None
def _style_cells(styler, func, **kwargs):
    """Apply ``func`` to every cell of ``styler`` and return the result.

    ``Styler.applymap`` was renamed to ``Styler.map`` in pandas 2.1 and the
    old name is deprecated, so prefer ``map`` when this pandas provides it
    and fall back to ``applymap`` otherwise. Extra keyword arguments are
    forwarded to ``func`` by the Styler method.
    """
    method = styler.map if hasattr(styler, "map") else styler.applymap
    return method(func, **kwargs)


# First colour negative cells red, then fade cells whose value lies in (-0.3, 0.3).
s2 = _style_cells(df2.style, style_negative, props='color:red;')
s2 = _style_cells(s2, lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)
s2

Unnamed: 0,A,B,C,D
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674
4,1.494079,-0.205158,0.313068,-0.854096
5,-2.55299,0.653619,0.864436,-0.742165
6,2.269755,-1.454366,0.045759,-0.187184
7,1.532779,1.469359,0.154947,0.378163
8,-0.887786,-1.980796,-0.347912,0.156349
9,1.230291,1.20238,-0.387327,-0.302303


In [16]:
def highlight_max(s, props=''):
    """Return ``props`` at the positions of the maximum of ``s``, '' elsewhere.

    The maximum is taken with ``np.nanmax`` over ``s.values``, so NaN entries
    are ignored when locating it. The result is an array with one entry per
    element of ``s``.
    """
    peak = np.nanmax(s.values)
    is_peak = s == peak
    return np.where(is_peak, props, '')
# axis=0 hands highlight_max one column at a time, so each column's maximum is marked.
s2.apply(func=highlight_max, axis=0, props='color:white;background-color:darkblue')

Unnamed: 0,A,B,C,D
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674
4,1.494079,-0.205158,0.313068,-0.854096
5,-2.55299,0.653619,0.864436,-0.742165
6,2.269755,-1.454366,0.045759,-0.187184
7,1.532779,1.469359,0.154947,0.378163
8,-0.887786,-1.980796,-0.347912,0.156349
9,1.230291,1.20238,-0.387327,-0.302303


## Tooltips and Captions

In [17]:
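# Disabled example: attach a caption to a Styler and place it below the table.
# NOTE(review): `s` is not defined in the cells shown above (the Styler built
# earlier is named `s2`), and the caption text does not describe this data;
# adjust both before uncommenting.
# s.set_caption("Confusion matrix for multiple cancer prediction models.")\
#  .set_table_styles([{
#      'selector': 'caption',
#      'props': 'caption-side: bottom; font-size:1.25em;'
#  }], overwrite=False)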
