# Advanced

This notebook is the **Advanced** part of the tutorial. Please make sure you have worked through the **Basic** part before you start.

In [1]:
import pandas as pd
import numpy as np

## Hierarchical indexing (MultiIndex)

### Creating a MultiIndex

In [2]:
# Level values for the index: climate factors (outer level) and summary
# statistics (inner level).
factor_labels = ["temperature", "rainfall", "runoff"]
method_labels = ["max", "mean", "min"]
iterables = [factor_labels, method_labels]

# from_product pairs every factor with every method (3 x 3 = 9 entries).
idx = pd.MultiIndex.from_product(iterables, names=["factor", "method"])
idx

MultiIndex([('temperature',  'max'),
            ('temperature', 'mean'),
            ('temperature',  'min'),
            (   'rainfall',  'max'),
            (   'rainfall', 'mean'),
            (   'rainfall',  'min'),
            (     'runoff',  'max'),
            (     'runoff', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [3]:
# Nine rows (one per index entry) by four columns of standard-normal noise.
df = pd.DataFrame(data=np.random.randn(9, 4), index=idx)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
factor,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
temperature,max,-0.399027,0.804838,-0.299771,-1.160647
temperature,mean,0.510886,-2.204406,0.844373,-0.850894
temperature,min,0.169911,-0.800798,0.664208,-2.229422
rainfall,max,0.171517,1.806454,1.443633,0.037504
rainfall,mean,0.282716,0.419611,-0.224727,1.602656
rainfall,min,-0.544362,2.246466,-1.007365,-0.987332
runoff,max,-0.675737,-0.162522,0.556914,-1.645138
runoff,mean,0.596017,-0.115331,-0.839829,1.269507
runoff,min,-0.402181,-1.058276,-0.088421,-0.92484


In [4]:
# from_arrays pairs the lists element by element instead of taking their
# product, so the same input produces only three index entries.
idx = pd.MultiIndex.from_arrays(arrays=iterables, names=["factor", "method"])
idx

MultiIndex([('temperature',  'max'),
            (   'rainfall', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [5]:
# Three rows, matching the three-entry index built with from_arrays.
df = pd.DataFrame(data=np.random.randn(3, 4), index=idx)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
factor,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
temperature,max,0.508726,-1.017005,-0.780006,0.027753
rainfall,mean,-0.693272,-1.484366,-3.732062,0.417366
runoff,min,0.373411,-0.122169,-0.571951,0.048568


A MultiIndex can also be created with ```pd.MultiIndex.from_tuples``` (from a list of tuples) or ```pd.MultiIndex.from_frame``` (from the columns of a DataFrame).

### Get index for multiindex

In [6]:
# Rebuild the full 3 x 3 product index and a frame that uses it.
level_names = ["factor", "method"]
iterables = [
    ["temperature", "rainfall", "runoff"],
    ["max", "mean", "min"],
]
idx = pd.MultiIndex.from_product(iterables, names=level_names)
df = pd.DataFrame(data=np.random.randn(9, 4), index=idx)

# The frame's index is the MultiIndex passed in above.
df.index

MultiIndex([('temperature',  'max'),
            ('temperature', 'mean'),
            ('temperature',  'min'),
            (   'rainfall',  'max'),
            (   'rainfall', 'mean'),
            (   'rainfall',  'min'),
            (     'runoff',  'max'),
            (     'runoff', 'mean'),
            (     'runoff',  'min')],
           names=['factor', 'method'])

In [7]:
# Labels of the outer level (position 0, named "factor") for every row.
df.index.get_level_values("factor")

Index(['temperature', 'temperature', 'temperature', 'rainfall', 'rainfall',
       'rainfall', 'runoff', 'runoff', 'runoff'],
      dtype='object', name='factor')

In [8]:
# Labels of the inner level (position 1, named "method") for every row.
df.index.get_level_values("method")

Index(['max', 'mean', 'min', 'max', 'mean', 'min', 'max', 'mean', 'min'], dtype='object', name='method')

## Apply and Applymap
* Apply: Apply a function along an axis of the DataFrame.
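* Applymap: Apply a function to every element of a DataFrame. Use it when each element needs to be handled individually to meet specific requirements. (Since pandas 2.1 this method is named `DataFrame.map`; `applymap` is the deprecated older name.)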

In [9]:
# Small 3 x 4 frame of standard-normal values used by the apply examples.
df = pd.DataFrame(data=np.random.randn(3, 4))
df

Unnamed: 0,0,1,2,3
0,0.688581,0.258612,-0.116632,-1.234923
1,-1.033229,-0.273521,0.106188,-0.475416
2,-0.1326,-0.177004,1.532636,-1.487604


In [10]:
# apply hands each column (axis=0, the default) to np.abs.
df.apply(np.abs, axis=0)

Unnamed: 0,0,1,2,3
0,0.688581,0.258612,0.116632,1.234923
1,1.033229,0.273521,0.106188,0.475416
2,0.1326,0.177004,1.532636,1.487604


In [11]:
func_x3 = lambda x: x ** 3  # lambda function that cubes its input
# Each column (axis=0, the default) is passed to func_x3 as a whole.
df.apply(func_x3, axis=0)

Unnamed: 0,0,1,2,3
0,0.326487,0.017296,-0.001587,-1.883302
1,-1.103036,-0.020463,0.001197,-0.107454
2,-0.002331,-0.005546,3.600124,-3.292015


In [12]:
# This function has no particular meaning; it only illustrates a branching
# operation that is applied to each element of a DataFrame.
def func_range(x):
    """Return 1 if x > 1, -1 if x < -1, otherwise the absolute value of x."""
    if x > 1:
        return 1
    if x < -1:
        return -1
    return np.abs(x)
# DataFrame.applymap has been deprecated since pandas 2.1 in favour of
# DataFrame.map. Use whichever name this pandas version provides so the cell
# runs on both older and newer releases.
elementwise = df.map if hasattr(df, "map") else df.applymap
elementwise(func_range)

Unnamed: 0,0,1,2,3
0,0.688581,0.258612,0.116632,-1.0
1,-1.0,0.273521,0.106188,0.475416
2,0.1326,0.177004,1.0,-1.0


## Groupby
`groupby()` splits a DataFrame into groups that share a key (here, a level of the index) so that operations can be computed on each group separately.



In [13]:
# Three factors observed at three sites.
# NOTE: the second index level keeps the name "method" even though it holds
# site labels here.
iterables = [
    ["temperature", "rainfall", "runoff"],
    ["site1", "site2", "site3"],
]
idx = pd.MultiIndex.from_product(iterables, names=["factor", "method"])
df = pd.DataFrame(np.random.randn(9, 4), index=idx)

# Group by the scalar key "factor" rather than the list ["factor"]: with a
# length-1 list, pandas 2.x yields each group name as a 1-tuple such as
# ('rainfall',), whereas a scalar key yields the plain label on every version.
for n, subdf in df.groupby(by="factor"):
    print(n)
    print(subdf)

rainfall
                        0         1         2         3
factor   method                                        
rainfall site1   0.732368 -0.682311 -0.256257  0.643786
         site2  -3.083902 -1.000745  0.911775 -1.607313
         site3   1.140646  0.054459 -2.724847 -1.623004
runoff
                      0         1         2         3
factor method                                        
runoff site1  -0.626130  0.409723 -1.044564  0.747707
       site2  -0.088922 -0.048216 -1.697176 -0.098273
       site3   0.693753  0.438575 -0.763520  0.036212
temperature
                           0         1         2         3
factor      method                                        
temperature site1   1.650404  0.970965  0.608340  1.608136
            site2   0.636997  1.083134  0.722128 -1.352512
            site3  -0.650333 -0.388040  0.082063  0.124609


In [14]:
# Column-wise mean of the rows belonging to each factor.
df.groupby("factor").mean()

Unnamed: 0_level_0,0,1,2,3
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rainfall,-0.40363,-0.542866,-0.689776,-0.862177
runoff,-0.0071,0.266694,-1.16842,0.228549
temperature,0.545689,0.555353,0.470844,0.126744


## Table Visualization

In [15]:
# Fix the seed so the random table, and therefore the styling applied to it,
# is the same on every run.
np.random.seed(0)
df2 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
def style_negative(v, props=''):
    """Return the CSS string `props` when `v` is negative, otherwise None."""
    if v < 0:
        return props
    return None
# Styler.applymap has been deprecated since pandas 2.1 in favour of Styler.map.
# Look up whichever method name this pandas version provides so the cell runs
# on both older and newer releases.
_elementwise = "map" if hasattr(df2.style, "map") else "applymap"

# First rule: red text for negative values.
s2 = getattr(df2.style, _elementwise)(style_negative, props='color:red;')
# Second rule: fade values strictly between -0.3 and 0.3.
s2 = getattr(s2, _elementwise)(
    lambda v: 'opacity: 20%;' if -0.3 < v < 0.3 else None
)
s2

Unnamed: 0,A,B,C,D
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674
4,1.494079,-0.205158,0.313068,-0.854096
5,-2.55299,0.653619,0.864436,-0.742165
6,2.269755,-1.454366,0.045759,-0.187184
7,1.532779,1.469359,0.154947,0.378163
8,-0.887786,-1.980796,-0.347912,0.156349
9,1.230291,1.20238,-0.387327,-0.302303


In [16]:
def highlight_max(s, props=''):
    """Return an array holding `props` where `s` equals its NaN-ignoring maximum and '' elsewhere."""
    largest = np.nanmax(s.values)
    is_largest = s == largest
    return np.where(is_largest, props, '')
# Evaluate highlight_max once per column (axis=0) and paint each column's
# largest value white on dark blue.
s2.apply(
    highlight_max,
    axis=0,
    props='color:white;background-color:darkblue',
)

Unnamed: 0,A,B,C,D
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674
4,1.494079,-0.205158,0.313068,-0.854096
5,-2.55299,0.653619,0.864436,-0.742165
6,2.269755,-1.454366,0.045759,-0.187184
7,1.532779,1.469359,0.154947,0.378163
8,-0.887786,-1.980796,-0.347912,0.156349
9,1.230291,1.20238,-0.387327,-0.302303


## Tooltips and Captions

In [17]:
# NOTE(review): this caption example is disabled. It calls methods on `s`, but
# the cells shown in this notebook only define a Styler named `s2`, so `s`
# would have to be defined (or replaced) before these lines can be uncommented.
# s.set_caption("Confusion matrix for multiple cancer prediction models.")\
#  .set_table_styles([{
#      'selector': 'caption',
#      'props': 'caption-side: bottom; font-size:1.25em;'
#  }], overwrite=False)
