In [40]:
import pandas as pd
import numpy as np

pd.options.display.float_format = '{:,.1f}'.format

In [3]:
path = 'datasets/employee_list.parquet'
df = pd.read_parquet(path)
df.head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


#### Create a MultiIndex series

In [5]:
ser = df.groupby(['company', 'department']).salary.mean()
ser

company                          department      
Hernandez, Cunningham and Clark  Consulting          108411.857143
                                 Developer           101505.500000
                                 Finance             104111.666667
                                 Management           98756.000000
                                 System Architect    108414.333333
Spears-Brown                     Consulting          108118.333333
                                 Developer           102611.428571
                                 Finance             107777.000000
                                 Management          120032.125000
                                 System Architect    103507.000000
Wilson and Sons                  Consulting          124709.666667
                                 Developer            97368.714286
                                 Finance              89295.666667
                                 Management          115702.714286
            

#### Subset of a MultiIndex

In [8]:
ser.loc[ser.index.get_level_values('department').isin(['Developer', 'System Architect'])]

company                          department      
Hernandez, Cunningham and Clark  Developer           101505.500000
                                 System Architect    108414.333333
Spears-Brown                     Developer           102611.428571
                                 System Architect    103507.000000
Wilson and Sons                  Developer            97368.714286
                                 System Architect    110295.000000
Name: salary, dtype: float64

In [21]:
f1 = ser.index.get_level_values('company') == 'Spears-Brown'
f2 = ser.index.get_level_values('department').isin(['Developer', 'System Architect'])
ser.loc[f1 & f2]

company       department      
Spears-Brown  Developer           102611.428571
              System Architect    103507.000000
Name: salary, dtype: float64

In [15]:
ser.loc['Spears-Brown', ['Developer', 'System Architect']]

company       department      
Spears-Brown  Developer           102611.428571
              System Architect    103507.000000
Name: salary, dtype: float64

In [16]:
ser.loc[:, ['Developer', 'System Architect']]

company                          department      
Hernandez, Cunningham and Clark  Developer           101505.500000
                                 System Architect    108414.333333
Spears-Brown                     Developer           102611.428571
                                 System Architect    103507.000000
Wilson and Sons                  Developer            97368.714286
                                 System Architect    110295.000000
Name: salary, dtype: float64

In [23]:
ser

company                          department      
Hernandez, Cunningham and Clark  Consulting          108411.857143
                                 Developer           101505.500000
                                 Finance             104111.666667
                                 Management           98756.000000
                                 System Architect    108414.333333
Spears-Brown                     Consulting          108118.333333
                                 Developer           102611.428571
                                 Finance             107777.000000
                                 Management          120032.125000
                                 System Architect    103507.000000
Wilson and Sons                  Consulting          124709.666667
                                 Developer            97368.714286
                                 Finance              89295.666667
                                 Management          115702.714286
            

### Operations of the MultiIndex

##### Find mean salary of each department

In [27]:
ser.groupby(level=1).mean()

department
Consulting          113746.619048
Developer           100495.214286
Finance             100394.777778
Management          111496.946429
System Architect    107405.444444
Name: salary, dtype: float64

Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().

##### Find mean salary of each company

In [28]:
ser.groupby(level=0).mean()

company
Hernandez, Cunningham and Clark    104239.871429
Spears-Brown                       108409.177381
Wilson and Sons                    107474.352381
Name: salary, dtype: float64

##### Find total salary

In [29]:
ser.sum()

1600617.0059523813

#### Swap index levels

In [31]:
ser.index = ser.index.swaplevel()
ser

department        company                        
Consulting        Hernandez, Cunningham and Clark    108411.857143
Developer         Hernandez, Cunningham and Clark    101505.500000
Finance           Hernandez, Cunningham and Clark    104111.666667
Management        Hernandez, Cunningham and Clark     98756.000000
System Architect  Hernandez, Cunningham and Clark    108414.333333
Consulting        Spears-Brown                       108118.333333
Developer         Spears-Brown                       102611.428571
Finance           Spears-Brown                       107777.000000
Management        Spears-Brown                       120032.125000
System Architect  Spears-Brown                       103507.000000
Consulting        Wilson and Sons                    124709.666667
Developer         Wilson and Sons                     97368.714286
Finance           Wilson and Sons                     89295.666667
Management        Wilson and Sons                    115702.714286
System Archi

#### What happens when we unstack a MultiIndex series?

In [17]:
tbl = ser.unstack()
tbl

department,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",108411.857143,101505.5,104111.666667,98756.0,108414.333333
Spears-Brown,108118.333333,102611.428571,107777.0,120032.125,103507.0
Wilson and Sons,124709.666667,97368.714286,89295.666667,115702.714286,110295.0


...It turns into a pivot table...and we can use the same way of subsetting as in the series

In [18]:
tbl.loc['Spears-Brown', ['Developer', 'System Architect']]

department
Developer           102611.428571
System Architect    103507.000000
Name: Spears-Brown, dtype: float64

In [19]:
tbl.loc[:, ['Developer', 'System Architect']]

department,Developer,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1
"Hernandez, Cunningham and Clark",101505.5,108414.333333
Spears-Brown,102611.428571,103507.0
Wilson and Sons,97368.714286,110295.0


#### Create a DataFrame with a MultiIndex

In [41]:
tbl = df.groupby(['company', 'department']).agg(
    mean_age=pd.NamedAgg('age', 'mean'),
    meax_age=pd.NamedAgg('age', 'max'),
    mean_salary=pd.NamedAgg('salary', 'mean'),
    max_salary=pd.NamedAgg('salary', 'max'))
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_age,meax_age,mean_salary,max_salary
company,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",Consulting,41.1,50,108411.9,124667
"Hernandez, Cunningham and Clark",Developer,43.5,48,101505.5,121975
"Hernandez, Cunningham and Clark",Finance,41.8,53,104111.7,128194
"Hernandez, Cunningham and Clark",Management,39.1,49,98756.0,131896
"Hernandez, Cunningham and Clark",System Architect,45.7,53,108414.3,132988
Spears-Brown,Consulting,46.3,53,108118.3,133354
Spears-Brown,Developer,45.1,50,102611.4,129124
Spears-Brown,Finance,45.7,53,107777.0,130051
Spears-Brown,Management,39.6,51,120032.1,133602
Spears-Brown,System Architect,43.8,53,103507.0,133943


In [57]:
print(type(tbl))
print(type(tbl.index))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.indexes.multi.MultiIndex'>


### Different subsets of the MultiIndex DataFrame

In [43]:
tbl.loc['Hernandez, Cunningham and Clark']

Unnamed: 0_level_0,mean_age,meax_age,mean_salary,max_salary
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Consulting,41.1,50,108411.9,124667
Developer,43.5,48,101505.5,121975
Finance,41.8,53,104111.7,128194
Management,39.1,49,98756.0,131896
System Architect,45.7,53,108414.3,132988


With a MultiIndex DataFrame, use `slice(None)` if you want to select all values of a certain index level.

In [51]:
tbl.loc[(slice(None), 'Developer'), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_age,meax_age,mean_salary,max_salary
company,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",Developer,43.5,48,101505.5,121975
Spears-Brown,Developer,45.1,50,102611.4,129124
Wilson and Sons,Developer,42.1,52,97368.7,117387


In [52]:
tbl.loc[(slice(None), ['Developer', 'System Architect']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_age,meax_age,mean_salary,max_salary
company,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",Developer,43.5,48,101505.5,121975
"Hernandez, Cunningham and Clark",System Architect,45.7,53,108414.3,132988
Spears-Brown,Developer,45.1,50,102611.4,129124
Spears-Brown,System Architect,43.8,53,103507.0,133943
Wilson and Sons,Developer,42.1,52,97368.7,117387
Wilson and Sons,System Architect,44.6,53,110295.0,131134


In [56]:
tbl.loc[('Spears-Brown', slice(None)), ['mean_age', 'mean_salary']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_age,mean_salary
company,department,Unnamed: 2_level_1,Unnamed: 3_level_1
Spears-Brown,Consulting,46.3,108118.3
Spears-Brown,Developer,45.1,102611.4
Spears-Brown,Finance,45.7,107777.0
Spears-Brown,Management,39.6,120032.1
Spears-Brown,System Architect,43.8,103507.0


In [55]:
tbl.loc[('Spears-Brown', ['Developer', 'System Architect']), ['mean_age', 'mean_salary']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_age,mean_salary
company,department,Unnamed: 2_level_1,Unnamed: 3_level_1
Spears-Brown,Developer,45.1,102611.4
Spears-Brown,System Architect,43.8,103507.0
