<a href="https://colab.research.google.com/github/Saifullah785/python-data-science-handbook-notes/blob/main/03_05_Hierarchical_Indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical Indexing**

In [46]:
import pandas as pd
import numpy as np
# Import necessary libraries: pandas for data manipulation and numpy for numerical operations.

# **A Multiply Indexed Series**

**The Bad Way**

In [47]:
# "The bad way": pack both keys of every observation (state, year) into one
# plain Python tuple, ordered state by state with the two years inside each state.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2010, 2020)]

# One population count per (state, year) label, in the same order as `index`.
populations = [
    37253956, 39538223,  # California: 2010, 2020
    19378102, 20201249,  # New York:   2010, 2020
    25145561, 29145505,  # Texas:      2010, 2020
]

# Passing the list of tuples as the index keeps every label as a single tuple.
pop = pd.Series(data=populations, index=index)
pop

Unnamed: 0,0
"(California, 2010)",37253956
"(California, 2020)",39538223
"(New York, 2010)",19378102
"(New York, 2020)",20201249
"(Texas, 2010)",25145561
"(Texas, 2020)",29145505


In [48]:
# Slicing still works when the labels are plain tuples: the slice bounds are
# themselves tuple labels, and both endpoints are included in the result
# (rows ('California', 2020) through ('Texas', 2010)).
pop[('California', 2020):('Texas', 2010)]

Unnamed: 0,0
"(California, 2020)",39538223
"(New York, 2010)",19378102
"(New York, 2020)",20201249
"(Texas, 2010)",25145561


In [49]:
# With plain tuple labels, picking out one year takes manual work: collect every
# label whose second element (the year) is 2010, then index with that list.
labels_2010 = [label for label in pop.index if label[1] == 2010]
pop[labels_2010]

Unnamed: 0,0
"(California, 2010)",37253956
"(New York, 2010)",19378102
"(Texas, 2010)",25145561


# **The Better Way: The Pandas MultiIndex**

In [50]:
# Convert the list of (state, year) tuples into a pandas MultiIndex.
# Note that this rebinds `index`: from here on it names the MultiIndex, not the list.
index = pd.MultiIndex.from_tuples(index)
# Show the MultiIndex: each entry is still a (state, year) pair.
index

MultiIndex([('California', 2010),
            ('California', 2020),
            (  'New York', 2010),
            (  'New York', 2020),
            (     'Texas', 2010),
            (     'Texas', 2020)],
           )

In [51]:
# Swap the tuple-valued index for the MultiIndex built from the same tuples.
# The values are unchanged, but state and year are now separate index levels.
pop = pop.reindex(index)
# Show the Series with its hierarchical (state, year) index.
pop

Unnamed: 0,Unnamed: 1,0
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


In [52]:
# With a MultiIndex each level can be addressed separately: the empty slice
# takes every state (first level) and 2020 picks the year (second level).
# The result is indexed by state alone.
pop[:, 2020]

Unnamed: 0,0
California,39538223
New York,20201249
Texas,29145505


# **MultiIndex as Extra Dimension**

In [53]:
# unstack() pivots the inner index level (year) into columns, turning the
# multiply indexed Series into a DataFrame with one row per state.
pop_df = pop.unstack()
# Show the wide table: states as rows, years as columns.
pop_df

Unnamed: 0,2010,2020
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [54]:
# stack() goes the other way: the columns (years) are folded back into an inner
# index level, giving a Series indexed by (state, year) again.
pop_df.stack()

Unnamed: 0,Unnamed: 1,0
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


In [55]:
# Under-18 counts, listed in the same (state, year) row order as `pop`.
under18 = [9284094, 8898092,
           4318033, 4181528,
           6879014, 7432474]

# Pair the total population with the under-18 counts as two columns;
# the rows keep the (state, year) MultiIndex carried by `pop`.
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2010,37253956,9284094
California,2020,39538223,8898092
New York,2010,19378102,4318033
New York,2020,20201249,4181528
Texas,2010,25145561,6879014
Texas,2020,29145505,7432474


In [56]:
# Share of each (state, year) population that is under 18:
# element-wise ratio of the two columns.
f_u18 = pop_df['under18'].div(pop_df['total'])
# Pivot the year level into columns so each state reads across as one row.
f_u18.unstack()

Unnamed: 0,2010,2020
California,0.249211,0.22505
New York,0.222831,0.206994
Texas,0.273568,0.255013


# **Methods of MultiIndex Creation**

In [57]:
# Build a DataFrame whose row MultiIndex is inferred from a list of two
# parallel label lists: the first becomes the outer level, the second the inner.
# A locally seeded generator makes the random values identical on every run,
# so this cell reproduces the same table under Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((4, 2)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.268589,0.373812
a,2,0.659176,0.105748
b,1,0.293309,0.39206
b,2,0.022764,0.627994


In [58]:
# Build a dict keyed by (state, year) tuples from flat (state, year, population)
# records; insertion order follows the order of the records.
data = {
    (state, year): population
    for state, year, population in [
        ('California', 2010, 37253956),
        ('California', 2020, 39538223),
        ('New York', 2010, 19378102),
        ('New York', 2020, 20201249),
        ('Texas', 2010, 25145561),
        ('Texas', 2020, 29145505),
    ]
}

# Given a dict with tuple keys, the Series constructor builds the MultiIndex itself.
pd.Series(data)

Unnamed: 0,Unnamed: 1,0
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


# **Explicit MultiIndex Constructors**

In [59]:
# from_arrays: build the index from parallel lists, one list per level;
# the i-th entries of the lists together form the i-th index label.
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [60]:
# from_tuples: build the index from a list of tuples, one complete label per tuple.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [61]:
# from_product: build the index from the Cartesian product of the given iterables,
# so every combination of 'a'/'b' with 1/2 appears exactly once.
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [62]:
# Direct constructor: `levels` lists the distinct values available in each level,
# and `codes` gives, for every index entry, the position of its value within the
# corresponding `levels` list (e.g. codes 1 and 0 -> ('b', 1)).
pd.MultiIndex(levels=[['a', 'b'], [1,  2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

# **MultiIndex Level Names**

In [63]:
# Name the two index levels. Assigning to `.names` relabels the existing index
# in place rather than creating a new Series.
pop.index.names = ['state', 'year']
# The level names now appear as headers above the index columns.
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


# **MultiIndex for Columns**

In [64]:
# Hierarchical row labels: every (year, visit) combination, with named levels.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
# Hierarchical column labels: every (subject, type) combination, with named levels.
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# Mock measurements. A locally seeded generator makes the values identical on
# every run, so this cell reproduces the same table under Restart & Run All.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # even positions are the HR columns: widen their spread
data += 37          # shift every value so the noise is centred on 37

# Combine the values with the hierarchical row and column labels.
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,32.0,36.4,51.0,38.5,39.0,36.9
2013,2,27.0,37.6,37.0,36.6,48.0,38.0
2014,1,33.0,36.0,50.0,36.6,36.0,35.5
2014,2,25.0,38.4,41.0,37.2,37.0,37.9


In [65]:
# With hierarchical columns, a top-level key selects everything beneath it:
# 'Guido' returns a DataFrame holding just Guido's HR and Temp columns.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,51.0,38.5
2013,2,37.0,36.6
2014,1,50.0,36.6
2014,2,41.0,37.2


# **Indexing and Slicing a MultiIndex**


**Multiply Indexed Series**

In [66]:
# Show `pop` again as the starting point for the indexing examples that follow.
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


In [67]:
# Supplying a label for every level picks out a single element (a scalar).
pop['California', 2010]

np.int64(37253956)

In [68]:
# Partial indexing: giving only the first level (state) returns the matching
# entries as a Series indexed by the remaining level (year).
pop['California']

Unnamed: 0_level_0,0
year,Unnamed: 1_level_1
2010,37253956
2020,39538223


In [69]:
# Label slices over the first level also work with .loc;
# both endpoints ('California' and 'New York') are included.
pop.loc['California' : 'New York']

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249


In [70]:
# Index on the second level only: the empty slice keeps every state,
# and 2010 selects the year.
pop[:, 2010]

Unnamed: 0_level_0,0
state,Unnamed: 1_level_1
California,37253956
New York,19378102
Texas,25145561


In [71]:
# Boolean masks work as usual: keep only the entries above 22,000,000.
# The surviving rows keep their full (state, year) labels.
pop[pop > 22000000]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
Texas,2010,25145561
Texas,2020,29145505


In [72]:
# Fancy indexing with a list of first-level labels keeps all rows for those states.
pop[['California','Texas']]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
Texas,2010,25145561
Texas,2020,29145505


# **Multiply Indexed DataFrames**

In [73]:
# Show `health_data` again: both its rows and its columns carry a MultiIndex.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,32.0,36.4,51.0,38.5,39.0,36.9
2013,2,27.0,37.6,37.0,36.6,48.0,38.0
2014,1,33.0,36.0,50.0,36.6,36.0,35.5
2014,2,25.0,38.4,41.0,37.2,37.0,37.9


In [74]:
# Columns are the primary axis when indexing a DataFrame with [], so a full
# (subject, type) key picks out that single column, indexed by (year, visit).
health_data['Guido', 'HR']

Unnamed: 0_level_0,Unnamed: 1_level_0,Guido
Unnamed: 0_level_1,Unnamed: 1_level_1,HR
year,visit,Unnamed: 2_level_2
2013,1,51.0
2013,2,37.0
2014,1,50.0
2014,2,41.0


In [75]:
# .iloc is purely positional and ignores the hierarchy: first two rows, first
# two columns. The result keeps the hierarchical row and column labels.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,32.0,36.4
2013,2,27.0,37.6


In [76]:
# .loc accepts a tuple as a MultiIndex key: all rows, and the single
# column labelled ('Bob', 'HR').
health_data.loc[:, ('Bob', 'HR')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Bob
Unnamed: 0_level_1,Unnamed: 1_level_1,HR
year,visit,Unnamed: 2_level_2
2013,1,32.0
2013,2,27.0
2014,1,33.0
2014,2,25.0


In [77]:
# A bare `:` is only valid directly inside square brackets, so writing it inside
# a tuple -- as in the expression below -- is rejected by the Python parser before
# pandas ever sees it. A SyntaxError raised while parsing a cell cannot be caught
# by try/except in that same cell, so the expression is kept as a string and
# handed to compile(); the error is then caught and reported without stopping
# the notebook. pd.IndexSlice is the supported way to build such keys.
invalid_selection = "health_data.loc[(:, 1), (:, 'HR')]"
syntax_error_message = None
try:
    compile(invalid_selection, "<invalid_selection>", "eval")
except SyntaxError as err:
    # `err` is unbound when the except block ends, so keep the message separately.
    syntax_error_message = err.msg
print("SyntaxError:", syntax_error_message)

SyntaxError: invalid syntax (ipython-input-77-634165144.py, line 3)

In [78]:
# pd.IndexSlice builds keys that contain slices: indexing it as `idx[:, 1]`
# is valid syntax, unlike the tuple literal `(:, 1)`.
idx = pd.IndexSlice
# Rows: any year (first level), visit 1 (second level).
# Columns: any subject (first level), type 'HR' (second level).
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,32.0,51.0,39.0
2014,1,33.0,50.0,36.0


# **Rearranging Multi-Indexes**


**Sorted and Unsorted Indices**

In [79]:
# from_product keeps the labels in the order given, so the outer level runs
# a, c, b -- deliberately not in sorted order.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
# Six random values, one per index entry. A locally seeded generator makes them
# identical on every run, so this cell reproduces under Restart & Run All.
rng = np.random.default_rng(0)
data = pd.Series(rng.random(6), index=index)
# Name the two index levels.
data.index.names = ['char', 'int']
data

Unnamed: 0_level_0,Unnamed: 1_level_0,0
char,int,Unnamed: 2_level_1
a,1,0.683994
a,2,0.233897
c,1,0.876586
c,2,0.595184
b,1,0.896885
b,2,0.093904


In [80]:
# Partial slicing needs a lexicographically sorted MultiIndex. On the unsorted
# index the lookup fails; the error is caught as a KeyError and printed, so the
# demonstration does not stop the notebook.
try:
    data['a':'b']
except KeyError as e:
    print("KeyError", e)

KeyError 'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [81]:
# sort_index() orders the entries by their index labels (a, b, c on the outer
# level); rebinding `data` replaces the unsorted Series with the sorted one.
data = data.sort_index()
# Show the Series in its sorted order.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,0
char,int,Unnamed: 2_level_1
a,1,0.683994
a,2,0.233897
b,1,0.896885
b,2,0.093904
c,1,0.876586
c,2,0.595184


In [82]:
# With the index sorted, the same partial slice now succeeds
# (both 'a' and 'b' are included).
data['a':'b']

Unnamed: 0_level_0,Unnamed: 1_level_0,0
char,int,Unnamed: 2_level_1
a,1,0.683994
a,2,0.233897
b,1,0.896885
b,2,0.093904


# **Stacking and Unstacking Indices**

In [83]:
# level=0 moves the outer level (state) into the columns,
# leaving year as the row index.
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,37253956,19378102,25145561
2020,39538223,20201249,29145505


In [84]:
# level=1 moves the inner level (year) into the columns,
# leaving state as the row index.
pop.unstack(level=1)

year,2010,2020
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [85]:
# unstack() followed by stack() round-trips: the result is the original
# (state, year)-indexed Series.
pop.unstack().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505


# **Index Setting and Resetting**

In [86]:
# reset_index() turns the index levels (state, year) into ordinary columns and
# returns a DataFrame with a default integer index; `name` labels the column
# that holds the Series values.
pop_flat = pop.reset_index(name='population')
# Show the flat table: one row per observation, three ordinary columns.
pop_flat

Unnamed: 0,state,year,population
0,California,2010,37253956
1,California,2020,39538223
2,New York,2010,19378102
3,New York,2020,20201249
4,Texas,2010,25145561
5,Texas,2020,29145505


In [87]:
# set_index() is the reverse step: build a (state, year) MultiIndex from the two
# columns. The result is only displayed here; it is not assigned back to `pop_flat`.
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505
