In [1]:
import numpy as np
import pandas as pd

In [2]:
#a series whose labels are plain (county, year) tuples

index=[(county,year)
       for county in ('Nairobi','Kisumu','Mombasa')
       for year in (2000,2010)]
populations=[2879500,3790345,   #Nairobi 2000, 2010
             567900,908456,     #Kisumu 2000, 2010
             1489342,2894560]   #Mombasa 2000, 2010
pop=pd.Series(populations,index=index)
pop

(Nairobi, 2000)    2879500
(Nairobi, 2010)    3790345
(Kisumu, 2000)      567900
(Kisumu, 2010)      908456
(Mombasa, 2000)    1489342
(Mombasa, 2010)    2894560
dtype: int64

In [3]:
#indexing the bad way: with plain tuple labels, a slice has to run between
#two complete (county, year) tuples
pop[('Nairobi',2010):('Mombasa',2000)]

(Nairobi, 2010)    3790345
(Kisumu, 2000)      567900
(Kisumu, 2010)      908456
(Mombasa, 2000)    1489342
dtype: int64

In [4]:
#select every 2010 entry by scanning the tuple labels (element 1 is the year)
pop[[label for label in pop.index if label[1]==2010]]

(Nairobi, 2010)    3790345
(Kisumu, 2010)      908456
(Mombasa, 2010)    2894560
dtype: int64

In [5]:
#Indexing the better way: build a pandas MultiIndex from the list of tuples
#(this rebinds `index` from the plain list to the MultiIndex)
index=pd.MultiIndex.from_tuples(index)
index

MultiIndex([('Nairobi', 2000),
            ('Nairobi', 2010),
            ( 'Kisumu', 2000),
            ( 'Kisumu', 2010),
            ('Mombasa', 2000),
            ('Mombasa', 2010)],
           )

In [6]:
#reindex the series with the MultiIndex so that it is displayed hierarchically
pop=pop.reindex(index)
pop

Nairobi  2000    2879500
         2010    3790345
Kisumu   2000     567900
         2010     908456
Mombasa  2000    1489342
         2010    2894560
dtype: int64

In [7]:
#every county for the year 2010: full slice on the first level, fixed second level
pop[:,2010]

Nairobi    3790345
Kisumu      908456
Mombasa    2894560
dtype: int64

In [8]:
#multiIndex as extra dimension: unstack() turns the year level into columns
pop_df=pop.unstack()
pop_df

Unnamed: 0,2000,2010
Kisumu,567900,908456
Mombasa,1489342,2894560
Nairobi,2879500,3790345


In [9]:
#stack() goes the other way: the year columns become an inner index level again
pop_df.stack()

Kisumu   2000     567900
         2010     908456
Mombasa  2000    1489342
         2010    2894560
Nairobi  2000    2879500
         2010    3790345
dtype: int64

In [10]:
#add a second column; the list is matched to the rows of `pop` by position
#NOTE(review): both Kisumu figures and the 2000 Mombasa figure for 'under 18'
#are larger than the corresponding totals, so the list order probably does not
#match the index order -- confirm against the data source
under_18=[237689,504567,
          768940,1456709,
          1568930,2304569]
pop_df=pd.DataFrame({'total':pop,'under 18':under_18})
pop_df

Unnamed: 0,Unnamed: 1,total,under 18
Nairobi,2000,2879500,237689
Nairobi,2010,3790345,504567
Kisumu,2000,567900,768940
Kisumu,2010,908456,1456709
Mombasa,2000,1489342,1568930
Mombasa,2010,2894560,2304569


In [11]:
#unstack the year level for both columns at once (result has hierarchical columns)
pop_df.unstack()

Unnamed: 0_level_0,total,total,under 18,under 18
Unnamed: 0_level_1,2000,2010,2000,2010
Kisumu,567900,908456,768940,1456709
Mombasa,1489342,2894560,1568930,2304569
Nairobi,2879500,3790345,237689,504567


In [12]:
#fraction of each population that is under 18, displayed with years as columns
#NOTE(review): several ratios come out above 1 -- check the 'under 18' input data
f_u18=pop_df['under 18'].div(pop_df['total'])
f_u18.unstack()

Unnamed: 0,2000,2010
Kisumu,1.354006,1.6035
Mombasa,1.053438,0.796172
Nairobi,0.082545,0.133119


In [13]:
#methods of multiIndex creation
#passing a list of index arrays to the constructor builds the MultiIndex implicitly
rng=np.random.default_rng(0)   #fixed seed so the table is the same on every re-run
df=pd.DataFrame(rng.random((4,2)),
               index=[['a','a','b','b'],[1,2,1,2]],
               columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.62165,0.043615
a,2,0.345695,0.477217
b,1,0.205507,0.87914
b,2,0.562976,0.683602


In [14]:
#a dict keyed by (state, year) tuples; pd.Series turns the keys into a MultiIndex
data={
    ('California',2000):33871648,
    ('California',2010):37253956,
    ('Texas',2000):20851820,
    ('Texas',2010):25145561,
    ('New York',2000):18976457,
    ('New York',2010):19378102,
}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [15]:
#explicit multiIndex constructors
#from a simple list of arrays (one array of labels per level)
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [16]:
#from a list of tuples (one tuple per index entry)
pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [17]:
#from the cartesian product of the per-level labels
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [18]:
#directly from levels (the distinct labels of each level) and codes
#(for each entry, the position of its label within the level)
pd.MultiIndex(levels=[['a','b'],[1,2]],
             codes=[[0,0,1,1],[0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [19]:
#MultiIndex level names
#use lowercase 'county', the spelling later cells rely on (e.g.
#set_index(['county','year'])), and assign a renamed copy of the index so the
#name does not depend on some other object renaming a shared index in place
pop.index=pop.index.set_names(['county','year'])
pop

County   year
Nairobi  2000    2879500
         2010    3790345
Kisumu   2000     567900
         2010     908456
Mombasa  2000    1489342
         2010    2894560
dtype: int64

In [20]:
#name the index levels of the DataFrame as well
#NOTE(review): assigning .names changes the index object in place; if `pop`
#still shares that object, its level names change too -- confirm
pop_df.index.names=['county','year']
pop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under 18
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Nairobi,2000,2879500,237689
Nairobi,2010,3790345,504567
Kisumu,2000,567900,768940
Kisumu,2010,908456,1456709
Mombasa,2000,1489342,1568930
Mombasa,2010,2894560,2304569


In [21]:
#MultiIndex for columns
#hierarchical labels for the rows (year, visit) and the columns (subject, type)

index=pd.MultiIndex.from_product(
    [[2019,2020],[1,2]],
    names=['year','visit'],
)
columns=pd.MultiIndex.from_product(
    [['Bob','Guido','Sue'],['HR','Temp']],
    names=['subject','type'],
)

In [22]:
#mock some data
rng=np.random.default_rng(0)   #fixed seed so the mock readings are reproducible
data=np.round(rng.standard_normal((4,6)),1)
data[:,::2]*=10   #widen the spread of every other column (columns 0, 2, 4)
data+=37          #shift all values so they are centred on 37

In [23]:
#create the data frame: mock values with the hierarchical row and column labels
health_data=pd.DataFrame(data,index=index,columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2019,1,40.0,36.8,37.0,35.7,50.0,37.2
2019,2,26.0,37.2,31.0,36.6,36.0,36.9
2020,1,22.0,39.1,41.0,37.1,45.0,37.4
2020,2,56.0,37.1,26.0,37.9,29.0,37.3


In [24]:
#indexing the top column level by subject returns that subject's HR/Temp columns
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,1,37.0,35.7
2019,2,31.0,36.6
2020,1,41.0,37.1
2020,2,26.0,37.9


In [25]:
#Indexing and Slicing a MultiIndex
#(redisplay the series that the next cells index into)
pop

county   year
Nairobi  2000    2879500
         2010    3790345
Kisumu   2000     567900
         2010     908456
Mombasa  2000    1489342
         2010    2894560
dtype: int64

In [26]:
#a complete (county, year) key returns a single value
pop['Nairobi',2000]

2879500

In [27]:
#partial indexing: a county on its own returns the series over its years
pop['Nairobi']

year
2000    2879500
2010    3790345
dtype: int64

In [28]:
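#pop.loc['Nairobi':'Kisumu']
#NOTE(review): presumably left disabled because slicing over county labels
#needs a sorted MultiIndex, and the counties in `pop` are not in sorted order
#(see the sort_index() example further down) -- confirm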

In [29]:
#fix the inner level: the year-2000 value for every county
pop[:,2000]

county
Nairobi    2879500
Kisumu      567900
Mombasa    1489342
dtype: int64

In [30]:
#boolean mask: keep the entries above one million
pop[pop>1000000]

county   year
Nairobi  2000    2879500
         2010    3790345
Mombasa  2000    1489342
         2010    2894560
dtype: int64

In [31]:
#fancy indexing with a list of first-level (county) labels
pop[['Nairobi','Kisumu']]

county   year
Nairobi  2000    2879500
         2010    3790345
Kisumu   2000     567900
         2010     908456
dtype: int64

In [32]:
#multiply indexed DataFrames
#(redisplay the frame that the next cells index into)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2019,1,40.0,36.8,37.0,35.7,50.0,37.2
2019,2,26.0,37.2,31.0,36.6,36.0,36.9
2020,1,22.0,39.1,41.0,37.1,45.0,37.4
2020,2,56.0,37.1,26.0,37.9,29.0,37.3


In [33]:
#a (subject, type) column key returns that single column as a Series
health_data['Guido','HR']

year  visit
2019  1        37.0
      2        31.0
2020  1        41.0
      2        26.0
Name: (Guido, HR), dtype: float64

In [34]:
#position-based selection: the first two rows and the first two columns
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2019,1,40.0,36.8
2019,2,26.0,37.2


In [35]:
#Rearranging Multi-Indices
#sorted and unsorted indices: the first level is deliberately out of order ('a','c','b')
index=pd.MultiIndex.from_product([['a','c','b'],[1,2]])
rng=np.random.default_rng(0)   #fixed seed so the values are the same on every re-run
data=pd.Series(rng.random(6),index=index)
data.index.names=['char','int']
data

char  int
a     1      0.414639
      2      0.946870
c     1      0.425696
      2      0.226818
b     1      0.310699
      2      0.578478
dtype: float64

In [36]:
#label slicing on the unsorted index fails; catch the error and show it
try:
    data['a':'b']
except KeyError as err:
    print(type(err))
    print(err)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [37]:
#sort the index so that label slicing works
data=data.sort_index()
data

char  int
a     1      0.414639
      2      0.946870
b     1      0.310699
      2      0.578478
c     1      0.425696
      2      0.226818
dtype: float64

In [38]:
#the same slice now succeeds on the sorted index
data['a':'b']

char  int
a     1      0.414639
      2      0.946870
b     1      0.310699
      2      0.578478
dtype: float64

In [39]:
#stacking and unstacking indices
#level=0 moves the county level into the columns
pop.unstack(level=0)

county,Kisumu,Mombasa,Nairobi
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,567900,1489342,2879500
2010,908456,2894560,3790345


In [40]:
#level=1 moves the year level into the columns
pop.unstack(level=1)

year,2000,2010
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Kisumu,567900,908456
Mombasa,1489342,2894560
Nairobi,2879500,3790345


In [41]:
#unstack followed by stack recovers the series (rows come back in sorted county order)
pop.unstack().stack()

county   year
Kisumu   2000     567900
         2010     908456
Mombasa  2000    1489342
         2010    2894560
Nairobi  2000    2879500
         2010    3790345
dtype: int64

In [42]:
#index setting and resetting
#reset_index turns the index levels into ordinary columns; name= labels the value column
pop_flat=pop.reset_index(name='population')
pop_flat

Unnamed: 0,county,year,population
0,Nairobi,2000,2879500
1,Nairobi,2010,3790345
2,Kisumu,2000,567900
3,Kisumu,2010,908456
4,Mombasa,2000,1489342
5,Mombasa,2010,2894560


In [43]:
#set_index does the reverse: build a MultiIndex from the named columns
pop_flat.set_index(['county','year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
county,year,Unnamed: 2_level_1
Nairobi,2000,2879500
Nairobi,2010,3790345
Kisumu,2000,567900
Kisumu,2010,908456
Mombasa,2000,1489342
Mombasa,2010,2894560


In [44]:
#Data Aggregation on Multi-Indices
#(redisplay the frame that is aggregated below)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2019,1,40.0,36.8,37.0,35.7,50.0,37.2
2019,2,26.0,37.2,31.0,36.6,36.0,36.9
2020,1,22.0,39.1,41.0,37.1,45.0,37.4
2020,2,56.0,37.1,26.0,37.9,29.0,37.3


In [45]:
#average the two visits within each year
#groupby(level=...) replaces mean(level=...), which pandas deprecated and
#removed in 2.0
data_mean=health_data.groupby(level='year').mean()
data_mean

  data_mean=health_data.mean(level='year')


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2019,33.0,37.0,34.0,36.15,43.0,37.05
2020,39.0,38.1,33.5,37.5,37.0,37.35


In [46]:
#average over the subjects for each measurement type (column level 'type')
#transpose, group the rows by level, transpose back: this avoids the
#mean(axis=1, level=...) keyword that pandas deprecated and removed in 2.0
data_mean.T.groupby(level='type').mean().T

  data_mean.mean(axis=1,level='type')


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2019,36.666667,36.733333
2020,36.5,37.65


In [47]:

#redisplay the sorted series
data

char  int
a     1      0.414639
      2      0.946870
b     1      0.310699
      2      0.578478
c     1      0.425696
      2      0.226818
dtype: float64

In [49]:
#redisplay the health data frame
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2019,1,40.0,36.8,37.0,35.7,50.0,37.2
2019,2,26.0,37.2,31.0,36.6,36.0,36.9
2020,1,22.0,39.1,41.0,37.1,45.0,37.4
2020,2,56.0,37.1,26.0,37.9,29.0,37.3
