In [1]:
import pandas as pd
import numpy as np
import os
# Switch to the datasets directory so relative file names resolve from there.
# The location can be overridden with the PANDAS_DATASETS_DIR environment variable;
# the original author's path is kept as the fallback so existing setups keep working.
DATASETS_DIR = os.environ.get(
    'PANDAS_DATASETS_DIR',
    r'C:\Users\dell\PycharmProjects\MachineLearning\Pandas\datasets',
)
if os.path.isdir(DATASETS_DIR):
    os.chdir(DATASETS_DIR)
else:
    # Do not abort the notebook on machines without that folder; just report it.
    print(f'Datasets directory not found, staying in {os.getcwd()}: {DATASETS_DIR}')

In [2]:
# Shared sample table: one [level, average damage] row per player.
columns = ['level', 'average damage']
data = [
    [level, damage]
    for level, damage in zip([31, 34, 32, 33], [57081, 53533, 34532, 30232])
]

# Create MultiIndex

## From tuples

In [3]:
# Each tuple is one complete index key: (outer label, inner label).
clan_player_pairs = [
    ('VNC', 'Pikachu'),
    ('VNC', 'Tank Cao'),
    ('Dirilis', 'MILORD'),
    ('Dirilis', 'CRIMEAN'),
]
multi_index_tuple = pd.MultiIndex.from_tuples(clan_player_pairs)
multi_index_tuple

MultiIndex(levels=[['Dirilis', 'VNC'], ['CRIMEAN', 'MILORD', 'Pikachu', 'Tank Cao']],
           codes=[[1, 1, 0, 0], [2, 3, 1, 0]])

In [4]:
# Use the tuple-built MultiIndex as the row index of the sample table.
pd.DataFrame(data, index = multi_index_tuple, columns = columns)

Unnamed: 0,Unnamed: 1,level,average damage
VNC,Pikachu,31,57081
VNC,Tank Cao,34,53533
Dirilis,MILORD,32,34532
Dirilis,CRIMEAN,33,30232


## From arrays

In [5]:
# One array per level; position i of every array together forms the i-th key.
clans = ['VNC', 'VNC', 'Dirilis', 'Dirilis']
players = ['Pikachu', 'Tank Cao', 'MILORD', 'CRIMEAN']
multi_index_array = pd.MultiIndex.from_arrays([clans, players])
multi_index_array

MultiIndex(levels=[['Dirilis', 'VNC'], ['CRIMEAN', 'MILORD', 'Pikachu', 'Tank Cao']],
           codes=[[1, 1, 0, 0], [2, 3, 1, 0]])

In [6]:
# Name the levels in place: level 0 becomes 'clan', level 1 becomes 'player'.
multi_index_array.names = ['clan', 'player']

In [7]:
# Build the sample table on the named MultiIndex; the level names
# ('clan', 'player') appear as the header of the index columns.
TF = pd.DataFrame(data, index = multi_index_array, columns = columns)
TF

Unnamed: 0_level_0,Unnamed: 1_level_0,level,average damage
clan,player,Unnamed: 2_level_1,Unnamed: 3_level_1
VNC,Pikachu,31,57081
VNC,Tank Cao,34,53533
Dirilis,MILORD,32,34532
Dirilis,CRIMEAN,33,30232


## From product

In [8]:
# from_product builds every (clan, server) combination, outer level varying slowest.
clan_labels = ['VNC', 'Dirilis']
server_labels = ['EU', 'SG']
multi_index_product = pd.MultiIndex.from_product(
    [clan_labels, server_labels],
    names = ['clan', 'server'],
)
multi_index_product

MultiIndex(levels=[['Dirilis', 'VNC'], ['EU', 'SG']],
           codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
           names=['clan', 'server'])

In [9]:
# Use the product-built MultiIndex (clan x server) as the row index of the sample table.
pd.DataFrame(data, index = multi_index_product, columns = columns)

Unnamed: 0_level_0,Unnamed: 1_level_0,level,average damage
clan,server,Unnamed: 2_level_1,Unnamed: 3_level_1
VNC,EU,31,57081
VNC,SG,34,53533
Dirilis,EU,32,34532
Dirilis,SG,33,30232


# Name

In [10]:
# Level names can be assigned afterwards with MultiIndex.names = ['Name1', 'Name2', ...],
# or passed at construction time through the keyword names = ['Name1', 'Name2', ...].

# Seed the generator so the random scores are identical on every run of this cell.
np.random.seed(42)
scores = np.random.randint(6, 10, (4, 4))  # integers in [6, 10): 4 rows x 4 subjects

# Row index: every (Name, Semester) combination.
index = pd.MultiIndex.from_product(
    [['Trung', 'Kien'], [1, 2]],
    names = ['Name', 'Semester'],
)

# Column index: each subject paired with its type.
columns = pd.MultiIndex.from_arrays(
    [['Science', 'Science', 'Social', 'Social'],
     ['Math', 'Physic', 'Literature', 'English']],
    names = ['Type', 'Subject'],
)

pd.DataFrame(scores, index = index, columns = columns)


Unnamed: 0_level_0,Type,Science,Science,Social,Social
Unnamed: 0_level_1,Subject,Math,Physic,Literature,English
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,9,9,7
Trung,2,6,7,7,6
Kien,1,7,9,9,7
Kien,2,7,6,9,7


# pivot

In [11]:
# Long-format score table: one row per (student, subject) pair.
df = pd.DataFrame({
    'Name': ['Trung', 'Trung', 'Trung', 'Kien', 'Kien', 'Kien'],
    'Subject': ['Math', 'Physic', 'Programming', 'Math', 'Physic', 'Programming'],
    'Score': [10, 9, 10, 9, 9.6, 10],
})
df

Unnamed: 0,Name,Subject,Score
0,Trung,Math,10.0
1,Trung,Physic,9.0
2,Trung,Programming,10.0
3,Kien,Math,9.0
4,Kien,Physic,9.6
5,Kien,Programming,10.0


In [12]:
# Reshape long -> wide: one row per Name, one column per Subject, cells filled from Score.
df.pivot(index = 'Name', columns = 'Subject', values = 'Score')

Subject,Math,Physic,Programming
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kien,9.0,9.6,10.0
Trung,10.0,9.0,10.0


In [13]:
# Nearly equivalent to the pivot above: move Name and Subject into the index, then
# unstack the innermost level (Subject) into columns. The result keeps an extra
# top column level named 'Score'.
df.set_index(['Name', 'Subject']).unstack()

Unnamed: 0_level_0,Score,Score,Score
Subject,Math,Physic,Programming
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Kien,9.0,9.6,10.0
Trung,10.0,9.0,10.0


In [14]:
# Add a second value column; pivoting without the `values` keyword (next cell)
# then yields hierarchical columns.
# Scores below 9.5 are labelled 'Good', everything else 'Exellent'. np.where labels
# the whole column at once instead of calling a Python lambda for every row.
df['Title'] = np.where(df['Score'] < 9.5, 'Good', 'Exellent')
df

Unnamed: 0,Name,Subject,Score,Title
0,Trung,Math,10.0,Exellent
1,Trung,Physic,9.0,Good
2,Trung,Programming,10.0,Exellent
3,Kien,Math,9.0,Good
4,Kien,Physic,9.6,Exellent
5,Kien,Programming,10.0,Exellent


In [15]:
# Omitting the `values` keyword pivots every remaining column (Score and Title),
# which produces hierarchical (two-level) columns.
df.pivot(index = 'Name', columns = 'Subject')

Unnamed: 0_level_0,Score,Score,Score,Title,Title,Title
Subject,Math,Physic,Programming,Math,Physic,Programming
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Kien,9.0,9.6,10.0,Good,Exellent,Exellent
Trung,10.0,9.0,10.0,Exellent,Good,Exellent


# Stack, unstack

## unstack

```python
Series.unstack(level=-1, fill_value=None)
```

In [21]:
# Sample data for Series.unstack() and DataFrame.stack():
# state populations keyed by (state, census year).
index = [
    (state, year)
    for state in ['California', 'New York', 'Texas']
    for year in (2000, 2010)
]
populations = [
    33871648, 37253956,   # California 2000, 2010
    18976457, 19378102,   # New York   2000, 2010
    20851820, 25145561,   # Texas      2000, 2010
]
state_year_index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations, index = state_year_index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [22]:
# Pivot the innermost index level (the year) into columns: one row per state.
pop.unstack()

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [26]:
# By default, the innermost level is unstacked.
# You can unstack a different level by passing a level number or name;
# here level 0 (the state) becomes the columns.
pop.unstack(0)

Unnamed: 0,California,New York,Texas
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [28]:
# Show the documentation of Series.unstack. help() is plain Python, so this cell
# also works outside IPython (the `obj?` form is IPython-only syntax).
help(pop.unstack)

## stack

```python
DataFrame.stack(level = -1, dropna = True)
```

In [24]:
# Small wide table to demonstrate stack(): one row per brand, one column per product.
df = pd.DataFrame(
    {'Phone': [5, 4.5], 'Computer': [4, 6]},
    index = ['Samsung', 'Apple'],
)
df

Unnamed: 0,Phone,Computer
Samsung,5.0,4
Apple,4.5,6


In [25]:
# Move the column labels into a new innermost index level, giving a Series
# keyed by (brand, product).
df.stack()

Samsung  Phone       5.0
         Computer    4.0
Apple    Phone       4.5
         Computer    6.0
dtype: float64

# Slicing

In [32]:
# Reproducible random scores: integers in [5, 10), 6 rows x 4 subjects.
np.random.seed(101)
scores = np.random.randint(5, 10, size = (6, 4))

# Rows: every (Name, Semester) combination.
index = pd.MultiIndex.from_product(
    [['Trung', 'Kien', 'Van Van'], [1, 2]],
    names = ['Name', 'Semester'],
)

# Columns: each subject paired with its type.
subject_types = ['Science', 'Science', 'Social', 'Social']
subject_names = ['Math', 'Physic', 'Literature', 'English']
columns = pd.MultiIndex.from_arrays(
    [subject_types, subject_names],
    names = ['Type', 'Subjects'],
)

df = pd.DataFrame(data = scores, index = index, columns = columns)
df

Unnamed: 0_level_0,Type,Science,Science,Social,Social
Unnamed: 0_level_1,Subjects,Math,Physic,Literature,English
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,6,8,6
Trung,2,5,9,5,9
Kien,1,9,5,9,5
Kien,2,6,8,7,5
Van Van,1,8,8,8,7
Van Van,2,9,5,6,8


In [34]:
# Select all of Trung's rows: a single label matches the first index level,
# and the result is indexed by the remaining level (Semester).
df.loc['Trung']


Type,Science,Science,Social,Social
Subjects,Math,Physic,Literature,English
Semester,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,8,6,8,6
2,5,9,5,9


In [39]:
# Kien's scores in semester 2: a full (Name, Semester) tuple selects one row,
# returned as a Series over the columns.
df.loc[('Kien', 2)]

Type     Subjects  
Science  Math          6
         Physic        8
Social   Literature    7
         English       5
Name: (Kien, 2), dtype: int32

In [40]:
# Kien's Science scores in semester 2: the row key is followed by the
# top-level column label.
df.loc[('Kien', 2), 'Science']

Subjects
Math      6
Physic    8
Name: (Kien, 2), dtype: int32

In [43]:
# Fancy indexing: a list of first-level labels selects every row of those students.
df.loc[['Trung', 'Kien']]

Unnamed: 0_level_0,Type,Science,Science,Social,Social
Unnamed: 0_level_1,Subjects,Math,Physic,Literature,English
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,6,8,6
Trung,2,5,9,5,9
Kien,1,9,5,9,5
Kien,2,6,8,7,5


## Index Slice

In [46]:
# Show the documentation of pd.IndexSlice. help() is plain Python, so this cell
# also works outside IPython (the `obj?` form is IPython-only syntax).
help(pd.IndexSlice)

In [45]:
# Scores of Trung and Van Van in Math and Literature in semester 1.
ix = pd.IndexSlice
row_selector = ix[['Trung', 'Van Van'], 1]             # Name in the list, Semester == 1
column_selector = ix[:, ['Math', 'Literature']]        # any Type, Subjects in the list
df.loc[row_selector, column_selector]

Unnamed: 0_level_0,Type,Science,Social
Unnamed: 0_level_1,Subjects,Math,Literature
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2
Trung,1,8,8
Van Van,1,8,8


# Aggregate

In [48]:
# Reproducible random scores: integers in [7, 10), 6 rows x 3 subjects.
np.random.seed(108)
scores = np.random.randint(7, 10, size = (6, 3))

# Rows: every (Name, Exam) combination; columns: plain subject names.
student_names = ['Trung', 'Kien']
exam_names = ['Init', 'Mid', 'Final']
index = pd.MultiIndex.from_product([student_names, exam_names], names = ['Name', 'Exam'])
columns = ['Math', 'Physic', 'English']

df = pd.DataFrame(data = scores, index = index, columns = columns)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Math,Physic,English
Name,Exam,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Trung,Init,8,9,9
Trung,Mid,9,7,9
Trung,Final,8,8,9
Kien,Init,9,8,8
Kien,Mid,7,8,9
Kien,Final,7,9,9


In [51]:
# Overall average score of each student: average over the exams within each Name.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported spelling. sort=False keeps the
# students in their original order (Trung, Kien) instead of sorting the keys.
df.groupby(level = 0, sort = False).mean()

Unnamed: 0_level_0,Math,Physic,English
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Trung,8.333333,8.0,9.0
Kien,7.666667,8.333333,8.666667


In [52]:
# Overall average score of each exam: average over the students within each Exam.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported spelling. sort=False keeps the
# exams in their original order (Init, Mid, Final) instead of sorting the keys.
df.groupby(level = 1, sort = False).mean()

Unnamed: 0_level_0,Math,Physic,English
Exam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Init,8.5,8.5,8.5
Mid,8.0,7.5,9.0
Final,7.5,8.5,9.0


# reset_index

```python
sr.reset_index(level=None, drop=False, name=None, inplace=False)
```

In [54]:
# Sample Series for turning MultiIndex levels into columns:
# four reproducible random scores in [7, 10), keyed by (Name, Semester).

np.random.seed(7)
scores = np.random.randint(7, 10, size = 4)
index = pd.MultiIndex.from_product(
    [['Trung', 'Kien'], [1, 2]],
    names = ['Name', 'Semester'],
)
sr = pd.Series(data = scores, index = index)
sr

Name   Semester
Trung  1           7
       2           8
Kien   1           9
       2           7
dtype: int32

In [55]:
# With no arguments, every index level (Name and Semester) becomes a regular column
# and the rows get a fresh 0..n-1 index; the unnamed values end up in column 0.
sr.reset_index()

Unnamed: 0,Name,Semester,0
0,Trung,1,7
1,Trung,2,8
2,Kien,1,9
3,Kien,2,7


In [59]:
# Reset only the first level (Name) into a column; Semester stays as the index.
# `name` labels the column that holds the Series values ('VN Pikachu').
sr.reset_index(level = 0, name = 'VN Pikachu')

Unnamed: 0_level_0,Name,VN Pikachu
Semester,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Trung,7
2,Trung,8
1,Kien,9
2,Kien,7


In [62]:
# Drop the first level (Name) instead of turning it into a column;
# the result is still a Series, now indexed by Semester only.
sr.reset_index(level = 0, drop = True)

Semester
1    7
2    8
1    9
2    7
dtype: int32

# Reorder

In [64]:
# Seed the generator so this cell yields the same table on every run, independent of
# whichever random calls were executed before it.
np.random.seed(64)
scores = np.random.randint(7, 10, (4, 4))  # integers in [7, 10): 4 rows x 4 subjects

# Rows: every (Name, Semester) combination; columns: each subject paired with its category.
index = pd.MultiIndex.from_product([['Trung', 'Kien'], [1, 2]], names = ['Name', 'Semester'])
columns = pd.MultiIndex.from_arrays(
    [['Science', 'Science', 'Social', 'Social'],
     ['Math', 'Physic', 'History', 'Literature']],
    names = ['Category', 'Subject'],
)

data = pd.DataFrame(scores, index = index, columns = columns)
data

Unnamed: 0_level_0,Category,Science,Science,Social,Social
Unnamed: 0_level_1,Subject,Math,Physic,History,Literature
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,9,7,8
Trung,2,9,9,9,7
Kien,1,9,7,7,7
Kien,2,7,9,9,7


In [65]:
# Swap the two row-index levels: Semester becomes the outer level, Name the inner one.
# Only the level order changes; the rows stay in their original order.
data.swaplevel(0, 1, axis = 'index')

Unnamed: 0_level_0,Category,Science,Science,Social,Social
Unnamed: 0_level_1,Subject,Math,Physic,History,Literature
Semester,Name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,Trung,8,9,7,8
2,Trung,9,9,9,7
1,Kien,9,7,7,7
2,Kien,7,9,9,7


In [66]:
# Swap the two column levels: Subject becomes the top level, Category the second one.
data.swaplevel(0, 1, axis = 'columns')

Unnamed: 0_level_0,Subject,Math,Physic,History,Literature
Unnamed: 0_level_1,Category,Science,Science,Social,Social
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,9,7,8
Trung,2,9,9,9,7
Kien,1,9,7,7,7
Kien,2,7,9,9,7


# Sorting

In [68]:
# Display the table again as the starting point for the sorting example.
data

Unnamed: 0_level_0,Category,Science,Science,Social,Social
Unnamed: 0_level_1,Subject,Math,Physic,History,Literature
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,8,9,7,8
Trung,2,9,9,9,7
Kien,1,9,7,7,7
Kien,2,7,9,9,7


In [69]:
# Sort the columns lexicographically by the Subject level (level 1);
# each Category label moves together with its subject.
data.sort_index(axis = 'columns', level = 1)

Unnamed: 0_level_0,Category,Social,Social,Science,Science
Unnamed: 0_level_1,Subject,History,Literature,Math,Physic
Name,Semester,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Trung,1,7,8,8,9
Trung,2,9,7,9,9
Kien,1,7,7,9,7
Kien,2,9,7,7,9


# Tips

In [2]:
# Student records, one tuple per row, labelled t1..t6.
student_rows = [
    ('s001', 'V',  'Alberto Franco', '15/05/2002', 35),
    ('s002', 'V',  'Gino Mcneill',   '17/05/2002', 32),
    ('s003', 'VI', 'Ryan Parkes',    '16/02/1999', 33),
    ('s001', 'VI', 'Eesha Hinton',   '25/09/1998', 30),
    ('s002', 'V',  'Gino Mcneill',   '11/05/2002', 31),
    ('s004', 'VI', 'David Parkes',   '15/09/1997', 32),
]
df = pd.DataFrame(
    student_rows,
    columns = ['school_code', 'class', 'name', 'date_of_birth', 'weight'],
    index = ['t1', 't2', 't3', 't4', 't5', 't6'],
)

df

Unnamed: 0,school_code,class,name,date_of_birth,weight
t1,s001,V,Alberto Franco,15/05/2002,35
t2,s002,V,Gino Mcneill,17/05/2002,32
t3,s003,VI,Ryan Parkes,16/02/1999,33
t4,s001,VI,Eesha Hinton,25/09/1998,30
t5,s002,V,Gino Mcneill,11/05/2002,31
t6,s004,VI,David Parkes,15/09/1997,32


In [4]:
# Build a three-level MultiIndex from existing columns: school_code > class > name.
# set_index returns a new frame; df itself is left unchanged.
df1 = df.set_index(['school_code', 'class', 'name'])
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_of_birth,weight
school_code,class,name,Unnamed: 3_level_1,Unnamed: 4_level_1
s001,V,Alberto Franco,15/05/2002,35
s002,V,Gino Mcneill,17/05/2002,32
s003,VI,Ryan Parkes,16/02/1999,33
s001,VI,Eesha Hinton,25/09/1998,30
s002,V,Gino Mcneill,11/05/2002,31
s004,VI,David Parkes,15/09/1997,32


*How to access a single level in a MultiIndex?*

In [5]:
# `levels` holds, for each index level, the distinct labels that occur in it.
df1.index.levels

FrozenList([['s001', 's002', 's003', 's004'], ['V', 'VI'], ['Alberto Franco', 'David Parkes', 'Eesha Hinton', 'Gino Mcneill', 'Ryan Parkes']])

In [7]:
# Access the second level (position 1, counting from 0): the 'class' labels.
df1.index.levels[1]

Index(['V', 'VI'], dtype='object', name='class')

**`pd.IndexSlice`** for selection based on MultiIndex

In [11]:
idx = pd.IndexSlice
# Row selector idx[:, 'V']:
#   - `:` keeps every label of the first index level (school_code)
#   - 'V' keeps only rows whose second index level (class) is 'V'
# The trailing `:` selects all columns.
df1.loc[idx[:, 'V'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_of_birth,weight
school_code,class,name,Unnamed: 3_level_1,Unnamed: 4_level_1
s001,V,Alberto Franco,15/05/2002,35
s002,V,Gino Mcneill,17/05/2002,32
s002,V,Gino Mcneill,11/05/2002,31


In [12]:
# A tuple can also be used as the row key:
# select rows having 's001' in the first level and 'V' in the second level.
# The index is sorted first because the recorded run of this lookup on the unsorted
# MultiIndex emitted a warning (apparently pandas' "indexing past lexsort depth"
# PerformanceWarning). sort_index() returns a new frame, so df1 is not modified.
df1.sort_index().loc[('s001', 'V'), :]

  return self._getitem_tuple(key)


Unnamed: 0_level_0,date_of_birth,weight
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alberto Franco,15/05/2002,35


In [13]:
# Get the name of every level (plural `names`): one entry per index level.
df1.index.names

FrozenList(['school_code', 'class', 'name'])

In [14]:
# Read the singular `name` attribute of the MultiIndex itself. This is a different
# attribute from the per-level `names`; nothing is displayed here, i.e. it is unset.
df1.index.name

In [15]:
# Set the singular `name` attribute of the MultiIndex and read it back.
# NOTE(review): this assigns `name`, not the per-level `names`; the level names
# presumably stay ['school_code', 'class', 'name'] -- confirm with df1.index.names.
df1.index.name = 'ID'
df1.index.name

'ID'

Use **`Index.set_names()`** to set the name(s) of an index (a single-level Index or a MultiIndex)