# Proof of Series is 1d and datframe is 2d 

In [1]:
# https://www.youtube.com/live/QzoRUwz8DoM?si=XHO68I1HnUD5tsWZ&t=847

In [2]:
import pandas as pd
import numpy as np

# Having multiple index in series

In [3]:
index_val = [('cse',2019), ('csed',2020), ('ecse',2021), ('csed',2022), ('cse',2023),('rcse',2024), ('csed',2025)]
series = pd.Series(np.random.randint(1,100,7), index=index_val)
print(series)
print('|------------------------|')
print(series[('csed', 2025)]) 

(cse, 2019)     47
(csed, 2020)    93
(ecse, 2021)    33
(csed, 2022)    89
(cse, 2023)     88
(rcse, 2024)    72
(csed, 2025)    18
dtype: int32
|------------------------|
18


In [4]:
# Why making multindex series in this way is not a good option ? -> https://www.youtube.com/live/QzoRUwz8DoM?si=ykdCCJD2x3T0JRqY&t=1187
# because we can't access by single index like for example: Only `csed` or `2025`

In [5]:
# The correct way of making multindex series is to use `pd.MultiIndex.from_tuples` or pd.MultiIndex.from_product` -> 
# https://www.youtube.com/live/QzoRUwz8DoM?si=tAYvRJUTiSWxtSFO&t=1247

In [6]:
# 1. Using `pd.MultiIndex.from_tuples` to create a multi-index series

index_val = [('cse',2019), ('csed',2020), ('ecse',2021), ('csed',2022), ('cse',2023),('rcse',2024), ('csed',2025)]
multi_index1 = pd.MultiIndex.from_tuples(index_val)
print(multi_index1)
print(multi_index1.levels) # level gives all the unique values in the 
print(multi_index1.levels[0])
print(multi_index1.levels[1])

MultiIndex([( 'cse', 2019),
            ('csed', 2020),
            ('ecse', 2021),
            ('csed', 2022),
            ( 'cse', 2023),
            ('rcse', 2024),
            ('csed', 2025)],
           )
[['cse', 'csed', 'ecse', 'rcse'], [2019, 2020, 2021, 2022, 2023, 2024, 2025]]
Index(['cse', 'csed', 'ecse', 'rcse'], dtype='object')
Index([2019, 2020, 2021, 2022, 2023, 2024, 2025], dtype='int64')


In [7]:
# 2. Using `pd.MultiIndex.from_product` to create a multi-index series
# so it will associate each value in the first list with all the values in the second list

index_val =  [('cse', 'csed', 'ecse', 'rcse'), (2024, 2025)]
multi_index2 = pd.MultiIndex.from_product(index_val)
print(multi_index2)

print(multi_index2.levels)

MultiIndex([( 'cse', 2024),
            ( 'cse', 2025),
            ('csed', 2024),
            ('csed', 2025),
            ('ecse', 2024),
            ('ecse', 2025),
            ('rcse', 2024),
            ('rcse', 2025)],
           )
[['cse', 'csed', 'ecse', 'rcse'], [2024, 2025]]


In [8]:
# creating a series with multindex object that we make above
rng = np.random.default_rng(77) # for reproducibility
multiIndex = pd.Series(rng.integers(1,100,8), index=multi_index2)
# so its like a hierarical tree structure, see -> https://www.youtube.com/live/QzoRUwz8DoM?si=HVVyVHUMJJw-WmjA&t=1547
multiIndex

cse   2024     6
      2025    78
csed  2024    63
      2025    55
ecse  2024    79
      2025    25
rcse  2024    86
      2025    34
dtype: int64

In [9]:
# how to fetch items from multiIndex series ?
multiIndex.loc[('csed')] # so it will give all the values of cse in 2024

2024    63
2025    55
dtype: int64

In [10]:
multiIndex.loc[('csed',2025)] # so it will give all the values of cse in 2024

np.int64(55)

In [11]:
# multindex series to dataframe using 
# `unstack` method
multiIndex.unstack()
# so it will give the values of `cse`,`csed`,`ecse`,`rcse` of 2024 and 2025 in a dataframe format

Unnamed: 0,2024,2025
cse,6,78
csed,63,55
ecse,79,25
rcse,86,34


In [12]:
pd.Series(index=multi_index1, data=[1,2,3,4,5,6,7]) # so it will give the values of cse in 2024 and 2025 in a dataframe format

cse   2019    1
csed  2020    2
ecse  2021    3
csed  2022    4
cse   2023    5
rcse  2024    6
csed  2025    7
dtype: int64

In [13]:
# multindex series to dataframe, using `unstack` method

pd.Series(index=multi_index1, data=[1,2,3,4,5,6,7]).unstack() # so it will give the values of cse in 2024 and 2025 in a dataframe format

Unnamed: 0,2019,2020,2021,2022,2023,2024,2025
cse,1.0,,,,5.0,,
csed,,2.0,,4.0,,,7.0
ecse,,,3.0,,,,
rcse,,,,,,6.0,


In [14]:
# dataframe to multindex series using `stack` method
pd.Series(index=multi_index1, data=[1,2,3,4,5,6,7]).unstack().stack()

# Note: stack() and unstack() method explain in detail in below section

cse   2019    1.0
      2023    5.0
csed  2020    2.0
      2022    4.0
      2025    7.0
ecse  2021    3.0
rcse  2024    6.0
dtype: float64

## [But Why to use multi index series? 🤔](https://www.youtube.com/live/QzoRUwz8DoM?si=7RidvYCQQtHNAuqQ&t=2017)

## multiIndex dataframes

In [15]:
# multindex dataframes -> https://www.youtube.com/live/QzoRUwz8DoM?si=3ITFbPdL4nivLYYH&t=2137

In [16]:
multi_index1

MultiIndex([( 'cse', 2019),
            ('csed', 2020),
            ('ecse', 2021),
            ('csed', 2022),
            ( 'cse', 2023),
            ('rcse', 2024),
            ('csed', 2025)],
           )

In [17]:
branch_df1 = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
        [13, 14, 15],
        [16, 17, 18],
        [19, 20, 21],
    ],
    index = multi_index1,
    columns= ['avg_package', 'avg_salary', 'avg_ability']
)
branch_df1.info()
branch_df1

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7 entries, ('cse', np.int64(2019)) to ('csed', np.int64(2025))
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   avg_package  7 non-null      int64
 1   avg_salary   7 non-null      int64
 2   avg_ability  7 non-null      int64
dtypes: int64(3)
memory usage: 774.0+ bytes


Unnamed: 0,Unnamed: 1,avg_package,avg_salary,avg_ability
cse,2019,1,2,3
csed,2020,4,5,6
ecse,2021,7,8,9
csed,2022,10,11,12
cse,2023,13,14,15
rcse,2024,16,17,18
csed,2025,19,20,21


In [18]:
# 1 important thing about indexes and columns in multindexes dataframes, is that pandas not treat indexes and columns as a single entity, so we can access them separately
# https://www.youtube.com/live/QzoRUwz8DoM?si=0VRVHTmiM_JLRkeu&t=2417
# so we can also make dataframe where columns will have hierarchical structure

In [19]:
# multiindex df from columns perspective -> https://www.youtube.com/live/QzoRUwz8DoM?si=adFShaAS0Dqztlmn&t=2547
branch_df2 = pd.DataFrame(
    [
        [1, 2, 10, 2],
        [3, 4, 0, 0],
        [5, 6, 10, 1],
        [7, 8, 0, 0],
    ],
    index = [2019, 2020, 2021, 2022],
    columns = pd.MultiIndex.from_product([['delhi', 'mumbai'], ['avg_package', 'students']])
)

branch_df2


Unnamed: 0_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,avg_package,students,avg_package,students
2019,1,2,10,2
2020,3,4,0,0
2021,5,6,10,1
2022,7,8,0,0


In [20]:
branch_df2['delhi','avg_package'] 

2019    1
2020    3
2021    5
2022    7
Name: (delhi, avg_package), dtype: int64

In [21]:
branch_df2.loc[2022] # so we get multiIndex series

delhi   avg_package    7
        students       8
mumbai  avg_package    0
        students       0
Name: 2022, dtype: int64

In [22]:
multi_index1

MultiIndex([( 'cse', 2019),
            ('csed', 2020),
            ('ecse', 2021),
            ('csed', 2022),
            ( 'cse', 2023),
            ('rcse', 2024),
            ('csed', 2025)],
           )

In [23]:
# MultiIndex in terms of both rows and columns

branch_df3 = pd.DataFrame(
    data = [
        [1, 2, 10, 2], # we give 8 rows because we have 8 rows in total (in `multi_index1`), and we give 4 columns because we have 4 columns in total 
        [3, 4, 0, 0],
        [5, 6, 10, 1],
        [7, 8, 0, 0],
        [9, 10, 0, 0],
        [11, 12, 0, 0],
        [13, 14, 0, 0],
    ],
    index = multi_index1,
    columns = pd.MultiIndex.from_product([['delhi', 'mumbai'], ['avg_package', 'students']])
)
branch_df3

    

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,10,2
csed,2020,3,4,0,0
ecse,2021,5,6,10,1
csed,2022,7,8,0,0
cse,2023,9,10,0,0
rcse,2024,11,12,0,0
csed,2025,13,14,0,0


In [24]:
branch_df3['delhi']['avg_package']['cse'][2019] # so we can say its 4d data(because we 4 indexes to reach a single value) represented in lower dimension(2d) `dataframe` 

np.int64(1)

## Stacking and unstacking in detail(dataframe methods)

In [25]:

branch_df4 = pd.DataFrame(
        data = [
        [1,2],
        [3,4],
        [5,6],
        [7,8],
        [9,10],
        [11,12],
        [13,14],
        [15,16],
        ],
        index = pd.MultiIndex.from_product([['cse','ece'], [2019, 2020, 2021, 2022]]),
        columns = ['avg_package', 'students']
    )
# -> 
branch_df4



Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,2
cse,2020,3,4
cse,2021,5,6
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,13,14
ece,2022,15,16


In [26]:
# explaanation of what happens on unstack() call on branch_df4 -> https://www.youtube.com/live/QzoRUwz8DoM?si=-ZuQRphEo_AlkBme&t=3027

branch_df4.unstack()

Unnamed: 0_level_0,avg_package,avg_package,avg_package,avg_package,students,students,students,students
Unnamed: 0_level_1,2019,2020,2021,2022,2019,2020,2021,2022
cse,1,3,5,7,2,4,6,8
ece,9,11,13,15,10,12,14,16


In [27]:
# Exaplanation of what happens when unstack calls 2 times on branch_df4 - https://www.youtube.com/live/QzoRUwz8DoM?si=jYpMl2Hd-_nsyF89&t=3227
branch_df4.unstack().unstack()
# from this what we observe is, that `unstack` keeps convert index(last index first) into columns(nested inside previous columns)

avg_package  2019  cse     1
                   ece     9
             2020  cse     3
                   ece    11
             2021  cse     5
                   ece    13
             2022  cse     7
                   ece    15
students     2019  cse     2
                   ece    10
             2020  cse     4
                   ece    12
             2021  cse     6
                   ece    14
             2022  cse     8
                   ece    16
dtype: int64

In [28]:
branch_df4

Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,2
cse,2020,3,4
cse,2021,5,6
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,13,14
ece,2022,15,16


In [29]:
# now we will do opposite, means `stack` -> https://www.youtube.com/live/QzoRUwz8DoM?si=K2Fo8VB-be616Mjg&t=3417
# so stack converts columns into indexes, and it will keep converting columns into indexes(last column first) until all columns are converted into indexes
branch_df4.stack()


cse  2019  avg_package     1
           students        2
     2020  avg_package     3
           students        4
     2021  avg_package     5
           students        6
     2022  avg_package     7
           students        8
ece  2019  avg_package     9
           students       10
     2020  avg_package    11
           students       12
     2021  avg_package    13
           students       14
     2022  avg_package    15
           students       16
dtype: int64

In [30]:
""" And one thing more we observe is that if all the columns or rows are converting into indexes or columns respectively, 
then it will convert into a series, automatically """

' And one thing more we observe is that if all the columns or rows are converting into indexes or columns respectively, \nthen it will convert into a series, automatically '

In [31]:
# so what we done till now in this section, is that :
# 1. First we represent higher dimension data in lower dimension using dataframe(2d)
""" 2. And then we deciding(using `stack` and `unstack` methods)  how much row(we can say index too) will handle, 
and how much column will handle  """

' 2. And then we deciding(using `stack` and `unstack` methods)  how much row(we can say index too) will handle, \nand how much column will handle  '

## Wroking with multi-index dataframes
- [See this video to learn the topics realted to this section](https://www.youtube.com/live/QzoRUwz8DoM?si=gzsSAlxxWBmmIm9k&t=4137)

In [32]:

# first we will make 4d dataframe
branch_df5 = pd.DataFrame(
    data=[
        [1,2,0,0],
        [3,4,0,0],
        [5,6,0,0],
        [7,8,0,0],
        [9,10,0,0],
        [11,12,0,0],
        [13,14,0,0],
        [15,16,0,0],
    ],
    index = pd.MultiIndex.from_product([['cse','ece'], [2019, 2020, 2021, 2022]]),
    columns = pd.MultiIndex.from_product([['delhi','mumbai'], ['avg_package', 'students']])
)
branch_df5

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,0,0


In [33]:
# head and tail -> https://www.youtube.com/live/QzoRUwz8DoM?si=PoBJPrbyw4JkLU20&t=4097
branch_df5.head(5) # so it will give first 5 rows, btw 5 inside head an tail is by default

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
ece,2019,9,10,0,0


In [34]:
# shape -> https://www.youtube.com/live/QzoRUwz8DoM?si=Z5wzEvpiSGrFFZor&t=4117
branch_df5.shape # remember we said that we convert higher dimension data into lower dimension, here in 8 rows and 4 columns,
""" so we have 32 values in total, so shape will be (8,4) but if we see the shape of `branch_df5` it is (8,4) but it has 32 values in total,
so we can say that it is a 4d data represented in dataframe
"""

' so we have 32 values in total, so shape will be (8,4) but if we see the shape of `branch_df5` it is (8,4) but it has 32 values in total,\nso we can say that it is a 4d data represented in dataframe\n'

In [35]:
# info() -> https://www.youtube.com/live/QzoRUwz8DoM?si=Qs8EU_i5-8Gr6DJ1&t=4137
branch_df5.info() 

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8 entries, ('cse', np.int64(2019)) to ('ece', np.int64(2022))
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   (delhi, avg_package)   8 non-null      int64
 1   (delhi, students)      8 non-null      int64
 2   (mumbai, avg_package)  8 non-null      int64
 3   (mumbai, students)     8 non-null      int64
dtypes: int64(4)
memory usage: 632.0+ bytes


In [36]:
# duplication check -> https://www.youtube.com/live/QzoRUwz8DoM?si=BBPWzrdn4SVTqPZy&t=4197
branch_df5.duplicated() # so it will give the boolean values of each row, if it is duplicated or not

cse  2019    False
     2020    False
     2021    False
     2022    False
ece  2019    False
     2020    False
     2021    False
     2022    False
dtype: bool

In [37]:
# isnull() -> https://www.youtube.com/live/QzoRUwz8DoM?si=NPi9ks0oVDcSf3kH&t=4207
branch_df5.isnull() # so it will give the boolean values of each row and column, if it is null or not

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,False,False,False,False
cse,2020,False,False,False,False
cse,2021,False,False,False,False
cse,2022,False,False,False,False
ece,2019,False,False,False,False
ece,2020,False,False,False,False
ece,2021,False,False,False,False
ece,2022,False,False,False,False


In [38]:
# extracting rows single -> https://www.youtube.com/live/QzoRUwz8DoM?si=0IcDdhBX63VtCZTn&t=4267
branch_df5.loc['cse'].loc[2022] # or pass in tuple
branch_df5.loc[('cse',2022)] # so it same as what we do in above line

delhi   avg_package    7
        students       8
mumbai  avg_package    0
        students       0
Name: (cse, 2022), dtype: int64

In [39]:
branch_df5

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,0,0


In [40]:
# let;s go more specific, we wanna find average package of ece of delhi , in 2022
# se we go from less specific to more specific and specific
branch_df5.loc[('ece',2022),('delhi','avg_package')]

np.int64(15)

In [41]:
# multiple
branch_df5.iloc[:5:2]
# you can do same with loc -> https://www.youtube.com/live/QzoRUwz8DoM?si=An5qUFCtEsCDQOWb&t=4407

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2021,5,6,0,0
ece,2019,9,10,0,0


In [42]:
branch_df5.loc[('cse',2019): ('ece',2019):2]
 # But Note: unlike `iloc` indexing, `loc` indexing is inclusive, so it will include the last index too

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2021,5,6,0,0
ece,2019,9,10,0,0


In [43]:
# Extracting cols
# extract only `delhi` column
branch_df5['delhi']

Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,2
cse,2020,3,4
cse,2021,5,6
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,13,14
ece,2022,15,16


In [44]:
branch_df5.iloc[:,1:3] 

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,students,avg_package
cse,2019,2,0
cse,2020,4,0
cse,2021,6,0
cse,2022,8,0
ece,2019,10,0
ece,2020,12,0
ece,2021,14,0
ece,2022,16,0


In [45]:
# let's do indexing in both rows and columns
branch_df5.iloc[:5:4,1:3] # with fancy indexing `branch_df5.iloc[[0,4],[1,2]]`

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,students,avg_package
cse,2019,2,0
ece,2019,10,0


In [46]:
# sort index of multiIndex dataframe -> https://www.youtube.com/live/QzoRUwz8DoM?si=XlaKEdkWsC-__ZM6&t=4717
branch_df5.sort_index(ascending=False) # so it sort all index(of all levels) in descending order



Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
ece,2022,15,16,0,0
ece,2021,13,14,0,0
ece,2020,11,12,0,0
ece,2019,9,10,0,0
cse,2022,7,8,0,0
cse,2021,5,6,0,0
cse,2020,3,4,0,0
cse,2019,1,2,0,0


In [47]:
# but what if we wanna sort branch index(level 0 index) in descending order but years(level 2 index) in ascending order
branch_df5.sort_index(ascending=[False,True]) 

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,0,0
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0


In [48]:
# sort on level1 only
branch_df5.sort_index(level=1, ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
ece,2022,15,16,0,0
cse,2022,7,8,0,0
ece,2021,13,14,0,0
cse,2021,5,6,0,0
ece,2020,11,12,0,0
cse,2020,3,4,0,0
ece,2019,9,10,0,0
cse,2019,1,2,0,0


In [49]:
# sorting columns (sort on all columns)
branch_df5.sort_index(axis=1,ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mumbai,mumbai,delhi,delhi
Unnamed: 0_level_1,Unnamed: 1_level_1,students,avg_package,students,avg_package
cse,2019,0,0,2,1
cse,2020,0,0,4,3
cse,2021,0,0,6,5
cse,2022,0,0,8,7
ece,2019,0,0,10,9
ece,2020,0,0,12,11
ece,2021,0,0,14,13
ece,2022,0,0,16,15


In [50]:
branch_df5

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,0,0


In [51]:
# transpose -> make columns as index and index as columns
branch_df5.T


Unnamed: 0_level_0,Unnamed: 1_level_0,cse,cse,cse,cse,ece,ece,ece,ece
Unnamed: 0_level_1,Unnamed: 1_level_1,2019,2020,2021,2022,2019,2020,2021,2022
delhi,avg_package,1,3,5,7,9,11,13,15
delhi,students,2,4,6,8,10,12,14,16
mumbai,avg_package,0,0,0,0,0,0,0,0
mumbai,students,0,0,0,0,0,0,0,0


In [52]:
# swap level of indexes
branch_df5.swaplevel() # by default axis is 0, so it will swap level 0 and 1, "on indexes"

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
2019,cse,1,2,0,0
2020,cse,3,4,0,0
2021,cse,5,6,0,0
2022,cse,7,8,0,0
2019,ece,9,10,0,0
2020,ece,11,12,0,0
2021,ece,13,14,0,0
2022,ece,15,16,0,0


In [53]:
# swap level of columns

branch_df5.swaplevel(axis=1) # means it will swap level 1 and 2 "on columns" 

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_package,students,avg_package,students
Unnamed: 0_level_1,Unnamed: 1_level_1,delhi,delhi,mumbai,mumbai
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,0,0


## Long/Tall vs wide data, and `melt`, `pivot` methods

In [54]:
# Understanding difference between long and wide data -> https://www.youtube.com/live/QzoRUwz8DoM?si=sa7lIJTjXIZ-v5hf&t=5277


In [55]:
# Wide format -> https://www.youtube.com/live/QzoRUwz8DoM?si=x3AGp4LosVywive9&t=5337

In [76]:
# --------------------------------------- Melt  ----------------------------------------
# Understand `melt` method with examples -> https://www.youtube.com/live/QzoRUwz8DoM?si=lWX6pmjqXGhgyjmO&t=5577
# Real world examples ->  https://www.youtube.com/live/QzoRUwz8DoM?si=cY-SJmPT0N5rVnJo&t=6117

In [57]:
# `melt` is used to convert wide format to long format
toBeMeltdf = pd.DataFrame({'cse':[120]})
toBeMeltdf

Unnamed: 0,cse
0,120


In [58]:
toBeMeltdf.melt() 


Unnamed: 0,variable,value
0,cse,120


In [59]:
pd.DataFrame({'cse':[120], 'ece':[130],'mech':[140]})

Unnamed: 0,cse,ece,mech
0,120,130,140


In [60]:
# so you can also provide `var_name` and `value_name` to rename columns, default is 'variable' and 'value' 
pd.DataFrame({'cse':[120], 'ece':[130],'mech':[140]}).melt(var_name='branch', value_name='no_of_students')

Unnamed: 0,branch,no_of_students
0,cse,120
1,ece,130
2,mech,140


In [61]:
import pandas as pd

# prevent specific columns from being included in the melt operation

meltdf2 = pd.DataFrame(
    {
        'branch': ['cse', 'ece', 'mech'],
        '2020': [100, 150, 60],
        '2021': [120, 130, 80],
        '2022': [150, 140, 70]
    }
)

meltdf2

Unnamed: 0,branch,2020,2021,2022
0,cse,100,120,150
1,ece,150,130,140
2,mech,60,80,70


In [62]:
# we wannna exclude 'branch' column to be come in melt operation
meltdf2.melt(id_vars=['branch'], var_name='year', value_name='no_of_students')

Unnamed: 0,branch,year,no_of_students
0,cse,2020,100
1,ece,2020,150
2,mech,2020,60
3,cse,2021,120
4,ece,2021,130
5,mech,2021,80
6,cse,2022,150
7,ece,2022,140
8,mech,2022,70


In [63]:
# ---------------------------------- Pivot table ---------------------------------------
# Understand `pivot table` method with examples -> https://www.youtube.com/live/UE6DmRQJ2w8?si=qYiJLUaFiTKgyJK0&t=407

In [None]:
# we gernerally use pivot tables on categotical columns

import seaborn as sns
artificial_df = sns.load_dataset('tips')
# seaborn provide some toy datasets

artificial_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [77]:
artificial_df.groupby('sex')['total_bill'].mean()

  artificial_df.groupby('sex')['total_bill'].mean()


sex
Male      20.744076
Female    18.056897
Name: total_bill, dtype: float64

In [92]:
artificial_df.groupby(['sex', 'smoker'])['total_bill'].mean().unstack().rename(columns={'Yes':'smoker', 'No':'not_smoker'})

  artificial_df.groupby(['sex', 'smoker'])['total_bill'].mean().unstack().rename(columns={'Yes':'smoker', 'No':'not_smoker'})


smoker,smoker,not_smoker
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,22.2845,19.791237
Female,17.977879,18.105185


In [None]:
# the shortcut of above cell code is pivot table 
# Understand this by example and explanation ->git  https://www.youtube.com/live/UE6DmRQJ2w8?si=525LQc7USJR1Qwkg&t=857

artificial_df.pivot_table(index='sex', columns='smoker', values='total_bill', aggfunc='mean')
# by_default also `aggfunc` value is mean, we can use other functions like sum, min, max, std, var, count etc.

  artificial_df.pivot_table(index='sex', columns='smoker', values='total_bill', aggfunc='mean')


smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,22.2845,19.791237
Female,17.977879,18.105185
