# Reshaping DataFrames

## Exercise 1 - Create a DataFrame

In [1]:
import numpy as np
import pandas as pd

# Wide-format sample: one row per city, one column per day's reading.
df1 = pd.DataFrame(
    [
        ("A", 22, 10, 25, 18, 12),
        ("B", 25, 14, 22, 15, 14),
        ("C", 28, 13, 26, 17, 18),
    ],
    columns=["city", "day1", "day2", "day3", "day4", "day5"],
)

df1

Unnamed: 0,city,day1,day2,day3,day4,day5
0,A,22,10,25,18,12
1,B,25,14,22,15,14
2,C,28,13,26,17,18


## Exercise 2 - melt function

* `melt` converts a wide DataFrame into a narrow (long) one.
* Wide: a high number of columns compared to the number of rows.
* Data is sometimes structured so that consecutive measurements or variables are represented as columns.
* In some cases, representing these columns as rows suits the task better.

In [2]:
# With no arguments every column -- including "city" -- is treated as a value
# column, so the result has only the "variable" and "value" columns.
df1.melt()

Unnamed: 0,variable,value
0,city,A
1,city,B
2,city,C
3,day1,22
4,day1,25
5,day1,28
6,day2,10
7,day2,14
8,day2,13
9,day3,25


## Exercise 3 - id_vars parameter

* Specify column or columns to use as identifier variables.

In [3]:
# Keep "city" as an identifier column; only the day columns are melted.
df1.melt(id_vars=["city"])

Unnamed: 0,city,variable,value
0,A,day1,22
1,B,day1,25
2,C,day1,28
3,A,day2,10
4,B,day2,14
5,C,day2,13
6,A,day3,25
7,B,day3,22
8,C,day3,26
9,A,day4,18


## Exercise 4 - var_name and value_name parameters

* By default, the two new columns are named `variable` and `value`.
* The `var_name` and `value_name` parameters of `melt` assign different names to them.

In [4]:
# var_name is documented as a scalar label. A list is only meaningful when the
# columns are a MultiIndex, and newer pandas releases reject a list-like here
# with a ValueError, so pass the plain string.
df1.melt(id_vars=["city"], var_name="date", value_name="temperature")

Unnamed: 0,city,date,temperature
0,A,day1,22
1,B,day1,25
2,C,day1,28
3,A,day2,10
4,B,day2,14
5,C,day2,13
6,A,day3,25
7,B,day3,22
8,C,day3,26
9,A,day4,18


## Exercise 5 - sort the melted rows by city and date

In [5]:
# Reshape to long form, then order the rows so each city's readings are
# contiguous and in day order.
# - var_name is passed as a scalar string: a list-like is only valid for
#   MultiIndex columns and is rejected by newer pandas releases.
# - "date" holds strings, so the sort is lexical; that matches day order here
#   only because every label has a single digit (day1..day5).
# - ignore_index=True renumbers the result 0..n-1 after sorting.
(
    df1
    .melt(id_vars=["city"], var_name="date", value_name="temperature")
    .sort_values(by=["city", "date"], ignore_index=True)
)

Unnamed: 0,city,date,temperature
0,A,day1,22
1,A,day2,10
2,A,day3,25
3,A,day4,18
4,A,day5,12
5,B,day1,25
6,B,day2,14
7,B,day3,22
8,B,day4,15
9,B,day5,14


## Exercise 6 - stack function

* `stack` moves the column labels into the row index, adding a new inner-most index level.
* It returns a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame.
* If a DataFrame has a single-level column index, `stack` returns a Series whose index consists of (row label, column label) pairs of the original DataFrame.
* If a DataFrame already has a multi-level row index, `stack` adds one more level to it.

In [6]:
# Show the wide frame again as the reference point for stacking.
df1

Unnamed: 0,city,day1,day2,day3,day4,day5
0,A,22,10,25,18,12
1,B,25,14,22,15,14
2,C,28,13,26,17,18


In [7]:
# Move the column labels into an inner index level; with single-level columns
# the result is a Series indexed by (row label, column label).
df1.stack()

0  city     A
   day1    22
   day2    10
   day3    25
   day4    18
   day5    12
1  city     B
   day1    25
   day2    14
   day3    22
   day4    15
   day5    14
2  city     C
   day1    28
   day2    13
   day3    26
   day4    17
   day5    18
dtype: object

In [8]:
# Inspect the two-level index: each entry is a (row label, column label) pair.
df1.stack().index

MultiIndex([(0, 'city'),
            (0, 'day1'),
            (0, 'day2'),
            (0, 'day3'),
            (0, 'day4'),
            (0, 'day5'),
            (1, 'city'),
            (1, 'day1'),
            (1, 'day2'),
            (1, 'day3'),
            (1, 'day4'),
            (1, 'day5'),
            (2, 'city'),
            (2, 'day1'),
            (2, 'day2'),
            (2, 'day3'),
            (2, 'day4'),
            (2, 'day5')],
           )

## Exercise 7 - compare a DataFrame with its stacked version

In [9]:
# Original wide frame, shown for comparison with its stacked form.
df1

Unnamed: 0,city,day1,day2,day3,day4,day5
0,A,22,10,25,18,12
1,B,25,14,22,15,14
2,C,28,13,26,17,18


In [10]:
# (rows, columns) of the wide frame.
df1.shape

(3, 6)

In [11]:
# stack() returns a Series here; to_frame() wraps it in a one-column DataFrame
# (the column is labelled 0 because the Series has no name).
df1_stacked = df1.stack().to_frame()

df1_stacked

Unnamed: 0,Unnamed: 1,0
0,city,A
0,day1,22
0,day2,10
0,day3,25
0,day4,18
0,day5,12
1,city,B
1,day1,25
1,day2,14
1,day3,22


In [12]:
# The 3 x 6 wide frame becomes 18 rows x 1 column after stacking.
df1_stacked.shape

(18, 1)

In [13]:
# to_frame() keeps the two-level (row label, column label) index of the Series.
df1_stacked.index

MultiIndex([(0, 'city'),
            (0, 'day1'),
            (0, 'day2'),
            (0, 'day3'),
            (0, 'day4'),
            (0, 'day5'),
            (1, 'city'),
            (1, 'day1'),
            (1, 'day2'),
            (1, 'day3'),
            (1, 'day4'),
            (1, 'day5'),
            (2, 'city'),
            (2, 'day1'),
            (2, 'day2'),
            (2, 'day3'),
            (2, 'day4'),
            (2, 'day5')],
           )

In [14]:
# First stacked row: the single value stored at (row 0, "city").
df1_stacked.iloc[0]

0    A
Name: (0, city), dtype: object

In [15]:
# First row of the wide frame: all six column values for city A.
df1.iloc[0]

city     A
day1    22
day2    10
day3    25
day4    18
day5    12
Name: 0, dtype: object

In [16]:
# The first six stacked rows hold the same values as df1.iloc[0].
df1_stacked.iloc[:6]

Unnamed: 0,Unnamed: 1,0
0,city,A
0,day1,22
0,day2,10
0,day3,25
0,day4,18
0,day5,12


## Exercise 8 - create a DataFrame with multi-index

In [17]:
# Seed a dedicated generator so the sample values, and every table derived
# from df2 further down, are identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Five (first, second) row labels; note that there is no ("B", 3) entry.
tuples = [("A", 1), ("A", 2), ("A", 3), ("B", 1), ("B", 2)]
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

# 5 x 2 frame of integers drawn from [0, 10).
df2 = pd.DataFrame(
    rng.integers(10, size=(5, 2)),
    index=index,
    columns=["col1", "col2"],
)

df2

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col2
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,6,8
A,2,9,3
A,3,1,6
B,1,5,0
B,2,4,3


In [18]:
# Two-level row index with level names "first" and "second".
df2.index

MultiIndex([('A', 1),
            ('A', 2),
            ('A', 3),
            ('B', 1),
            ('B', 2)],
           names=['first', 'second'])

## Exercise 9 - index level of a MultiIndex

In [19]:
# Unique labels of each index level, one list per level.
df2.index.levels

FrozenList([['A', 'B'], [1, 2, 3]])

In [20]:
# Number of row-index levels before stacking.
len(df2.index.levels)

2

In [21]:
# Stacking moves the column labels ("col1", "col2") into a third, inner-most
# index level; to_frame() turns the resulting Series into a one-column frame.
df2_stacked = df2.stack().to_frame()

df2_stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,col1,6
A,1,col2,8
A,2,col1,9
A,2,col2,3
A,3,col1,1
A,3,col2,6
B,1,col1,5
B,1,col2,0
B,2,col1,4
B,2,col2,3


In [22]:
# The former column labels now appear as the last index level.
df2_stacked.index.levels

FrozenList([['A', 'B'], [1, 2, 3], ['col1', 'col2']])

In [23]:
# One more index level than df2 had before stacking.
len(df2_stacked.index.levels)

3

## Exercise 10 - unstack

In [24]:
# Multi-indexed frame used as the starting point for unstack().
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col2
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,6,8
A,2,9,3
A,3,1,6
B,1,5,0
B,2,4,3


In [25]:
# Pivot the inner-most index level ("second") into the columns. There is no
# ("B", 3) row, so those cells become NaN and the values are shown as floats.
df2.unstack()

Unnamed: 0_level_0,col1,col1,col1,col2,col2,col2
second,1,2,3,1,2,3
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,6.0,9.0,1.0,8.0,3.0,6.0
B,5.0,4.0,,0.0,3.0,


In [26]:
# 2 "first" labels x (2 columns * 3 "second" labels).
df2.unstack().shape

(2, 6)

In [27]:
# Shape of the original frame, for comparison with the unstacked shape.
df2.shape

(5, 2)

In [28]:
# After unstacking, only the "first" level remains in the row index.
df2.unstack().index

Index(['A', 'B'], dtype='object', name='first')

## Exercise 11 - fill_value parameter

In [29]:
# Default behaviour: combinations missing from the index are filled with NaN.
df2.unstack()

Unnamed: 0_level_0,col1,col1,col1,col2,col2,col2
second,1,2,3,1,2,3
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,6.0,9.0,1.0,8.0,3.0,6.0
B,5.0,4.0,,0.0,3.0,


In [30]:
# fill_value replaces the cells that would otherwise be NaN; with no NaN
# introduced, the values stay integers.
df2.unstack(fill_value=0)

Unnamed: 0_level_0,col1,col1,col1,col2,col2,col2
second,1,2,3,1,2,3
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,6,9,1,8,3,6
B,5,4,0,0,3,0


## Exercise 12 - explode function

* Transform each element of a list-like to a row, replicating index values.

In [31]:
# One row whose "measurement" cell holds a list; the scalar "day" value is
# broadcast to every row.
df = pd.DataFrame(
    dict(
        ID=["a", "b", "c"],
        measurement=[4, 6, [2, 3, 8]],
        day=1,
    )
)

df

Unnamed: 0,ID,measurement,day
0,a,4,1
1,b,6,1
2,c,"[2, 3, 8]",1


In [32]:
# Each list element gets its own row; the original index label (2) repeats.
df.explode("measurement")

Unnamed: 0,ID,measurement,day
0,a,4,1
1,b,6,1
2,c,2,1
2,c,3,1
2,c,8,1


In [33]:
# ignore_index=True renumbers the exploded rows 0..n-1 instead of repeating labels.
df.explode("measurement", ignore_index=True)

Unnamed: 0,ID,measurement,day
0,a,4,1
1,b,6,1
2,c,2,1
3,c,3,1
4,c,8,1


## Exercise 13 - explode multiple columns

* Exploding several columns at once is available from pandas version 1.3.0 onward.

In [34]:
# "ID" and "measurement" hold lists of equal length in each row (scalars in
# row 1), so both columns can be exploded together.
df = pd.DataFrame(
    dict(
        ID=[["a", "b"], "b", ["b", "c", "d"]],
        measurement=[[4, 9], 6, [2, 3, 8]],
        day=1,
    )
)

df

Unnamed: 0,ID,measurement,day
0,"[a, b]","[4, 9]",1
1,b,6,1
2,"[b, c, d]","[2, 3, 8]",1


In [35]:
# Exploding only "measurement" leaves the lists in "ID" untouched.
df.explode("measurement", ignore_index=True)

Unnamed: 0,ID,measurement,day
0,"[a, b]",4,1
1,"[a, b]",9,1
2,b,6,1
3,"[b, c, d]",2,1
4,"[b, c, d]",3,1
5,"[b, c, d]",8,1


In [36]:
# Important: within each row, the exploded columns must have matching element
# counts (a scalar counts as one element); otherwise pandas raises a ValueError.
df.explode(["ID", "measurement"], ignore_index=True)

Unnamed: 0,ID,measurement,day
0,a,4,1
1,b,9,1
2,b,6,1
3,b,2,1
4,c,3,1
5,d,8,1


## Exercise 14 - explode columns one after another

In [37]:
# Here the list-valued cells sit in different rows of "ID" and "measurement",
# so the per-row element counts of the two columns do not match.
df = pd.DataFrame(
    dict(
        ID=[["a", "b"], "b", "c"],
        measurement=[4, 6, [2, 3, 8]],
        day=1,
    )
)

df

Unnamed: 0,ID,measurement,day
0,"[a, b]",4,1
1,b,6,1
2,c,"[2, 3, 8]",1


In [38]:
# "ID" and "measurement" have different element counts per row, so they cannot
# be exploded in a single call; explode them one after the other instead.
df.explode("ID", ignore_index=True).explode("measurement", ignore_index=True)

Unnamed: 0,ID,measurement,day
0,a,4,1
1,b,4,1
2,b,6,1
3,c,2,1
4,c,3,1
5,c,8,1
