## Reshaping a DataFrame using Pandas melt()

1. Simplest melt
2. Displaying custom name
3. Displaying multiple ids
4. Specifying columns to melt
5. Pandas melt
6. Bonus: reshaping COVID-19 time-series data
https://towardsdatascience.com/reshaping-a-dataframe-using-pandas-melt-83a151ce1907

In [3]:
# %load command.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)

In [4]:
df_wide = pd.DataFrame({
   "Country": ["France", "US", "UK"],
   "22/01/2020": [1,2,3],
   "23/01/2020": [4,5,6],
   "24/01/2020": [7,8,9],
   "25/01/2020": [10,11,12],
   "26/01/2020": [13,14,15],
  }
)
df_wide

Unnamed: 0,Country,22/01/2020,23/01/2020,24/01/2020,25/01/2020,26/01/2020
0,France,1,4,7,10,13
1,US,2,5,8,11,14
2,UK,3,6,9,12,15


In [6]:
# without any argument
df_wide.melt()

Unnamed: 0,variable,value
0,Country,France
1,Country,US
2,Country,UK
3,22/01/2020,1
4,22/01/2020,2
5,22/01/2020,3
6,23/01/2020,4
7,23/01/2020,5
8,23/01/2020,6
9,24/01/2020,7


In [7]:
df_wide.melt(id_vars='Country')

Unnamed: 0,Country,variable,value
0,France,22/01/2020,1
1,US,22/01/2020,2
2,UK,22/01/2020,3
3,France,23/01/2020,4
4,US,23/01/2020,5
5,UK,23/01/2020,6
6,France,24/01/2020,7
7,US,24/01/2020,8
8,UK,24/01/2020,9
9,France,25/01/2020,10


In [8]:
# with custome name

df_wide.melt(id_vars='Country',
            var_name='Date', 
            value_name='Cases')

Unnamed: 0,Country,Date,Cases
0,France,22/01/2020,1
1,US,22/01/2020,2
2,UK,22/01/2020,3
3,France,23/01/2020,4
4,US,23/01/2020,5
5,UK,23/01/2020,6
6,France,24/01/2020,7
7,US,24/01/2020,8
8,UK,24/01/2020,9
9,France,25/01/2020,10


In [9]:
# Multiple ids

df_wide = pd.DataFrame({
   "Country": ["France", "US", "UK"],
   "Lat": [31.8257, 40.0, 55.3781],
   "Long": [117.2264, -100.0, -3.436],
   "22/01/2020": [1,2,3],
   "23/01/2020": [4,5,6],
   "24/01/2020": [7,8,9],
   "25/01/2020": [10,11,12],
   "26/01/2020": [13,14,15],
  }
)
df_wide

Unnamed: 0,Country,Lat,Long,22/01/2020,23/01/2020,24/01/2020,25/01/2020,26/01/2020
0,France,31.83,117.23,1,4,7,10,13
1,US,40.0,-100.0,2,5,8,11,14
2,UK,55.38,-3.44,3,6,9,12,15


In [10]:
df_wide.melt(id_vars=['Country', 'Lat', 'Long'],
            var_name='Date',
            value_name='Cases')

Unnamed: 0,Country,Lat,Long,Date,Cases
0,France,31.83,117.23,22/01/2020,1
1,US,40.0,-100.0,22/01/2020,2
2,UK,55.38,-3.44,22/01/2020,3
3,France,31.83,117.23,23/01/2020,4
4,US,40.0,-100.0,23/01/2020,5
5,UK,55.38,-3.44,23/01/2020,6
6,France,31.83,117.23,24/01/2020,7
7,US,40.0,-100.0,24/01/2020,8
8,UK,55.38,-3.44,24/01/2020,9
9,France,31.83,117.23,25/01/2020,10


In [13]:
# Specify the columns to melt
df_wide.melt(
id_vars=['Country', 'Lat', 'Long'],
value_vars=['24/01/2020', '25/01/2020'],
var_name='Date',
value_name='Cases')

Unnamed: 0,Country,Lat,Long,Date,Cases
0,France,31.83,117.23,24/01/2020,7
1,US,40.0,-100.0,24/01/2020,8
2,UK,55.38,-3.44,24/01/2020,9
3,France,31.83,117.23,25/01/2020,10
4,US,40.0,-100.0,25/01/2020,11
5,UK,55.38,-3.44,25/01/2020,12


In [14]:
# Pandas melt
pd.melt(df_wide, id_vars=['Country', 'Lat', 'Long'])

Unnamed: 0,Country,Lat,Long,variable,value
0,France,31.83,117.23,22/01/2020,1
1,US,40.0,-100.0,22/01/2020,2
2,UK,55.38,-3.44,22/01/2020,3
3,France,31.83,117.23,23/01/2020,4
4,US,40.0,-100.0,23/01/2020,5
5,UK,55.38,-3.44,23/01/2020,6
6,France,31.83,117.23,24/01/2020,7
7,US,40.0,-100.0,24/01/2020,8
8,UK,55.38,-3.44,24/01/2020,9
9,France,31.83,117.23,25/01/2020,10


### Bonus: Covid-19 time series data preprocessing

In [16]:
confirmed_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_df = pd.read_csv('time_series_covid19_recovered_global.csv')

In [17]:
confirmed_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,521,555,607,665,714,784,840,906,933,996
1,,Albania,41.15,20.17,0,0,0,0,0,0,...,416,433,446,467,475,494,518,539,548,562
2,,Algeria,28.03,1.66,0,0,0,0,0,0,...,1761,1825,1914,1983,2070,2160,2268,2418,2534,2629
3,,Andorra,42.51,1.52,0,0,0,0,0,0,...,601,601,638,646,659,673,673,696,704,713
4,,Angola,-11.2,17.87,0,0,0,0,0,0,...,19,19,19,19,19,19,19,19,24,24


In [18]:
print(len(confirmed_df.columns))
confirmed_df.columns

93


Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20', '2/12/20',
       '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20',
       '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20', '2/24/20',
       '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20', '3/1/20',
       '3/2/20', '3/3/20', '3/4/20', '3/5/20', '3/6/20', '3/7/20', '3/8/20',
       '3/9/20', '3/10/20', '3/11/20', '3/12/20', '3/13/20', '3/14/20',
       '3/15/20', '3/16/20', '3/17/20', '3/18/20', '3/19/20', '3/20/20',
       '3/21/20', '3/22/20', '3/23/20', '3/24/20', '3/25/20', '3/26/20',
       '3/27/20', '3/28/20', '3/29/20', '3/30/20', '3/31/20', '4/1/20',
       '4/2/20', '4/3/20', '4/4/20', '4/5/20', '4/6/20', '4/7/20', '4/8/20',
       '4/9/20', '4/10/20'

In [21]:
dates = confirmed_df.columns[4:]
dates

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20', '2/1/20', '2/2/20',
       '2/3/20', '2/4/20', '2/5/20', '2/6/20', '2/7/20', '2/8/20', '2/9/20',
       '2/10/20', '2/11/20', '2/12/20', '2/13/20', '2/14/20', '2/15/20',
       '2/16/20', '2/17/20', '2/18/20', '2/19/20', '2/20/20', '2/21/20',
       '2/22/20', '2/23/20', '2/24/20', '2/25/20', '2/26/20', '2/27/20',
       '2/28/20', '2/29/20', '3/1/20', '3/2/20', '3/3/20', '3/4/20', '3/5/20',
       '3/6/20', '3/7/20', '3/8/20', '3/9/20', '3/10/20', '3/11/20', '3/12/20',
       '3/13/20', '3/14/20', '3/15/20', '3/16/20', '3/17/20', '3/18/20',
       '3/19/20', '3/20/20', '3/21/20', '3/22/20', '3/23/20', '3/24/20',
       '3/25/20', '3/26/20', '3/27/20', '3/28/20', '3/29/20', '3/30/20',
       '3/31/20', '4/1/20', '4/2/20', '4/3/20', '4/4/20', '4/5/20', '4/6/20',
       '4/7/20', '4/8/20', '4/9/20', '4/10/20', '4/11/20', '4/12/20',
       '4/13/20', '4/14/20', '4/15

In [24]:
confirmed_df_long=confirmed_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
                                    value_vars=dates,
                                   var_name='Date',
                                   value_name='Confirmed')
confirmed_df_long

deaths_df_long = deaths_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
                            value_vars=dates, var_name='Date', value_name='Deaths')

recovered_df_long = recovered_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
                            value_vars=dates, var_name='Date', value_name='Recovered')

(confirmed_df_long.shape, deaths_df_long.shape, recovered_df_long.shape)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed
0,,Afghanistan,33.00,65.00,1/22/20,0
1,,Albania,41.15,20.17,1/22/20,0
2,,Algeria,28.03,1.66,1/22/20,0
3,,Andorra,42.51,1.52,1/22/20,0
4,,Angola,-11.20,17.87,1/22/20,0
...,...,...,...,...,...,...
23491,Saint Pierre and Miquelon,France,46.89,-56.32,4/19/20,1
23492,,South Sudan,6.88,31.31,4/19/20,4
23493,,Western Sahara,24.22,-12.89,4/19/20,6
23494,,Sao Tome and Principe,0.19,6.61,4/19/20,4


((23496, 6), (23496, 6), (22250, 6))

In [27]:
# Merging confirmed_df_long and deaths_df_long
full_table = confirmed_df_long.merge(
  deaths_df_long, 
  how='left',
  on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
)

full_table

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths
0,,Afghanistan,33.00,65.00,1/22/20,0,0
1,,Albania,41.15,20.17,1/22/20,0,0
2,,Algeria,28.03,1.66,1/22/20,0,0
3,,Andorra,42.51,1.52,1/22/20,0,0
4,,Angola,-11.20,17.87,1/22/20,0,0
...,...,...,...,...,...,...,...
23491,Saint Pierre and Miquelon,France,46.89,-56.32,4/19/20,1,0
23492,,South Sudan,6.88,31.31,4/19/20,4,0
23493,,Western Sahara,24.22,-12.89,4/19/20,6,0
23494,,Sao Tome and Principe,0.19,6.61,4/19/20,4,0


In [29]:
# Merging full_table and recovered_df_long
full_table = full_table.merge(
  recovered_df_long, 
  how='left',
  on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
)

full_table

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered_x,Recovered_y
0,,Afghanistan,33.00,65.00,1/22/20,0,0,0.00,0.00
1,,Albania,41.15,20.17,1/22/20,0,0,0.00,0.00
2,,Algeria,28.03,1.66,1/22/20,0,0,0.00,0.00
3,,Andorra,42.51,1.52,1/22/20,0,0,0.00,0.00
4,,Angola,-11.20,17.87,1/22/20,0,0,0.00,0.00
...,...,...,...,...,...,...,...,...,...
23491,Saint Pierre and Miquelon,France,46.89,-56.32,4/19/20,1,0,0.00,0.00
23492,,South Sudan,6.88,31.31,4/19/20,4,0,0.00,0.00
23493,,Western Sahara,24.22,-12.89,4/19/20,6,0,0.00,0.00
23494,,Sao Tome and Principe,0.19,6.61,4/19/20,4,0,0.00,0.00
