# Data Reshaping


Let me get the data on Covid again:

In [2]:
import pandas as pd
import glob
import os

# Collect every 2022 CSV in the folder. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to make the row order of the
# concatenated frame reproducible across machines and runs.
all_names = sorted(glob.glob(os.path.join('FilesToAggregate' , "*2022.csv")))

# pd.concat raises a generic ValueError on an empty list; raise the same
# exception type here, but with a message that points at the real cause.
if not all_names:
    raise ValueError("No files matching '*2022.csv' found in 'FilesToAggregate'")

# The source files are semicolon-separated.
dfs=[pd.read_csv(name,sep=";") for name in all_names]

# Stack the pieces vertically and renumber the rows from 0.
covid=pd.concat(dfs,ignore_index=True,copy=False)

Tabular data can come in a **long** or a **wide** shape. As you can see, the covid data is in the former (long):

In [3]:
# Preview the first 10 rows of the concatenated frame.
covid.head(10)

Unnamed: 0,regiao,estado,municipio,coduf,codmun,codRegiaoSaude,nomeRegiaoSaude,data,semanaEpi,populacaoTCU2019,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos,interior/metropolitana
0,Brasil,,,76,,,,2022-07-01,26,210147125.0,32434063.0,76045,671700,284,30873682.0,888681.0,
1,Brasil,,,76,,,,2022-07-02,26,210147125.0,32471847.0,37784,671858,158,30880584.0,919405.0,
2,Brasil,,,76,,,,2022-07-03,27,210147125.0,32490422.0,18575,671911,53,30906575.0,911936.0,
3,Brasil,,,76,,,,2022-07-04,27,210147125.0,32535923.0,45501,672033,122,30967114.0,896776.0,
4,Brasil,,,76,,,,2022-07-05,27,210147125.0,32610514.0,74591,672429,396,31039055.0,899030.0,
5,Brasil,,,76,,,,2022-07-06,27,210147125.0,32687680.0,77166,672790,361,31077538.0,937352.0,
6,Brasil,,,76,,,,2022-07-07,27,210147125.0,32759730.0,72050,673073,283,31094805.0,991852.0,
7,Brasil,,,76,,,,2022-07-08,27,210147125.0,32830844.0,71114,673339,266,31119463.0,1038042.0,
8,Brasil,,,76,,,,2022-07-09,27,210147125.0,32874501.0,43657,673554,215,31142535.0,1058412.0,
9,Brasil,,,76,,,,2022-07-10,28,210147125.0,32896464.0,21963,673610,56,31181066.0,1041788.0,


Long format is efficient, but some operations may need a wide format:

In [4]:
# Pivot the long table into a wide one: one row per state, one column per
# epidemiological week, each cell holding the total of new cases.
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# recent pandas versions emit a FutureWarning when the builtin is passed.
covidSemanaW=pd.pivot_table(covid,
                            values='casosNovos',
                            index=['estado'],
                            columns=['semanaEpi'],# long to wide
                            aggfunc='sum')

covidSemanaW

  covidSemanaW=pd.pivot_table(covid,


semanaEpi,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,52
estado,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AC,34,2464,7796,8592,16090,9112,11584,8694,3898,1704,...,2,272,130,152,498,2806,6000,9924,5248,4
AL,1322,4604,8426,15738,22720,24524,15470,4150,5358,1868,...,760,902,926,1824,4024,6872,10260,6532,2488,22
AM,3068,22592,84164,75116,42402,28960,13250,8750,3300,6890,...,234,174,316,454,1498,4122,8542,11454,5836,74
AP,934,2638,13184,30084,11656,5640,1586,408,114,140,...,10,54,52,90,510,1674,5622,11514,5174,64
BA,11606,20504,50574,86258,101036,92694,64332,33138,15742,18264,...,3460,2822,9196,9000,14828,35820,52106,45396,15360,342
CE,7928,15012,43266,173844,206146,44208,29264,18342,9118,6574,...,1124,1440,5366,5532,10160,24080,19974,117574,12950,1420
DF,10210,32672,40410,67600,71952,50346,29998,18872,8512,5668,...,9196,24588,45340,50960,50380,32246,18300,14884,4716,0
ES,9390,53466,120972,207142,186268,117760,56550,30642,12350,9176,...,4450,7884,16232,24248,45596,57746,55014,45894,18942,108
GO,10924,31460,59364,70996,66762,73728,69236,53928,34468,59294,...,25122,36514,49192,51610,73228,78896,70398,63456,18024,406
MA,2466,4754,7614,15546,14406,14856,13076,11534,4458,6582,...,1800,2620,2552,1400,2054,2998,4588,11122,6516,98


Notice the column names:

In [5]:
# Inspect the column labels of the wide table (the Index also carries a name).
covidSemanaW.columns

Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 52],
      dtype='int64', name='semanaEpi')

Pandas gave the column axis a name (_semanaEpi_), taken from the original column that was pivoted. We can remove it and turn _estado_ back into a regular column:

In [6]:
# Preview only: nothing is assigned, so covidSemanaW itself is unchanged.
(
    covidSemanaW
    .reset_index()                          # 'estado' becomes a regular column
    .rename_axis(index=None, columns=None)  # remove the names of both axes
)

Unnamed: 0,estado,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,52
0,AC,34,2464,7796,8592,16090,9112,11584,8694,3898,...,2,272,130,152,498,2806,6000,9924,5248,4
1,AL,1322,4604,8426,15738,22720,24524,15470,4150,5358,...,760,902,926,1824,4024,6872,10260,6532,2488,22
2,AM,3068,22592,84164,75116,42402,28960,13250,8750,3300,...,234,174,316,454,1498,4122,8542,11454,5836,74
3,AP,934,2638,13184,30084,11656,5640,1586,408,114,...,10,54,52,90,510,1674,5622,11514,5174,64
4,BA,11606,20504,50574,86258,101036,92694,64332,33138,15742,...,3460,2822,9196,9000,14828,35820,52106,45396,15360,342
5,CE,7928,15012,43266,173844,206146,44208,29264,18342,9118,...,1124,1440,5366,5532,10160,24080,19974,117574,12950,1420
6,DF,10210,32672,40410,67600,71952,50346,29998,18872,8512,...,9196,24588,45340,50960,50380,32246,18300,14884,4716,0
7,ES,9390,53466,120972,207142,186268,117760,56550,30642,12350,...,4450,7884,16232,24248,45596,57746,55014,45894,18942,108
8,GO,10924,31460,59364,70996,66762,73728,69236,53928,34468,...,25122,36514,49192,51610,73228,78896,70398,63456,18024,406
9,MA,2466,4754,7614,15546,14406,14856,13076,11534,4458,...,1800,2620,2552,1400,2054,2998,4588,11122,6516,98


We could save this, dropping the last column:

In [7]:
# Flatten the pivot table only if that has not been done yet, so the cell can
# be re-run safely: 'estado' becomes a regular column and the 'semanaEpi'
# label on the column axis is removed.
if 'estado' not in covidSemanaW.columns:
    covidSemanaW=covidSemanaW.reset_index().rename_axis(index=None, columns=None)

# Drop the week-52 column without mutating in place; errors='ignore' keeps a
# second run from raising KeyError once the column is already gone.
covidSemanaW=covidSemanaW.drop(columns=[52],errors='ignore')

# Save the wide table; index=False leaves the row numbers out of the file.
covidSemanaW.to_csv(os.path.join('FilesToAggregate','covidSemanaW.csv'),index=False)

We should be able to transform this wide version into a long one:

In [8]:
# Wide -> long, one row per (state, week) pair.
covidSemanaL=(
    covidSemanaW
    .set_index('estado')   # states become the row labels
    .stack()               # the week columns are folded into rows
    .reset_index()         # row labels go back to ordinary columns
)
covidSemanaL

Unnamed: 0,estado,level_1,0
0,AC,1,34
1,AC,2,2464
2,AC,3,7796
3,AC,4,8592
4,AC,5,16090
...,...,...,...
778,TO,25,5048
779,TO,26,20306
780,TO,27,12422
781,TO,28,9914


In [10]:
# Give the stacked columns meaningful names, then save the LONG table.
# rename() returns a new frame, so the result is assigned back instead of
# relying on inplace=True.
covidSemanaL=covidSemanaL.rename(columns={'level_1':'semanaEpi',0:'cases'})
# Write covidSemanaL (not the wide covidSemanaW) so that covidSemanaL.csv
# really contains the long-format data.
covidSemanaL.to_csv(os.path.join('FilesToAggregate','covidSemanaL.csv'),index=False)

Let's make it a little more complex:

In [11]:
# Wide table with two measures (new cases and new deaths) per week and a
# two-level row index (region, state).
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# recent pandas versions emit a FutureWarning when the builtin is passed.
covidSemanaW2=pd.pivot_table(covid,
                            values=['casosNovos','obitosNovos'],
                            index=['regiao','estado'],
                            columns=['semanaEpi'],
                            aggfunc='sum')

covidSemanaW2

  covidSemanaW2=pd.pivot_table(covid,


Unnamed: 0_level_0,Unnamed: 1_level_0,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,...,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos
Unnamed: 0_level_1,semanaEpi,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,52
regiao,estado,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Centro-Oeste,DF,10210,32672,40410,67600,71952,50346,29998,18872,8512,5668,...,12,4,14,18,76,40,48,26,16,0
Centro-Oeste,GO,10924,31460,59364,70996,66762,73728,69236,53928,34468,59294,...,98,28,90,98,136,224,148,116,56,34
Centro-Oeste,MS,6358,15446,16836,38878,42042,37684,44436,34166,17100,14910,...,30,8,12,56,14,32,38,36,36,0
Centro-Oeste,MT,10264,24628,36222,54572,54968,46384,36546,27832,12688,10802,...,10,18,22,24,56,50,78,82,62,0
Nordeste,AL,1322,4604,8426,15738,22720,24524,15470,4150,5358,1868,...,0,0,0,6,10,28,50,60,42,0
Nordeste,BA,11606,20504,50574,86258,101036,92694,64332,33138,15742,18264,...,18,32,28,50,60,68,64,202,94,2
Nordeste,CE,7928,15012,43266,173844,206146,44208,29264,18342,9118,6574,...,86,76,54,16,50,96,176,78,40,0
Nordeste,MA,2466,4754,7614,15546,14406,14856,13076,11534,4458,6582,...,2,2,2,0,8,4,20,22,8,0
Nordeste,PB,1724,5610,17822,34018,45744,45174,39628,28678,10370,15074,...,6,0,10,2,36,36,42,50,34,2
Nordeste,PE,5938,11820,24892,55764,66848,62060,69096,68468,39620,33348,...,54,58,58,68,72,70,78,90,48,14


Now the rows have a _MultiIndex_:

In [12]:
# Inspect the row index that results from pivoting on two index columns.
covidSemanaW2.index

MultiIndex([('Centro-Oeste', 'DF'),
            ('Centro-Oeste', 'GO'),
            ('Centro-Oeste', 'MS'),
            ('Centro-Oeste', 'MT'),
            (    'Nordeste', 'AL'),
            (    'Nordeste', 'BA'),
            (    'Nordeste', 'CE'),
            (    'Nordeste', 'MA'),
            (    'Nordeste', 'PB'),
            (    'Nordeste', 'PE'),
            (    'Nordeste', 'PI'),
            (    'Nordeste', 'RN'),
            (    'Nordeste', 'SE'),
            (       'Norte', 'AC'),
            (       'Norte', 'AM'),
            (       'Norte', 'AP'),
            (       'Norte', 'PA'),
            (       'Norte', 'RO'),
            (       'Norte', 'RR'),
            (       'Norte', 'TO'),
            (     'Sudeste', 'ES'),
            (     'Sudeste', 'MG'),
            (     'Sudeste', 'RJ'),
            (     'Sudeste', 'SP'),
            (         'Sul', 'PR'),
            (         'Sul', 'RS'),
            (         'Sul', 'SC')],
           names=['regiao',

This works well:

In [13]:
# Preview only: move both row-index levels (regiao, estado) into ordinary
# columns. The two-level column labels are kept as they are.
covidSemanaW2.reset_index()

Unnamed: 0_level_0,regiao,estado,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,casosNovos,...,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos,obitosNovos
semanaEpi,Unnamed: 1_level_1,Unnamed: 2_level_1,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,52
0,Centro-Oeste,DF,10210,32672,40410,67600,71952,50346,29998,18872,...,12,4,14,18,76,40,48,26,16,0
1,Centro-Oeste,GO,10924,31460,59364,70996,66762,73728,69236,53928,...,98,28,90,98,136,224,148,116,56,34
2,Centro-Oeste,MS,6358,15446,16836,38878,42042,37684,44436,34166,...,30,8,12,56,14,32,38,36,36,0
3,Centro-Oeste,MT,10264,24628,36222,54572,54968,46384,36546,27832,...,10,18,22,24,56,50,78,82,62,0
4,Nordeste,AL,1322,4604,8426,15738,22720,24524,15470,4150,...,0,0,0,6,10,28,50,60,42,0
5,Nordeste,BA,11606,20504,50574,86258,101036,92694,64332,33138,...,18,32,28,50,60,68,64,202,94,2
6,Nordeste,CE,7928,15012,43266,173844,206146,44208,29264,18342,...,86,76,54,16,50,96,176,78,40,0
7,Nordeste,MA,2466,4754,7614,15546,14406,14856,13076,11534,...,2,2,2,0,8,4,20,22,8,0
8,Nordeste,PB,1724,5610,17822,34018,45744,45174,39628,28678,...,6,0,10,2,36,36,42,50,34,2
9,Nordeste,PE,5938,11820,24892,55764,66848,62060,69096,68468,...,54,58,58,68,72,70,78,90,48,14


The problem is the column names, which are now a _MultiIndex_ too:

In [14]:
# Inspect the column labels: each one is a (measure, week) pair.
covidSemanaW2.columns

MultiIndex([( 'casosNovos',  1),
            ( 'casosNovos',  2),
            ( 'casosNovos',  3),
            ( 'casosNovos',  4),
            ( 'casosNovos',  5),
            ( 'casosNovos',  6),
            ( 'casosNovos',  7),
            ( 'casosNovos',  8),
            ( 'casosNovos',  9),
            ( 'casosNovos', 10),
            ( 'casosNovos', 11),
            ( 'casosNovos', 12),
            ( 'casosNovos', 13),
            ( 'casosNovos', 14),
            ( 'casosNovos', 15),
            ( 'casosNovos', 16),
            ( 'casosNovos', 17),
            ( 'casosNovos', 18),
            ( 'casosNovos', 19),
            ( 'casosNovos', 20),
            ( 'casosNovos', 21),
            ( 'casosNovos', 22),
            ( 'casosNovos', 23),
            ( 'casosNovos', 24),
            ( 'casosNovos', 25),
            ( 'casosNovos', 26),
            ( 'casosNovos', 27),
            ( 'casosNovos', 28),
            ( 'casosNovos', 29),
            ( 'casosNovos', 52),
          

Notice that, before making any change, you can easily convert this into a Long format:

In [15]:
# With no arguments, stack() moves the innermost column level (semanaEpi)
# into the row index, leaving one column per measure.
covidSemanaW2.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,casosNovos,obitosNovos
regiao,estado,semanaEpi,Unnamed: 3_level_1,Unnamed: 4_level_1
Centro-Oeste,DF,1,10210,22
Centro-Oeste,DF,2,32672,14
Centro-Oeste,DF,3,40410,26
Centro-Oeste,DF,4,67600,50
Centro-Oeste,DF,5,71952,106
...,...,...,...,...
Sul,SC,26,25198,132
Sul,SC,27,23470,90
Sul,SC,28,24424,106
Sul,SC,29,8920,56


And, more interesting:

In [16]:
# Stacking both column levels (0 = measure, 1 = week) yields a Series whose
# index is (regiao, estado, measure, semanaEpi).
covidSemanaW2.stack([0,1])

regiao        estado               semanaEpi
Centro-Oeste  DF      casosNovos   1            10210
                                   2            32672
                                   3            40410
                                   4            67600
                                   5            71952
                                                ...  
Sul           SC      obitosNovos  26             132
                                   27              90
                                   28             106
                                   29              56
                                   52               4
Length: 1620, dtype: int64

In [17]:
# Stack both column levels, then turn every index level into a regular
# column. The measure level has no name, so it shows up as 'level_2', and the
# values column is labelled 0.
covidSemanaW2.stack([0,1]).reset_index()

Unnamed: 0,regiao,estado,level_2,semanaEpi,0
0,Centro-Oeste,DF,casosNovos,1,10210
1,Centro-Oeste,DF,casosNovos,2,32672
2,Centro-Oeste,DF,casosNovos,3,40410
3,Centro-Oeste,DF,casosNovos,4,67600
4,Centro-Oeste,DF,casosNovos,5,71952
...,...,...,...,...,...
1615,Sul,SC,obitosNovos,26,132
1616,Sul,SC,obitosNovos,27,90
1617,Sul,SC,obitosNovos,28,106
1618,Sul,SC,obitosNovos,29,56


In [18]:
# Long version of the two-measure table with descriptive column names:
# 'level_2' holds the measure name and 0 holds the stacked values.
covidSemanaW2_L=(
    covidSemanaW2
    .stack([0,1])
    .reset_index()
    .rename(columns={'level_2':'measure',0:'counts'})
)

In [20]:
# then save the long table, leaving the row numbers out of the file
covidSemanaW2_L.to_csv(
    os.path.join('FilesToAggregate','covidSemanaW2_L.csv'),
    index=False,
)

But, if you decided to alter this:

In [21]:
# The two-level (measure, week) column labels that would be altered.
covidSemanaW2.columns

MultiIndex([( 'casosNovos',  1),
            ( 'casosNovos',  2),
            ( 'casosNovos',  3),
            ( 'casosNovos',  4),
            ( 'casosNovos',  5),
            ( 'casosNovos',  6),
            ( 'casosNovos',  7),
            ( 'casosNovos',  8),
            ( 'casosNovos',  9),
            ( 'casosNovos', 10),
            ( 'casosNovos', 11),
            ( 'casosNovos', 12),
            ( 'casosNovos', 13),
            ( 'casosNovos', 14),
            ( 'casosNovos', 15),
            ( 'casosNovos', 16),
            ( 'casosNovos', 17),
            ( 'casosNovos', 18),
            ( 'casosNovos', 19),
            ( 'casosNovos', 20),
            ( 'casosNovos', 21),
            ( 'casosNovos', 22),
            ( 'casosNovos', 23),
            ( 'casosNovos', 24),
            ( 'casosNovos', 25),
            ( 'casosNovos', 26),
            ( 'casosNovos', 27),
            ( 'casosNovos', 28),
            ( 'casosNovos', 29),
            ( 'casosNovos', 52),
          

In [25]:
# with something like this, which glues the two levels of every column
# label together, e.g. ('casosNovos', 1) -> 'casosNovos_1'

[f"{measure}_{week}" for measure, week in covidSemanaW2.columns]

['casosNovos_1',
 'casosNovos_2',
 'casosNovos_3',
 'casosNovos_4',
 'casosNovos_5',
 'casosNovos_6',
 'casosNovos_7',
 'casosNovos_8',
 'casosNovos_9',
 'casosNovos_10',
 'casosNovos_11',
 'casosNovos_12',
 'casosNovos_13',
 'casosNovos_14',
 'casosNovos_15',
 'casosNovos_16',
 'casosNovos_17',
 'casosNovos_18',
 'casosNovos_19',
 'casosNovos_20',
 'casosNovos_21',
 'casosNovos_22',
 'casosNovos_23',
 'casosNovos_24',
 'casosNovos_25',
 'casosNovos_26',
 'casosNovos_27',
 'casosNovos_28',
 'casosNovos_29',
 'casosNovos_52',
 'obitosNovos_1',
 'obitosNovos_2',
 'obitosNovos_3',
 'obitosNovos_4',
 'obitosNovos_5',
 'obitosNovos_6',
 'obitosNovos_7',
 'obitosNovos_8',
 'obitosNovos_9',
 'obitosNovos_10',
 'obitosNovos_11',
 'obitosNovos_12',
 'obitosNovos_13',
 'obitosNovos_14',
 'obitosNovos_15',
 'obitosNovos_16',
 'obitosNovos_17',
 'obitosNovos_18',
 'obitosNovos_19',
 'obitosNovos_20',
 'obitosNovos_21',
 'obitosNovos_22',
 'obitosNovos_23',
 'obitosNovos_24',
 'obitosNovos_25',
 'o

In [26]:
# Flatten the two-level column labels into single strings such as
# 'casosNovos_1'. The guard makes the cell safe to re-run: once the columns
# are plain strings, iterating over them again would index into the
# characters of each name and silently produce wrong labels.
if isinstance(covidSemanaW2.columns, pd.MultiIndex):
    NewNames=["_".join(map(str,levels)) for levels in covidSemanaW2.columns]
    covidSemanaW2.columns=NewNames

#now you have
covidSemanaW2

Unnamed: 0_level_0,Unnamed: 1_level_0,casosNovos_1,casosNovos_2,casosNovos_3,casosNovos_4,casosNovos_5,casosNovos_6,casosNovos_7,casosNovos_8,casosNovos_9,casosNovos_10,...,obitosNovos_21,obitosNovos_22,obitosNovos_23,obitosNovos_24,obitosNovos_25,obitosNovos_26,obitosNovos_27,obitosNovos_28,obitosNovos_29,obitosNovos_52
regiao,estado,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Centro-Oeste,DF,10210,32672,40410,67600,71952,50346,29998,18872,8512,5668,...,12,4,14,18,76,40,48,26,16,0
Centro-Oeste,GO,10924,31460,59364,70996,66762,73728,69236,53928,34468,59294,...,98,28,90,98,136,224,148,116,56,34
Centro-Oeste,MS,6358,15446,16836,38878,42042,37684,44436,34166,17100,14910,...,30,8,12,56,14,32,38,36,36,0
Centro-Oeste,MT,10264,24628,36222,54572,54968,46384,36546,27832,12688,10802,...,10,18,22,24,56,50,78,82,62,0
Nordeste,AL,1322,4604,8426,15738,22720,24524,15470,4150,5358,1868,...,0,0,0,6,10,28,50,60,42,0
Nordeste,BA,11606,20504,50574,86258,101036,92694,64332,33138,15742,18264,...,18,32,28,50,60,68,64,202,94,2
Nordeste,CE,7928,15012,43266,173844,206146,44208,29264,18342,9118,6574,...,86,76,54,16,50,96,176,78,40,0
Nordeste,MA,2466,4754,7614,15546,14406,14856,13076,11534,4458,6582,...,2,2,2,0,8,4,20,22,8,0
Nordeste,PB,1724,5610,17822,34018,45744,45174,39628,28678,10370,15074,...,6,0,10,2,36,36,42,50,34,2
Nordeste,PE,5938,11820,24892,55764,66848,62060,69096,68468,39620,33348,...,54,58,58,68,72,70,78,90,48,14


If you start with something like this, you could recover the MultiIndex (note that the week labels come back as strings):

In [27]:
# Split every flat name at '_' and rebuild the two-level labels from the
# pieces. The pieces are strings, so the week level is text here, not int.
pd.MultiIndex.from_tuples(
    [tuple(flat_name.split('_')) for flat_name in covidSemanaW2.columns]
)

MultiIndex([( 'casosNovos',  '1'),
            ( 'casosNovos',  '2'),
            ( 'casosNovos',  '3'),
            ( 'casosNovos',  '4'),
            ( 'casosNovos',  '5'),
            ( 'casosNovos',  '6'),
            ( 'casosNovos',  '7'),
            ( 'casosNovos',  '8'),
            ( 'casosNovos',  '9'),
            ( 'casosNovos', '10'),
            ( 'casosNovos', '11'),
            ( 'casosNovos', '12'),
            ( 'casosNovos', '13'),
            ( 'casosNovos', '14'),
            ( 'casosNovos', '15'),
            ( 'casosNovos', '16'),
            ( 'casosNovos', '17'),
            ( 'casosNovos', '18'),
            ( 'casosNovos', '19'),
            ( 'casosNovos', '20'),
            ( 'casosNovos', '21'),
            ( 'casosNovos', '22'),
            ( 'casosNovos', '23'),
            ( 'casosNovos', '24'),
            ( 'casosNovos', '25'),
            ( 'casosNovos', '26'),
            ( 'casosNovos', '27'),
            ( 'casosNovos', '28'),
            ( 'casos