In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Zero-padded identifiers of the metro lines to process; they are used
# verbatim inside the per-line CSV file names.
line_numbers = [f'{number:02d}' for number in (1, 2, 3, 4, 5, 15)]

In [3]:
# Check the size and range of the time series in each metro line file:
# for every line, print the number of distinct months followed by the first
# and last month as they appear in the file.
date_dict = {}
for line in line_numbers:
    df = pd.read_csv(f'final_datasets/l{line}_pes_complete.csv')
    months = pd.to_datetime(df['month'].unique())
    date_dict[line] = months
    print(f'{line},{months.size},{months[0]},{months[-1]}')

01,36,2021-01-01 00:00:00,2023-12-01 00:00:00
02,36,2021-01-01 00:00:00,2023-12-01 00:00:00
03,36,2021-01-01 00:00:00,2023-12-01 00:00:00
04,28,2021-01-01 00:00:00,2023-04-01 00:00:00
05,28,2021-01-01 00:00:00,2023-04-01 00:00:00
15,36,2021-01-01 00:00:00,2023-12-01 00:00:00


The files for lines 1, 2, 3, and 15 contain months beyond April 2023, but there is no valid data for those months (the values are filled with zeros).
Hence, I will restrict the data to the period from January 2021 to April 2023.

In [4]:
# Reshape each line's long-format file into a wide table (one row per month,
# one column per station, values taken from 'dpea'), save it, and then join
# all lines side by side into `new_df`.
datelimit = datetime(2023, 4, 1)  # last month kept (inclusive)

# The list is rebuilt from scratch on every execution, so re-running this cell
# never merges into a stale `new_df` left over in the kernel (the previous
# `'new_df' in globals()` check did, producing duplicated columns).
reshaped_frames = []
for line in line_numbers:
    df = pd.read_csv(f'final_datasets/l{line}_pes_complete.csv')
    df['month'] = pd.to_datetime(df['month'])
    # Keep only months up to the limit, then pivot to month x station
    df = df.loc[df['month'] <= datelimit].pivot(index='month', columns='station', values='dpea')
    # Prefix every station column with its line number as an outer level
    df.columns = pd.MultiIndex.from_product([[int(line)], df.columns])
    df.columns.names = ['Line', 'Station']
    df.to_csv(f'final_datasets/reshaped_l{line}_pes_complete.csv')
    reshaped_frames.append(df)

# Outer join on the month index, sorted chronologically (same result as the
# chained outer merges, without depending on kernel state).
new_df = pd.concat(reshaped_frames, axis=1, join='outer').sort_index()
new_df.to_csv('final_datasets/merged_reshaped_pes_complete.csv')

In [5]:
# Display the merged wide table: rows are indexed by month, columns are
# (Line, Station) pairs.
new_df

Line,1,1,1,1,1,1,1,1,1,1,...,15,15,15,15,15,15,15,15,15,15
Station,Ana Rosa,Armênia,Carandiru,Conceição,Jabaquara,Japão-Liberdade,Jardim São Paulo-Ayrton Senna,Luz,Parada Inglesa,Paraíso,...,Fazenda da Juta,Jardim Colonial,Jardim Planalto,Oratório,Sapopemba,São Lucas,São Mateus,Vila Prudente,Vila Tolstói,Vila União
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-01,40000.0,14000.0,6000.0,15000.0,46000.0,12000.0,6000.0,70000.0,7000.0,49000.0,...,2000.0,0.0,2000.0,3000.0,5000.0,3000.0,11000.0,31000.0,3000.0,3000.0
2021-02-01,43000.0,15000.0,6000.0,16000.0,48000.0,12000.0,6000.0,74000.0,7000.0,53000.0,...,2000.0,0.0,3000.0,3000.0,5000.0,3000.0,12000.0,31000.0,3000.0,4000.0
2021-03-01,35000.0,12000.0,5000.0,12000.0,40000.0,9000.0,5000.0,57000.0,6000.0,43000.0,...,2000.0,0.0,2000.0,3000.0,4000.0,2000.0,11000.0,27000.0,2000.0,3000.0
2021-04-01,33000.0,12000.0,5000.0,12000.0,39000.0,9000.0,4000.0,58000.0,5000.0,42000.0,...,2000.0,0.0,2000.0,3000.0,4000.0,2000.0,10000.0,26000.0,2000.0,3000.0
2021-05-01,39000.0,14000.0,6000.0,14000.0,45000.0,11000.0,5000.0,68000.0,6000.0,48000.0,...,2000.0,0.0,2000.0,3000.0,5000.0,3000.0,12000.0,29000.0,3000.0,3000.0
2021-06-01,41000.0,14000.0,6000.0,15000.0,46000.0,12000.0,6000.0,69000.0,7000.0,50000.0,...,2000.0,0.0,3000.0,3000.0,5000.0,3000.0,12000.0,28000.0,3000.0,3000.0
2021-07-01,42000.0,15000.0,6000.0,15000.0,48000.0,12000.0,6000.0,69000.0,7000.0,52000.0,...,2000.0,0.0,3000.0,4000.0,5000.0,3000.0,13000.0,35000.0,3000.0,4000.0
2021-08-01,47000.0,15000.0,7000.0,16000.0,51000.0,13000.0,7000.0,74000.0,8000.0,58000.0,...,3000.0,0.0,3000.0,4000.0,6000.0,3000.0,14000.0,38000.0,3000.0,4000.0
2021-09-01,51000.0,16000.0,7000.0,18000.0,55000.0,14000.0,7000.0,89000.0,8000.0,62000.0,...,3000.0,0.0,3000.0,4000.0,6000.0,4000.0,14000.0,41000.0,3000.0,4000.0
2021-10-01,53000.0,17000.0,8000.0,19000.0,57000.0,15000.0,8000.0,96000.0,9000.0,65000.0,...,3000.0,0.0,3000.0,4000.0,6000.0,4000.0,15000.0,42000.0,4000.0,4000.0


In [6]:
# List the (Line, Station) column labels of the merged table.
# NOTE(review): several station labels in the output carry trailing spaces
# (e.g. 'Armênia '); consider stripping the 'station' values before pivoting.
new_df.columns

MultiIndex([( 1,                         'Ana Rosa'),
            ( 1,                         'Armênia '),
            ( 1,                       'Carandiru '),
            ( 1,                       'Conceição '),
            ( 1,                        'Jabaquara'),
            ( 1,                 'Japão-Liberdade '),
            ( 1,    'Jardim São Paulo-Ayrton Senna'),
            ( 1,                              'Luz'),
            ( 1,                  'Parada Inglesa '),
            ( 1,                          'Paraíso'),
            ( 1,                'Portuguesa-Tietê '),
            ( 1,                 'Praça da Árvore '),
            ( 1,                       'Santa Cruz'),
            ( 1,                         'Santana '),
            ( 1,                           'Saúde '),
            ( 1,                        'São Bento'),
            ( 1,                     'São Joaquim '),
            ( 1,                       'São Judas '),
            ( 1,            