In [23]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
import tabula

# Importing the public resources list

In [24]:
# Load the public resources catalogue; later cells rely on its
# resource_name, resource_link and file_type columns.
df_public_resources = pd.read_csv(filepath_or_buffer="public_resources_list.csv")

In [25]:
# Keep only the CSV resources whose name mentions the weekday-average
# station entries report.
is_csv_resource = df_public_resources["file_type"] == "csv"
is_station_entries = df_public_resources["resource_name"].str.contains(
    "Entrada de Passageiros por Estação - Média Dias Úteis"
)
links_pax_entries_stations = df_public_resources[is_csv_resource & is_station_entries]
links_pax_entries_stations

Unnamed: 0,resource_name,resource_link,file_type,mod_date
3,Entrada de Passageiros por Estação - Média Dia...,https://transparencia.metrosp.com.br/sites/def...,csv,2023-05-09
10,Entrada de Passageiros por Estação - Média Dia...,https://transparencia.metrosp.com.br/sites/def...,csv,2023-05-09
17,Entrada de Passageiros por Estação - Média Dia...,https://transparencia.metrosp.com.br/sites/def...,csv,2023-05-09


In [26]:
# Show the resource names in full (the DataFrame preview truncates them).
links_pax_entries_stations["resource_name"].tolist()

['Entrada de Passageiros por Estação - Média Dias Úteis - 2023',
 'Entrada de Passageiros por Estação - Média Dias Úteis - 2022',
 'Entrada de Passageiros por Estação - Média Dias Úteis - 2021']

In [27]:
# Portuguese three-letter month abbreviations mapped to their month number (1-12).
month_map = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ['jan', 'fev', 'mar', 'abr', 'mai', 'jun',
         'jul', 'ago', 'set', 'out', 'nov', 'dez'],
        start=1,
    )
}

# Complete PES wrangling for L01 - Blue

In [28]:
def l01_pes(url, year, skiprows=5, skipfooter=166):
    """Read the Line 1 (Blue) station table from a yearly CSV and return it in long format.

    The file is read as a ';'-separated, latin-1 encoded CSV. Only the first
    13 columns are used: "Estação" plus one column per month. Every month
    column is unpivoted into its own rows.

    Parameters
    ----------
    url : str
        Path or URL handed to ``pandas.read_csv``.
    year : str or int
        Four-digit year the file refers to; combined with each month header
        to build the ``month`` timestamp.
    skiprows : int, default 5
        Number of lines to skip before the header row of the Line 1 table.
    skipfooter : int, default 166
        Number of lines to drop after the Line 1 table. When ``year`` is 2022
        this value is reduced by 98 (presumably that file has a shorter
        trailer -- verify against the source file).

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (first day of the month), ``line`` (always 1),
        ``station`` and ``dpea`` (the values found under the month columns).

    Raises
    ------
    KeyError
        If a cleaned month header is not a key of ``month_map``.
    """
    year = str(year)  # accept 2022 as well as "2022"
    footer_adjust = -98 if year == "2022" else 0

    wide = pd.read_csv(
        url,
        encoding="latin-1",
        sep=";",
        skiprows=skiprows,
        skipfooter=skipfooter + footer_adjust,
        usecols=range(0, 13),
        engine="python",  # skipfooter is not supported by the C parser
    )

    # One row per (station, month header) pair.
    tidy = wide.melt(id_vars=["Estação"], var_name="month", value_name="dpea")

    # Resolve each distinct header once instead of once per row. Headers may
    # carry a "*" marker and stray whitespace, e.g. "Fev*" or " Mar ".
    month_start = {
        header: pd.Timestamp(
            year=int(year),
            month=month_map[header.replace("*", "").strip().lower()],
            day=1,
        )
        for header in tidy["month"].unique()
    }
    tidy["month"] = tidy["month"].map(month_start)

    tidy = tidy.rename(columns={"Estação": "station"})
    # Strip footnote markers from station names.
    # NOTE(review): the class also removes every plain digit "2" -- confirm
    # that no station name legitimately contains one.
    tidy["station"] = tidy["station"].str.replace(r'[¹²2]', '', regex=True)
    tidy["line"] = 1

    return tidy.reindex(columns=["month", "line", "station", "dpea"])

In [29]:
# One tidy Line 1 frame per yearly resource; the year is the text after the
# last " - " in the resource name.
results_l01_pes = [
    l01_pes(link, name.split(" - ")[-1])
    for name, link in zip(
        links_pax_entries_stations["resource_name"],
        links_pax_entries_stations["resource_link"],
    )
]
    

In [30]:
# Stack the yearly Line 1 frames under a fresh index, then order the rows by month.
l01_pes_stacked = pd.concat(results_l01_pes, ignore_index=True)
l01_pes_complete = l01_pes_stacked.sort_values(by="month")
l01_pes_complete

Unnamed: 0,month,line,station,dpea
573,2021-01-01,1,Parada Inglesa,7.0
572,2021-01-01,1,Jardim São Paulo-Ayrton Senna,6.0
571,2021-01-01,1,Santana,33.0
570,2021-01-01,1,Carandiru,6.0
569,2021-01-01,1,Portuguesa-Tietê,31.0
...,...,...,...,...
255,2023-12-01,1,São Judas,
254,2023-12-01,1,Conceição,
253,2023-12-01,1,Jabaquara,
265,2023-12-01,1,Sé,


In [31]:
# Persist the combined Line 1 table. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
Path("final_datasets").mkdir(parents=True, exist_ok=True)
l01_pes_complete.to_csv("final_datasets/l01_pes_complete.csv", index=False)

# Complete PES wrangling for L02 - Green

In [32]:
def l02_pes(url, year, skiprows=35, skipfooter=145):
    """Read the Line 2 (Green) station table from a yearly CSV and return it in long format.

    The file is read as a ';'-separated, latin-1 encoded CSV. Only the first
    13 columns are used: "Estação" plus one column per month. Every month
    column is unpivoted into its own rows.

    Parameters
    ----------
    url : str
        Path or URL handed to ``pandas.read_csv``.
    year : str or int
        Four-digit year the file refers to; combined with each month header
        to build the ``month`` timestamp.
    skiprows : int, default 35
        Number of lines to skip before the header row of the Line 2 table.
    skipfooter : int, default 145
        Number of lines to drop after the Line 2 table. When ``year`` is 2022
        this value is reduced by 98 (presumably that file has a shorter
        trailer -- verify against the source file).

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (first day of the month), ``line`` (always 2),
        ``station`` and ``dpea`` (the values found under the month columns).

    Raises
    ------
    KeyError
        If a cleaned month header is not a key of ``month_map``.
    """
    year = str(year)  # accept 2022 as well as "2022"
    footer_adjust = -98 if year == "2022" else 0

    wide = pd.read_csv(
        url,
        encoding="latin-1",
        sep=";",
        skiprows=skiprows,
        skipfooter=skipfooter + footer_adjust,
        usecols=range(0, 13),
        engine="python",  # skipfooter is not supported by the C parser
    )

    # One row per (station, month header) pair.
    tidy = wide.melt(id_vars=["Estação"], var_name="month", value_name="dpea")

    # Resolve each distinct header once instead of once per row. Headers may
    # carry a "*" marker and stray whitespace, e.g. "Fev*" or " Mar ".
    month_start = {
        header: pd.Timestamp(
            year=int(year),
            month=month_map[header.replace("*", "").strip().lower()],
            day=1,
        )
        for header in tidy["month"].unique()
    }
    tidy["month"] = tidy["month"].map(month_start)

    tidy = tidy.rename(columns={"Estação": "station"})
    # Remove the ¹ and ² footnote markers from station names.
    tidy["station"] = tidy["station"].str.replace(r'[¹²]', '', regex=True)
    tidy["line"] = 2

    return tidy.reindex(columns=["month", "line", "station", "dpea"])

In [33]:
# One tidy Line 2 frame per yearly resource; the year is the text after the
# last " - " in the resource name.
results_l02_pes = [
    l02_pes(link, name.split(" - ")[-1])
    for name, link in zip(
        links_pax_entries_stations["resource_name"],
        links_pax_entries_stations["resource_link"],
    )
]

In [34]:
# Stack the yearly Line 2 frames under a fresh index, then order the rows by month.
l02_pes_stacked = pd.concat(results_l02_pes, ignore_index=True)
l02_pes_complete = l02_pes_stacked.sort_values(by="month")
l02_pes_complete

Unnamed: 0,month,line,station,dpea
348,2021-01-01,2,Santuário N.S. de Fátima-Sumaré,5.0
347,2021-01-01,2,Clínicas,13.0
346,2021-01-01,2,Consolação,49.0
345,2021-01-01,2,Trianon-Masp,21.0
344,2021-01-01,2,Brigadeiro,25.0
...,...,...,...,...
156,2023-12-01,2,Sacomã,
155,2023-12-01,2,Tamanduateí,
154,2023-12-01,2,Vila Prudente,
160,2023-12-01,2,Ana Rosa,


In [35]:
# Persist the combined Line 2 table. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
Path("final_datasets").mkdir(parents=True, exist_ok=True)
l02_pes_complete.to_csv("final_datasets/l02_pes_complete.csv", index=False)

# Complete PES wrangling for L03 - Red

In [36]:
def l03_pes(url, year, skiprows=56, skipfooter=120):
    """Read the Line 3 (Red) station table from a yearly CSV and return it in long format.

    The file is read as a ';'-separated, latin-1 encoded CSV. Only the first
    13 columns are used: "Estação" plus one column per month. Every month
    column is unpivoted into its own rows.

    Parameters
    ----------
    url : str
        Path or URL handed to ``pandas.read_csv``.
    year : str or int
        Four-digit year the file refers to; combined with each month header
        to build the ``month`` timestamp.
    skiprows : int, default 56
        Number of lines to skip before the header row of the Line 3 table.
    skipfooter : int, default 120
        Number of lines to drop after the Line 3 table. When ``year`` is 2022
        this value is reduced by 98 (presumably that file has a shorter
        trailer -- verify against the source file).

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (first day of the month), ``line`` (always 3),
        ``station`` and ``dpea`` (the values found under the month columns).

    Raises
    ------
    KeyError
        If a cleaned month header is not a key of ``month_map``.
    """
    year = str(year)  # accept 2022 as well as "2022"
    footer_adjust = -98 if year == "2022" else 0

    wide = pd.read_csv(
        url,
        encoding="latin-1",
        sep=";",
        skiprows=skiprows,
        skipfooter=skipfooter + footer_adjust,
        usecols=range(0, 13),
        engine="python",  # skipfooter is not supported by the C parser
    )

    # One row per (station, month header) pair.
    tidy = wide.melt(id_vars=["Estação"], var_name="month", value_name="dpea")

    # Resolve each distinct header once instead of once per row. Headers may
    # carry a "*" marker and stray whitespace, e.g. "Fev*" or " Mar ".
    month_start = {
        header: pd.Timestamp(
            year=int(year),
            month=month_map[header.replace("*", "").strip().lower()],
            day=1,
        )
        for header in tidy["month"].unique()
    }
    tidy["month"] = tidy["month"].map(month_start)

    tidy = tidy.rename(columns={"Estação": "station"})
    # Remove the ¹ and ² footnote markers from station names.
    tidy["station"] = tidy["station"].str.replace(r'[¹²]', '', regex=True)
    tidy["line"] = 3

    return tidy.reindex(columns=["month", "line", "station", "dpea"])

In [37]:
# One tidy Line 3 frame per yearly resource; the year is the text after the
# last " - " in the resource name.
results_l03_pes = [
    l03_pes(link, name.split(" - ")[-1])
    for name, link in zip(
        links_pax_entries_stations["resource_name"],
        links_pax_entries_stations["resource_link"],
    )
]

In [38]:
# Stack the yearly Line 3 frames under a fresh index, then order the rows by month.
l03_pes_stacked = pd.concat(results_l03_pes, ignore_index=True)
l03_pes_complete = l03_pes_stacked.sort_values(by="month")
l03_pes_complete

Unnamed: 0,month,line,station,dpea
448,2021-01-01,3,Marechal Deodoro,18.0
447,2021-01-01,3,Santa Cecília,16.0
446,2021-01-01,3,República,72.0
445,2021-01-01,3,Anhangabaú,32.0
444,2021-01-01,3,Sé,123.0
...,...,...,...,...
200,2023-12-01,3,Patriarca,
199,2023-12-01,3,Artur Alvim,
198,2023-12-01,3,Corinthians-Itaquera,
207,2023-12-01,3,Bresser-Moóca,


In [39]:
# Persist the combined Line 3 table. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
Path("final_datasets").mkdir(parents=True, exist_ok=True)
l03_pes_complete.to_csv("final_datasets/l03_pes_complete.csv", index=False)

# Complete PES wrangling for L15 - Silver

In [40]:
def l15_pes(url, year, skiprows=80, skipfooter=103):
    """Read the Line 15 (Silver) station table from a yearly CSV and return it in long format.

    The file is read as a ';'-separated, latin-1 encoded CSV. Only the first
    13 columns are used: "Estação" plus one column per month. Every month
    column is unpivoted into its own rows.

    Parameters
    ----------
    url : str
        Path or URL handed to ``pandas.read_csv``.
    year : str or int
        Four-digit year the file refers to; combined with each month header
        to build the ``month`` timestamp.
    skiprows : int, default 80
        Number of lines to skip before the header row of the Line 15 table.
    skipfooter : int, default 103
        Number of lines to drop after the Line 15 table. When ``year`` is 2022
        this value is reduced by 98 (presumably that file has a shorter
        trailer -- verify against the source file).

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (first day of the month), ``line`` (always 15),
        ``station`` and ``dpea`` (the values found under the month columns).

    Raises
    ------
    KeyError
        If a cleaned month header is not a key of ``month_map``.
    """
    year = str(year)  # accept 2022 as well as "2022"
    footer_adjust = -98 if year == "2022" else 0

    wide = pd.read_csv(
        url,
        encoding="latin-1",
        sep=";",
        skiprows=skiprows,
        skipfooter=skipfooter + footer_adjust,
        usecols=range(0, 13),
        engine="python",  # skipfooter is not supported by the C parser
    )

    # One row per (station, month header) pair.
    tidy = wide.melt(id_vars=["Estação"], var_name="month", value_name="dpea")

    # Resolve each distinct header once instead of once per row. Headers may
    # carry a "*" marker and stray whitespace, e.g. "Fev*" or " Mar ".
    month_start = {
        header: pd.Timestamp(
            year=int(year),
            month=month_map[header.replace("*", "").strip().lower()],
            day=1,
        )
        for header in tidy["month"].unique()
    }
    tidy["month"] = tidy["month"].map(month_start)

    tidy = tidy.rename(columns={"Estação": "station"})
    # Remove the ¹ and ² footnote markers from station names.
    tidy["station"] = tidy["station"].str.replace(r'[¹²]', '', regex=True)
    tidy["line"] = 15

    return tidy.reindex(columns=["month", "line", "station", "dpea"])

In [41]:
# One tidy Line 15 frame per yearly resource; the year is the text after the
# last " - " in the resource name.
results_l15_pes = [
    l15_pes(link, name.split(" - ")[-1])
    for name, link in zip(
        links_pax_entries_stations["resource_name"],
        links_pax_entries_stations["resource_link"],
    )
]

In [42]:
# Stack the yearly Line 15 frames under a fresh index, then order the rows by month.
l15_pes_stacked = pd.concat(results_l15_pes, ignore_index=True)
l15_pes_complete = l15_pes_stacked.sort_values(by="month")
l15_pes_complete

Unnamed: 0,month,line,station,dpea
273,2021-01-01,15,São Mateus,11
272,2021-01-01,15,Fazenda da Juta,2
271,2021-01-01,15,Sapopemba,5
270,2021-01-01,15,Jardim Planalto,2
269,2021-01-01,15,Vila União,3
...,...,...,...,...
127,2023-12-01,15,Jardim Planalto,
128,2023-12-01,15,Sapopemba,
129,2023-12-01,15,Fazenda da Juta,
130,2023-12-01,15,São Mateus,


In [43]:
# Persist the combined Line 15 table. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
Path("final_datasets").mkdir(parents=True, exist_ok=True)
l15_pes_complete.to_csv("final_datasets/l15_pes_complete.csv", index=False)