In [1]:
import pandas as pd

# Checking PES (passengers per station) data

In [2]:
df_pes = pd.read_csv("alllines_pes_complete.csv", index_col=None, parse_dates=["date"]).drop(columns=["Unnamed: 0"])
df_pes

Unnamed: 0,date,line,station,dpea
0,2021-01-01,1,Parada Inglesa,7000.0
1,2021-01-01,1,Jardim São Paulo-Ayrton Senna,6000.0
2,2021-01-01,1,Santana,33000.0
3,2021-01-01,1,Carandiru,6000.0
4,2021-01-01,1,Portuguesa-Tietê,31000.0
...,...,...,...,...
3139,2023-12-01,15,Jardim Planalto,0.0
3140,2023-12-01,15,Sapopemba,0.0
3141,2023-12-01,15,Fazenda da Juta,0.0
3142,2023-12-01,15,São Mateus,0.0


In [3]:
df_pes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3144 entries, 0 to 3143
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     3144 non-null   datetime64[ns]
 1   line     3144 non-null   int64         
 2   station  3144 non-null   object        
 3   dpea     3144 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 98.4+ KB


In [4]:
# checking n-unique dates for each line
for line in df_pes["line"].unique():
    print(f"Line {line} has {df_pes.query(f'line == {line}')['date'].nunique()} unique dates.")

Line 1 has 36 unique dates.
Line 2 has 36 unique dates.
Line 3 has 36 unique dates.
Line 4 has 28 unique dates.
Line 5 has 28 unique dates.
Line 15 has 36 unique dates.


In [5]:
# dropping all rows with datetime higher than April, 2023 (last dpea data available)
df_pes = df_pes.drop(df_pes[df_pes["date"] > "2023-04-01"].index)

In [6]:
# double checking n-unique dates for each line
for line in df_pes["line"].unique():
    print(f"Line {line} has {df_pes.query(f'line == {line}')['date'].nunique()} unique dates.")

Line 1 has 28 unique dates.
Line 2 has 28 unique dates.
Line 3 has 28 unique dates.
Line 4 has 28 unique dates.
Line 5 has 28 unique dates.
Line 15 has 28 unique dates.


In [7]:
df_pes.to_csv("../pes_complete.csv", index=False)

# Checking PTL (Passengers Transported by line) data

In [8]:
df_ptl = pd.read_csv("alllines_ptl_complete.csv", parse_dates=["year_month"])

In [9]:
df_ptl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   year_month                342 non-null    datetime64[ns]
 1   line                      342 non-null    int64         
 2   total                     342 non-null    float64       
 3   MDU (Business Days Mean)  342 non-null    float64       
 4   MSD (Saturdays Mean)      342 non-null    float64       
 5   MDO (Sundays Mean)        342 non-null    float64       
 6   MAX (Daily Max)           342 non-null    float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 18.8 KB


In [10]:
df_ptl.rename(
    columns={
        "year_month": "date", 
        "MDU (Business Days Mean)": "business_day_mean",
        "MSD (Saturdays Mean)": "saturday_mean",
        "MDO (Sundays Mean)": "sunday_mean",
        "MAX (Daily Max)": "max"
        },
        inplace=True
    )

In [11]:
# double checking n-unique dates for each line
for line in df_ptl["line"].unique():
    print(f"Line {line} has {df_ptl.query(f'line == {line}')['date'].nunique()} unique dates.")

Line 3 has 57 unique dates.
Line 1 has 57 unique dates.
Line 15 has 57 unique dates.
Line 2 has 57 unique dates.
Line 5 has 57 unique dates.
Line 4 has 57 unique dates.


In [12]:
df_ptl.to_csv("../ptl_complete.csv", index=False)

# Checking PEL (Passengers entry per line) data

In [13]:
df_pel = pd.read_csv("publiclines_pel_complete.csv", parse_dates=["date"])
df_pel

Unnamed: 0,date,line,total,business_day_mean,saturday_mean,sunday_mean,max
0,2017-10-01,1,26029000.0,1059000.0,569000.0,291000.0,1118000.0
1,2017-10-01,2,13219000.0,555000.0,233000.0,142000.0,569000.0
2,2017-10-01,3,30557000.0,1224000.0,684000.0,405000.0,1288000.0
3,2017-10-01,15,260000.0,12000.0,4000.0,1000.0,12000.0
4,2017-11-01,1,24834000.0,1077000.0,571000.0,319000.0,1116000.0
...,...,...,...,...,...,...,...
263,2023-03-01,1,20324000.0,803000.0,491000.0,265000.0,828000.0
264,2023-04-01,3,20602000.0,886000.0,501000.0,296000.0,973000.0
265,2023-04-01,2,10961000.0,492000.0,219000.0,140000.0,512000.0
266,2023-04-01,15,1653000.0,72000.0,37000.0,22000.0,74000.0


In [14]:
# double checking l04 unique dates
l04_pel = pd.read_csv("l04_pel_complete.csv", parse_dates=["date"])
l04_pel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               64 non-null     datetime64[ns]
 1   line               64 non-null     int64         
 2   total              63 non-null     float64       
 3   business_day_mean  63 non-null     float64       
 4   saturday_mean      63 non-null     float64       
 5   sunday_mean        63 non-null     float64       
 6   max                63 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 3.6 KB


In [15]:
# double checking l05 unique dates
l05_pel = pd.read_csv("l05_pel_complete.csv", parse_dates=["date"])
l05_pel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               58 non-null     datetime64[ns]
 1   line               58 non-null     int64         
 2   total              57 non-null     float64       
 3   business_day_mean  58 non-null     float64       
 4   saturday_mean      58 non-null     float64       
 5   sunday_mean        58 non-null     float64       
 6   max                58 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 3.3 KB


In [16]:
# concat public and private lines
df_pel = pd.concat([df_pel, l04_pel, l05_pel])
df_pel

Unnamed: 0,date,line,total,business_day_mean,saturday_mean,sunday_mean,max
0,2017-10-01,1,26029000.0,1059000.0,569000.0,291000.0,1118000.0
1,2017-10-01,2,13219000.0,555000.0,233000.0,142000.0,569000.0
2,2017-10-01,3,30557000.0,1224000.0,684000.0,405000.0,1288000.0
3,2017-10-01,15,260000.0,12000.0,4000.0,1000.0,12000.0
4,2017-11-01,1,24834000.0,1077000.0,571000.0,319000.0,1116000.0
...,...,...,...,...,...,...,...
53,2022-12-01,5,5960580.0,227190.0,131570.0,76140.0,282670.0
54,2023-01-01,5,5417590.0,212240.0,132160.0,65310.0,235470.0
55,2023-02-01,5,5598180.0,243260.0,143070.0,75610.0,269870.0
56,2023-03-01,5,6955860.0,262290.0,152100.0,78690.0,278550.0


In [17]:
df_pel[df_pel.isna().any(axis=1)]

Unnamed: 0,date,line,total,business_day_mean,saturday_mean,sunday_mean,max
43,2018-08-01,15,,,,,
51,2018-10-01,15,,,,,
119,2020-03-01,15,,,,,
123,2020-04-01,15,,,,,
127,2020-05-01,15,,,,,
20,2019-09-01,4,,,,,
51,2022-10-01,5,,251390.0,145170.0,84530.0,264430.0


In [18]:
# double checking n-unique dates for each line
for line in df_pel["line"].unique():
    print(f"Line {line} has {df_pel.query(f'line == {line}')['date'].nunique()} unique dates.")

Line 1 has 67 unique dates.
Line 2 has 67 unique dates.
Line 3 has 67 unique dates.
Line 15 has 67 unique dates.
Line 4 has 64 unique dates.
Line 5 has 57 unique dates.


In [27]:
df_pel.query("line == 5")["date"].to_list()

[Timestamp('2018-08-01 00:00:00'),
 Timestamp('2018-08-01 00:00:00'),
 Timestamp('2018-09-01 00:00:00'),
 Timestamp('2018-10-01 00:00:00'),
 Timestamp('2018-11-01 00:00:00'),
 Timestamp('2018-12-01 00:00:00'),
 Timestamp('2019-01-01 00:00:00'),
 Timestamp('2019-02-01 00:00:00'),
 Timestamp('2019-03-01 00:00:00'),
 Timestamp('2019-04-01 00:00:00'),
 Timestamp('2019-05-01 00:00:00'),
 Timestamp('2019-06-01 00:00:00'),
 Timestamp('2019-07-01 00:00:00'),
 Timestamp('2019-08-01 00:00:00'),
 Timestamp('2019-09-01 00:00:00'),
 Timestamp('2019-10-01 00:00:00'),
 Timestamp('2019-11-01 00:00:00'),
 Timestamp('2019-12-01 00:00:00'),
 Timestamp('2020-01-01 00:00:00'),
 Timestamp('2020-02-01 00:00:00'),
 Timestamp('2020-03-01 00:00:00'),
 Timestamp('2020-04-01 00:00:00'),
 Timestamp('2020-05-01 00:00:00'),
 Timestamp('2020-06-01 00:00:00'),
 Timestamp('2020-07-01 00:00:00'),
 Timestamp('2020-08-01 00:00:00'),
 Timestamp('2020-09-01 00:00:00'),
 Timestamp('2020-10-01 00:00:00'),
 Timestamp('2020-11-

In [28]:
df_pel.to_csv("../pel_complete.csv", index=False)