### Read all the sensor data

In [1]:
import pandas as pd

# CSV files holding the sensor data to be loaded
files = ['../Dados/2013AEDL.csv', '../Dados/1S2014AEDL.csv', '../Dados/2S2014AEDL.csv', '../Dados/1P2015AEDL.csv', '../Dados/2P2015AEDL.csv']

# Read each file into a list and concatenate them; ignore_index=True gives the
# combined frame one continuous 0..n-1 index instead of repeating per-file indices
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13657923 entries, 0 to 13657922
Data columns (total 20 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   AGGREGATE_BY_LANE_BUNDLEID  int64  
 1   AGG_ID                      int64  
 2   EQUIPMENTID                 int64  
 3   AGG_PERIOD_START            object 
 4   AGG_PERIOD_LEN_MINS         int64  
 5   NR_LANES                    int64  
 6   LANE_BUNDLE_DIRECTION       object 
 7   TOTAL_VOLUME                int64  
 8   AVG_SPEED_ARITHMETIC        float64
 9   AVG_SPEED_HARMONIC          float64
 10  AVG_LENGTH                  float64
 11  AVG_SPACING                 float64
 12  OCCUPANCY                   float64
 13  LIGHT_VEHICLE_RATE          float64
 14  VOLUME_CLASSE_A             int64  
 15  VOLUME_CLASSE_B             int64  
 16  VOLUME_CLASSE_C             int64  
 17  VOLUME_CLASSE_D             int64  
 18  VOLUME_CLASSE_0             int64  
 19  AXLE_CLASS_VOLUMES 

### Select only the data collected by the sensors we use in our model

In [2]:
# Define the list of sensors in our model (kept as strings because they are
# matched against the normalized text form of the column built below)
target_ids = ['121726','121727','121731','121732','121733','121734','121735','121736','121741','121742','121754','121755','121756']

# Filter the DataFrame: this keeps only rows where 'EQUIPMENTID' matches the sensor IDs
col = 'EQUIPMENTID'
# Build a normalized string version of the column: missing values become '',
# surrounding whitespace is stripped and a trailing '.0' / '.00' (e.g. '121726.0') is removed
clean = df[col].fillna('').astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
# Final filtered dataframe using the cleaned column; .copy() makes it an independent
# frame so later column assignments do not write back into (or warn about) df
filtered_df = df[clean.isin(target_ids)].copy()
print(f"Original rows: {len(df)}")
print(f"Filtered rows: {len(filtered_df)}")
# info() prints directly and returns None; print(filtered_df.info()) would add a stray "None"
filtered_df.info()

Original rows: 13657923
Filtered rows: 6883391
<class 'pandas.core.frame.DataFrame'>
Index: 6883391 entries, 76815 to 13657921
Data columns (total 20 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   AGGREGATE_BY_LANE_BUNDLEID  int64  
 1   AGG_ID                      int64  
 2   EQUIPMENTID                 int64  
 3   AGG_PERIOD_START            object 
 4   AGG_PERIOD_LEN_MINS         int64  
 5   NR_LANES                    int64  
 6   LANE_BUNDLE_DIRECTION       object 
 7   TOTAL_VOLUME                int64  
 8   AVG_SPEED_ARITHMETIC        float64
 9   AVG_SPEED_HARMONIC          float64
 10  AVG_LENGTH                  float64
 11  AVG_SPACING                 float64
 12  OCCUPANCY                   float64
 13  LIGHT_VEHICLE_RATE          float64
 14  VOLUME_CLASSE_A             int64  
 15  VOLUME_CLASSE_B             int64  
 16  VOLUME_CLASSE_C             int64  
 17  VOLUME_CLASSE_D             int64  
 18  VOLUME_CLASSE_0

### Convert date/time strings into datetime objects

In [4]:
# Preview the first ten values of the timestamp column
filtered_df.loc[:, 'AGG_PERIOD_START'].iloc[:10]

76815    2013-04-12 01:05:00
76816    2013-04-12 01:05:00
76817    2013-04-12 01:10:00
76818    2013-04-12 01:10:00
76819    2013-04-12 01:15:00
76820    2013-04-12 01:15:00
76821    2013-04-12 01:20:00
76822    2013-04-12 01:20:00
76823    2013-04-12 01:25:00
76824    2013-04-12 01:25:00
Name: AGG_PERIOD_START, dtype: object

In [7]:
# Inspect a single value of the column to see which Python type it holds.
# .iloc[0] selects the first row by position; a hard-coded index label such as
# 76815 only exists for this exact set of input files and raises KeyError otherwise.
a = filtered_df['AGG_PERIOD_START'].iloc[0]
print(type(a))
print(a)

<class 'str'>
2013-04-12 01:05:00


In [12]:
# Convert the column to datetime objects.
# format='mixed' lets pandas infer the format of each value individually.
# NOTE(review): the values printed in this notebook all look like 'YYYY-MM-DD HH:MM:SS';
# if every input file uses that layout, an explicit format would be faster and stricter - confirm.
filtered_df['AGG_PERIOD_START'] = pd.to_datetime(filtered_df['AGG_PERIOD_START'], format='mixed')

# Check the converted type on the first row, selected by position (.iloc[0]) rather
# than by a hard-coded index label that only exists for this exact set of input files
a = filtered_df['AGG_PERIOD_START'].iloc[0]
print(type(a))
print(filtered_df['AGG_PERIOD_START'].head(10))
print(filtered_df['AGG_PERIOD_START'].tail(10))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
76815   2013-04-12 01:05:00
76816   2013-04-12 01:05:00
76817   2013-04-12 01:10:00
76818   2013-04-12 01:10:00
76819   2013-04-12 01:15:00
76820   2013-04-12 01:15:00
76821   2013-04-12 01:20:00
76822   2013-04-12 01:20:00
76823   2013-04-12 01:25:00
76824   2013-04-12 01:25:00
Name: AGG_PERIOD_START, dtype: datetime64[ns]
13657912   2015-12-31 15:50:00
13657913   2015-12-31 15:50:00
13657914   2015-12-31 15:55:00
13657915   2015-12-31 15:55:00
13657916   2015-12-31 15:55:00
13657917   2015-12-31 15:55:00
13657918   2015-12-31 15:55:00
13657919   2015-12-31 15:55:00
13657920   2015-12-31 16:00:00
13657921   2015-12-31 16:00:00
Name: AGG_PERIOD_START, dtype: datetime64[ns]


### Filtering by lane direction

In [13]:
# Save the filtered data as Parquet; index=False leaves the DataFrame index out of the file
filtered_df.to_parquet(path='../Dados/filtered_data.parquet', index=False)