# Merge and filter data
-  Read all the sensor data
- Filter for only data from sensors present in our model
- Properly read the timestamps
- Aggregate all the processed data into a new dataframe

In [20]:
import pandas as pd

# Input CSV files (paths relative to the notebook)
files = ['./Dataset/2013AEDL.csv', './Dataset/1S2014AEDL.csv', './Dataset/2S2014AEDL.csv', './Dataset/1P2015AEDL.csv', './Dataset/2P2015AEDL.csv']

# IDs of the sensors present in our model
target_ids = [121726,121727,121731,121732,121733,121734,121735,121736,121741,121742,121754,121755,121756]

# Column used for filtering: only rows whose value is in target_ids are kept
col = 'EQUIPMENTID'

# Collect the filtered rows of each file and concatenate ONCE after the loop.
# Calling pd.concat inside the loop re-copies every previously accumulated
# row on each iteration, which is wasteful for multi-million-row inputs.
matching_frames = []

for file in files:
    df = pd.read_csv(file)

    # Convert data types before filtering
    df["AGG_PERIOD_START"] = pd.to_datetime(df["AGG_PERIOD_START"])
    df["LANE_BUNDLE_DIRECTION"] = df["LANE_BUNDLE_DIRECTION"].astype("string")

    # Keep only the rows that belong to one of the target sensors
    matching_df = df[df[col].isin(target_ids)]
    matching_frames.append(matching_df)

    print(f"Original rows: {len(df)}")
    print(f"Filtered rows: {len(matching_df)}")

# Single concatenation; ignore_index gives a fresh 0..n-1 RangeIndex
filtered_df = pd.concat(matching_frames, ignore_index=True)

filtered_df.info()

Original rows: 3890630
Filtered rows: 2063480
Original rows: 2251668
Filtered rows: 1153163
Original rows: 2745968
Filtered rows: 1312094
Original rows: 1315265
Filtered rows: 648009
Original rows: 3881466
Filtered rows: 1917689
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7094435 entries, 0 to 7094434
Data columns (total 20 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   AGGREGATE_BY_LANE_BUNDLEID  int64         
 1   AGG_ID                      int64         
 2   EQUIPMENTID                 int64         
 3   AGG_PERIOD_START            datetime64[ns]
 4   AGG_PERIOD_LEN_MINS         int64         
 5   NR_LANES                    int64         
 6   LANE_BUNDLE_DIRECTION       string        
 7   TOTAL_VOLUME                int64         
 8   AVG_SPEED_ARITHMETIC        float64       
 9   AVG_SPEED_HARMONIC          float64       
 10  AVG_LENGTH                  float64       
 11  AVG_SPACING                 f

In [21]:
# Persist the merged, sensor-filtered data so later steps can reload it
# without re-reading the raw files. The row index is not written.
filtered_csv_path = "./Dataset/filtered_data.csv"
filtered_df.to_csv(filtered_csv_path, index=False)
# Parquet alternative (currently disabled):
# filtered_df.to_parquet('./Dados/filtered_data.parquet', index=False)

# Transform data for MAB

## Eliminate useless columns

In [None]:
# Work on an independent (deep) copy of the concatenated AEDL data so that
# filtered_df itself is left unmodified by the following transformations.
df = filtered_df.copy(deep=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7094435 entries, 0 to 7094434
Data columns (total 20 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   AGGREGATE_BY_LANE_BUNDLEID  int64         
 1   AGG_ID                      int64         
 2   EQUIPMENTID                 int64         
 3   AGG_PERIOD_START            datetime64[ns]
 4   AGG_PERIOD_LEN_MINS         int64         
 5   NR_LANES                    int64         
 6   LANE_BUNDLE_DIRECTION       string        
 7   TOTAL_VOLUME                int64         
 8   AVG_SPEED_ARITHMETIC        float64       
 9   AVG_SPEED_HARMONIC          float64       
 10  AVG_LENGTH                  float64       
 11  AVG_SPACING                 float64       
 12  OCCUPANCY                   float64       
 13  LIGHT_VEHICLE_RATE          float64       
 14  VOLUME_CLASSE_A             int64         
 15  VOLUME_CLASSE_B             int64         
 16  VOLUME_CLASSE_C   

In [49]:
df = filtered_df.copy()  # concatenated AEDL data

# Columns removed to obtain the intermediate ("more complete") dataset
dropped_for_half_complete = [
    "AGGREGATE_BY_LANE_BUNDLEID",
    "AGG_ID",
    "AGG_PERIOD_LEN_MINS",
    "NR_LANES",
    "AVG_LENGTH",
    "OCCUPANCY",
    "VOLUME_CLASSE_A",
    "VOLUME_CLASSE_B",
    "VOLUME_CLASSE_C",
    "VOLUME_CLASSE_D",
    "VOLUME_CLASSE_0",
    "AXLE_CLASS_VOLUMES",
]
half_complete_df = df.drop(columns=dropped_for_half_complete)
half_complete_df.to_csv("./Dataset/more_complete_data.csv", index=False)

# Columns additionally removed to obtain the simplest dataset
dropped_for_simple = [
    "AVG_SPEED_ARITHMETIC",
    "AVG_SPEED_HARMONIC",
    "AVG_SPACING",
    "LIGHT_VEHICLE_RATE",
]
simple_df = half_complete_df.drop(columns=dropped_for_simple)

# Write simple_df (not filtered_df) so that simplest_data.csv really contains
# the reduced column set instead of a duplicate of the full filtered data.
simple_df.to_csv("./Dataset/simplest_data.csv", index=False)

## Create tags for arms
### (gather all data from the same sensor, the same time interval and the same direction)

In [50]:
# Assign every row to a time-of-day bin; the bins are later combined with
# sensor and direction to form the arms.
# Arm = (time_bin, sensor_id, direction)

# Minutes elapsed since midnight for each aggregation period start
period_start = simple_df["AGG_PERIOD_START"]
minutes_since_midnight = period_start.dt.hour * 60 + period_start.dt.minute

# Half-hour bins: values 0..47
simple_df["half_time_bin"] = minutes_since_midnight.floordiv(30).astype("int8")

# Five-minute bins: values 0..287
simple_df["og_time_bin"] = minutes_since_midnight.floordiv(5).astype("int16")

# Example for the 5-minute bins: 00:00–00:04 → 0, 00:05–00:09 → 1, …,
# 23:55–23:59 → 287, regardless of the date. Grouping by these bins lets us
# compare flows at the same time of day across different dates.

In [51]:
# Build the arm identifiers as (time bin, sensor, direction) tuples.
#   og_arm   uses the 5-minute bin
#   half_arm uses the 30-minute bin
arm_time_bins = {"og_arm": "og_time_bin", "half_arm": "half_time_bin"}

for arm_col, bin_col in arm_time_bins.items():
    arm_keys = zip(
        simple_df[bin_col],
        simple_df["EQUIPMENTID"],
        simple_df["LANE_BUNDLE_DIRECTION"],
    )
    simple_df[arm_col] = list(arm_keys)

In [52]:
# Number of distinct 30-minute arms
half_arm_keys = simple_df["half_arm"]
half_arm_keys.nunique()

1248

In [53]:
# Number of distinct 5-minute arms
og_arm_keys = simple_df["og_arm"]
og_arm_keys.nunique()

7488

# Train/Test Split

In [54]:
# Chronological train/test split of the simplified data.
# Sort simple_df itself: sorting `df` here would overwrite simple_df with the
# full-column frame and discard the time-bin and arm columns created above.
simple_df = simple_df.sort_values("AGG_PERIOD_START")

# Rows strictly before the cutoff are used for training, the rest for testing
cutoff_date = pd.Timestamp("2015-01-01")
simple_train_df = simple_df[simple_df["AGG_PERIOD_START"] < cutoff_date]
simple_test_df = simple_df[simple_df["AGG_PERIOD_START"] >= cutoff_date]