In [None]:
import pandas as pd

In [None]:
# Columns requested from the cleaned parquet file. The order of this list is
# the order in which the columns are asked for when the file is read.
PRIMARY_COLUMNS = [
    # Packs 1-4 each report the same three signals.
    *(
        f"BATTERY_{pack}_{signal}"
        for pack in (1, 2, 3, 4)
        for signal in ("CURRENT", "TEMP", "VOLTAGE")
    ),
    # Pack 5 only appears with a voltage column.
    "BATTERY_5_VOLTAGE",
    "BATTERY_COOLING_TEMP",
    "BATTERY_SOC",
    "BATTERY_SOH",
    "ERRORS",
    "ERROR_SIZE",
    "TIMESTAMP_KAFKA",
    "TIMESTAMP_TRUNC",
    "TIMESTAMP_VEHICLE",
    "VEHICLE_GPS_SPEED",
    "VEHICLE_GPS_X",
    "VEHICLE_GPS_Y",
    "VEHICLE_ID",
    "VEHICLE_OUTSIDE_TEMP",
    "VEHICLE_SPEED",
]

In [None]:
# Load only the columns listed in PRIMARY_COLUMNS from the cleaned dataset.
df = pd.read_parquet(
    "../data/clean_data.parquet",
    columns=PRIMARY_COLUMNS,
    engine="pyarrow",
)

# Flatten the ERRORS text: drop newlines, then remove the `[  "` opener, the
# `"]` closer and any bare `[]`. Series.replace(..., regex=True) substitutes
# the matching substring, and the patterns are unanchored, so they are removed
# wherever they occur in the value.
# The patterns are raw strings: `\[` is not a valid escape in a normal string
# literal and raises a SyntaxWarning on Python 3.12+.
# NOTE(review): presumably ERRORS is a pretty-printed JSON-style list; quotes
# and commas between multiple elements are left untouched - confirm intended.
df["ERRORS"] = (
    df["ERRORS"]
    .str.replace("\n", "", regex=False)
    .replace(r'\[  "', "", regex=True)
    .replace(r'"\]', "", regex=True)
    .replace(r"\[\]", "", regex=True)
)

In [None]:
# Reducer applied to each column inside a (vehicle, minute) bucket.
# Key order is kept deliberately: it is the column order of the aggregated frame.
agg_functions = {
    # Battery measurements are averaged over the minute.
    **dict.fromkeys(
        [
            # Currents
            "BATTERY_1_CURRENT",
            "BATTERY_2_CURRENT",
            "BATTERY_3_CURRENT",
            "BATTERY_4_CURRENT",
            # Temperatures
            "BATTERY_1_TEMP",
            "BATTERY_2_TEMP",
            "BATTERY_3_TEMP",
            "BATTERY_4_TEMP",
            "BATTERY_COOLING_TEMP",
            # Voltages
            "BATTERY_1_VOLTAGE",
            "BATTERY_2_VOLTAGE",
            "BATTERY_3_VOLTAGE",
            "BATTERY_4_VOLTAGE",
            "BATTERY_5_VOLTAGE",
            # State of Charge / State of Health
            "BATTERY_SOC",
            "BATTERY_SOH",
        ],
        "mean",
    ),
    # Errors: keep the distinct ERRORS values seen in the bucket (not a sum),
    # and add up the reported error sizes.
    "ERRORS": "unique",
    "ERROR_SIZE": "sum",
    # Timestamps: keep the last value in the bucket.
    "TIMESTAMP_KAFKA": "last",
    "TIMESTAMP_TRUNC": "last",
    # Vehicle signals are averaged as well.
    **dict.fromkeys(
        [
            "VEHICLE_GPS_SPEED",
            "VEHICLE_GPS_X",
            "VEHICLE_GPS_Y",
            "VEHICLE_OUTSIDE_TEMP",
            "VEHICLE_SPEED",
        ],
        "mean",
    ),
}

# pd.Grouper(freq=...) with no key buckets on the index, so TIMESTAMP_VEHICLE
# is moved into the index before grouping per vehicle and per minute.
df_with_index = df.set_index("TIMESTAMP_VEHICLE")
df_minute = (
    df_with_index
    .groupby(["VEHICLE_ID", pd.Grouper(freq="1Min")])
    .agg(agg_functions)
)

# Preview the minute-level result.
df_minute.head()

In [None]:
# Turn the (VEHICLE_ID, TIMESTAMP_VEHICLE) group keys back into ordinary columns.
# Guarded so that re-running this cell is a no-op: once the index has been
# flattened, another reset_index() would insert a spurious "index" column.
if isinstance(df_minute.index, pd.MultiIndex):
    df_minute = df_minute.reset_index()

In [None]:
import numpy as np

In [None]:
def _join_errors(value):
    """Collapse one aggregated ERRORS cell into a single string.

    Parameters
    ----------
    value : list, tuple, pd.Series, np.ndarray or scalar
        The cell content. After the ``'unique'`` aggregation this is a
        collection of distinct values; a scalar (for example a string
        produced by an earlier run of this cell) is also accepted.

    Returns
    -------
    str or None
        For a collection: the non-blank, non-missing entries joined with
        ``", "`` (an empty string when nothing remains). For a scalar: its
        string form, or ``None`` when it is missing or blank.
    """

    def _is_missing(item):
        # is_scalar() first: pd.isna() on a nested container returns an array,
        # which has no single truth value.
        return pd.api.types.is_scalar(item) and pd.isna(item)

    if isinstance(value, (list, tuple, pd.Series, np.ndarray)):
        # Skip None/NaN so they are not written out as the text "None"/"nan".
        return ", ".join(
            str(item)
            for item in value
            if not _is_missing(item) and str(item).strip() != ""
        )

    # A missing scalar stays missing instead of becoming the string "None",
    # which also keeps a second run of this cell from corrupting empty cells.
    if _is_missing(value):
        return None

    text = str(value)
    # NOTE(review): a blank scalar maps to None while an empty collection maps
    # to "" - kept as in the original; confirm which one downstream expects.
    return text if text.strip() != "" else None


df_minute["ERRORS"] = df_minute["ERRORS"].apply(_join_errors)

In [None]:
# Display the (rows, columns) size of the minute-level frame as a quick check.
df_minute.shape

In [None]:
# Write the minute-level frame to a parquet file under ../data.
df_minute.to_parquet("../data/clean_data_minute.parquet")