In [1]:
# Append the path to the src folder to the system path
# To be able to import the modules in the src folder
import sys
sys.path.append('..\\src\\')

# Environment Variables

environ = 'local'

# This will later be in Google Cloud Storage
#environ = 'gcp'
PROJECT_ID = 'csi5150_project'
CSC_BUCKET = f'{PROJECT_ID}_csc_data_bucket'

DATASETS_PATH = '..\\local_storage\\csc_data'

if environ == 'gcp':
    DATASETS_PATH = f'gs://{CSC_BUCKET}'


Feature Engineering

In [2]:
from scipy.signal import savgol_filter

import numpy as np


def savitzky_golay_filter(data, window_size=5, polynomial_order=3):
    """Applies a Savitzky-Golay filter to the data."""
    return savgol_filter(data, window_size, polynomial_order)

def differentiate(data, time):
    """Differentiates data with respect to time."""
    return np.gradient(data, time)

def add_steering_velocity(df):
    """Adds a new feature to the dataframe: steering velocity."""
    
    df = df.copy()
    
   #  assert (df['t'].diff() > 0).all(), "Timestamps are not strictly increasing!"

    # df.loc[:, 'steeringVelocity'] = differentiate(df['steeringAngleDeg'], df['t'])
    df['steeringVelocity'] = df['steeringAngleDeg'].diff() / (df['t'].diff() + 1e-8)

    # TODO: Add/explore preprocessing for steering velocity
    # Replace inf/-inf with the maximum/minimum finite values
    # df.replace([np.inf, -np.inf], np.nan, inplace=True)
    max_finite = df['steeringVelocity'][np.isfinite(df['steeringVelocity'])].max()
    min_finite = df['steeringVelocity'][np.isfinite(df['steeringVelocity'])].min()
    df['steeringVelocity'] = np.clip(df['steeringVelocity'], min_finite, max_finite)

    # df['steeringVelocity'] = df['steeringVelocity'].rolling(window=5, min_periods=1).mean()
    df.fillna(0, inplace=True)

    # Apply filter to remove noise
    df['steeringVelocity'] = savitzky_golay_filter(df['steeringVelocity'], 5, 2)
    # df['steeringVelocity'] = moving_average(df['steeringVelocity'], 1)

    return df

def add_steering_acceleration(df):
    """Adds a new feature to the dataframe: steering acceleration."""

    df = df.copy()

    # assert (df['t'].diff() > 0).all(), "Timestamps are not strictly increasing!"

    # df.loc[:, 'steeringAcceleration'] = differentiate(df['steeringVelocity'], df['t'])
    df['steeringAcceleration'] = df['steeringVelocity'].diff() / (df['t'].diff() + 1e-8)

    # df.replace([np.inf, -np.inf], np.nan, inplace=True)
    max_finite = df['steeringAcceleration'][np.isfinite(df['steeringAcceleration'])].max()
    min_finite = df['steeringVelocity'][np.isfinite(df['steeringAcceleration'])].min()
    df['steeringAcceleration'] = np.clip(df['steeringAcceleration'], min_finite, max_finite)

    # df['steeringAcceleration'] = df['steeringAcceleration'].rolling(window=5, min_periods=1).mean()
    df.fillna(0, inplace=True)

    # Apply filter to remove noise
    df['steeringAcceleration'] = savitzky_golay_filter(df['steeringAcceleration'], 5, 2)
    # df['steeringVelocity'] = moving_average(df['steeringVelocity'], 1)

    return df

In [3]:
import logging

from data.csc_data import CSCDataset
from data.df_preprocessing import (
  MinMaxScalerDP, CompositeDP, SequencesDP, 
  LowPassFilterDP, FeatureAdderDP, FeatureRemoverDP
)
from functions.constants import FEATURES

DATASET_NAME = 'LEXUS_RX_2020'
IS_PREDICTION = False


# TODO: These will be part of the training data pipeline in the actual code (not a notebook)
# When deploying the model, we will have access to velocity and acceleration (in CARLA)

# Not using MinMaxScalerDP for now (XGBoost is not sensitive to feature scaling)
train_preprocessor = CompositeDP([
    FeatureAdderDP([
        add_steering_velocity,
        add_steering_acceleration
    ]),
    FeatureRemoverDP(['t'])
])

label_preprocessor = None


dataset = CSCDataset(DATASET_NAME, 
                     FEATURES,
                     download=False,
                     train_preprocessor=train_preprocessor,
                     label_preprocessor=label_preprocessor,
                     is_prediction=IS_PREDICTION,
                     logging_level=logging.INFO)
total_dataset_size, num_csv = dataset.get_csv_metadata()

print(f"Total file size of dataset: {round(total_dataset_size, 3)} MB")
print(f"Number of CSV files in dataset: {num_csv}")

print(f"Number of samples in dataset: {len(dataset)}")



Total file size of dataset: 912.013 MB
Number of CSV files in dataset: 6500
Number of samples in dataset: 6500


In [5]:
from torch.utils.data import DataLoader

# Load all data at once
train_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)

# Extract features and labels
features_list, targets_list = [], []

for batch_idx, (X, y) in enumerate(train_loader):
    
    try:
        # Check if the data is in the correct format
        assert X.shape[0] == y.shape[0], "Number of samples in features and targets do not match!"
    except AssertionError as e:
        print(e)
    
    # Convert PyTorch tensors to NumPy
    features_list.append(X.numpy())  
    targets_list.append(y.numpy())


# Convert to NumPy arrays
# Shape: (num_samples, num_features)
X_np = np.vstack(features_list)

# Shape: (num_samples,)
y_np = np.hstack(targets_list)   

# Check data shape
print("Features shape:", X_np.shape)
print("Targets shape:", y_np.shape)


Features shape: (6500, 611, 6)
Targets shape: (6500, 611)


XGBoost Model

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split dataset (80% train, 20% test)
train_size = int(0.8 * len(X_np))
X_train, X_test = X_np[:train_size], X_np[train_size:]
y_train, y_test = y_np[:train_size], y_np[train_size:]


# TODO: Need to reshape into a 2D array for XGBoost (num_samples, num_features) - flatten the 3D array

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Train XGBoost Model
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_train, y_train, 
          eval_set=[(X_test, y_test)],
          verbose=True)

# Evaluate Model
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

X_train shape: (5200, 611, 6)
X_test shape: (1300, 611, 6)


ValueError: Please reshape the input data into 2-dimensional matrix.