In [1]:
import numpy as np
import pandas as pd
import pickle
import json
import os

from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

#### Data Preparation

In [13]:
# Prepare the train / test tables from the per-chunk output folders.
# Every folder is named "outputs/<start>_<end>" and spans CHUNK_SIZE consecutive ids.
CHUNK_SIZE = 5000
N_CHUNKS = 10
N_TRAIN_CHUNKS = 8  # first 8 chunks -> train, last 2 -> test (train-test: 4-1)

starts = [1 + CHUNK_SIZE * i for i in range(N_CHUNKS)]

target_subs = []
feature_subs = []

for start in starts:
    end = start + CHUNK_SIZE - 1
    folder = f"outputs/{start}_{end}"

    # Define data paths
    target_file = os.path.join(folder, "kpi", "kpi.csv")
    feature_files = [
        os.path.join(folder, "metadata", "metadata.csv"),
        os.path.join(folder, "metadata", "calibration.csv"),
        os.path.join(folder, "metadata", "ego_motion.csv"),
        os.path.join(folder, "metadata", "image_features.csv"),
    ]

    # Read data. The feature tables are glued together column-wise by row
    # position after their "image_id" column is dropped.
    # NOTE(review): this assumes all files of a folder list the images in the
    # same order; merging on "image_id" would be safer - confirm.
    target_df = pd.read_csv(target_file)
    feature_df = pd.concat(
        [pd.read_csv(fp).drop("image_id", axis=1) for fp in feature_files], axis=1
    )

    # Fail early with a readable message instead of an index-alignment error
    # (or silently shifted rows) when targets and features are stacked below.
    if len(feature_df) != len(target_df):
        raise ValueError(
            f"Row count mismatch in {folder}: {len(target_df)} target rows "
            f"vs {len(feature_df)} feature rows"
        )

    # Append to list
    target_subs.append(target_df)
    feature_subs.append(feature_df)

# Merge as train / test data.
# ignore_index=True gives every stacked table a fresh 0..n-1 index; without it
# the per-chunk indices repeat and the column-wise concat has to align on
# duplicated labels.
target_data_train = pd.concat(target_subs[:N_TRAIN_CHUNKS], axis=0, ignore_index=True)
feature_data_train = pd.concat(feature_subs[:N_TRAIN_CHUNKS], axis=0, ignore_index=True)

data_train = pd.concat([target_data_train, feature_data_train], axis=1)
print(f"Train Size: {len(data_train)}")

target_data_test = pd.concat(target_subs[N_TRAIN_CHUNKS:], axis=0, ignore_index=True)
feature_data_test = pd.concat(feature_subs[N_TRAIN_CHUNKS:], axis=0, ignore_index=True)

data_test = pd.concat([target_data_test, feature_data_test], axis=1)
print(f"Test Size: {len(data_test)}")

Train Size: 34594
Test Size: 8683


In [14]:
# Write the merged train / test tables to CSV (row index is not stored).
# NOTE(review): the training cell loads "regression_train.csv" /
# "regression_test.csv", not the files written here - confirm which names
# are intended.
for split_frame, csv_name in (
    (data_train, "classification_train.csv"),
    (data_test, "classification_test.csv"),
):
    split_frame.to_csv(csv_name, index=False)

In [5]:
def get_data(train_filename: str, test_filename: str, target: str, features: list) -> dict:
    """Load the train and test CSV files and split them into model inputs.

    Args:
        train_filename: Path of the CSV file holding the training rows.
        test_filename: Path of the CSV file holding the test rows.
        target: Name of the column to predict.
        features: Names of the columns used as model inputs; the column
            order of the returned feature arrays follows this list.

    Returns:
        A dict with the keys 'train' and 'test'. Each value is a dict with
        'features' (2-D NumPy array) and 'target' (1-D NumPy array).
    """
    # Read both files before slicing any columns.
    frames = {
        split: pd.read_csv(path)
        for split, path in (('train', train_filename), ('test', test_filename))
    }

    return {
        split: {
            'features': frame[features].to_numpy(),
            'target': frame[target].to_numpy(),
        }
        for split, frame in frames.items()
    }

#### Pre-Training Setup

In [6]:
def get_krr() -> tuple:
    """Create a plain KernelRidge model together with its search grid.

    Returns:
        A ``(model, param_grid)`` tuple: an unfitted ``KernelRidge`` with
        default settings and the dict of candidate hyper-parameter values
        (polynomial kernel, fixed alpha and gamma, degree 3 or 4).
    """
    # Candidate values explored by the grid search
    search_space = dict(
        kernel=["poly"],
        alpha=[0.1],
        gamma=[0.01],
        degree=[3, 4],
    )
    return KernelRidge(), search_space

In [7]:
def get_gridsearch(
    model,
    param_grid: dict,
    cv: int = 8,
    scoring: str = 'neg_mean_squared_error',
    n_jobs: int = 2,
) -> GridSearchCV:
    """Wrap an estimator in an exhaustive, cross-validated grid search.

    Args:
        model: Estimator or pipeline to tune.
        param_grid: Mapping from parameter name to the list of values to try.
        cv: Number of cross-validation folds (default 8).
        scoring: Metric used to rank candidates (default: negative mean
            squared error, so larger is better).
        n_jobs: Number of parallel worker processes (default 2).

    Returns:
        An unfitted ``GridSearchCV`` object.
    """
    return GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)

In [11]:
def get_pca_pipeline(numeric_columns=None, categorical_columns=None) -> tuple:
    """Build the preprocessing -> PCA -> KernelRidge pipeline and its search grid.

    Args:
        numeric_columns: Positions of the feature-array columns to
            standardise. Defaults to positions 2-7.
        categorical_columns: Positions of the feature-array columns to
            one-hot encode. Defaults to positions 0-1.

    Returns:
        A ``(pipeline, param_grid)`` tuple. The grid keys carry the
        ``pca__`` / ``krr__`` step prefixes expected by ``GridSearchCV``.

    Note:
        Columns listed in neither argument are discarded by the
        ``ColumnTransformer`` (its default is ``remainder="drop"``), so the
        positions must cover every feature that should reach the model.
    """
    if numeric_columns is None:
        numeric_columns = list(range(2, 8))
    if categorical_columns is None:
        categorical_columns = list(range(0, 2))

    # Standardise numeric inputs and one-hot encode categorical ones.
    # sparse_threshold=0 forces a dense result so PCA never receives the
    # sparse matrix that OneHotEncoder produces by default.
    preprocessor = ColumnTransformer(
        [
            ("num", StandardScaler(), numeric_columns),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ],
        sparse_threshold=0,
    )

    # Define PCA to reduce the number of features
    pca = PCA()

    # Define the KernelRidge model
    krr = KernelRidge(kernel="poly")

    # Define the parameter grid.
    # The gamma list used to contain 0.01 twice, which fitted every
    # candidate two times without widening the search.
    # NOTE(review): a second, different gamma (e.g. 0.1) may have been
    # intended - confirm.
    param_grid = {
        "pca__n_components": [2, 4, 6],
        "krr__alpha": [0.1, 0.01],
        "krr__gamma": [0.01],
        "krr__degree": [3, 4],
    }

    # Assemble the pipeline
    pipe = Pipeline(steps=[("pre", preprocessor), ("pca", pca), ("krr", krr)])

    return pipe, param_grid

#### Training

In [9]:
def _regression_metrics(y_true, y_pred) -> dict:
    """Return MAE, MSE and R^2 of the predictions as plain floats."""
    return {
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "mse": float(mean_squared_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred)),
    }


def _best_krr_param(best_params: dict, name: str):
    """Look up a KernelRidge parameter with or without the 'krr__' pipeline prefix."""
    if name in best_params:
        return best_params[name]
    return best_params[f"krr__{name}"]


def train(hyper_model: GridSearchCV, data: dict, name: str, features: list) -> BaseEstimator:
    """Run the grid search, report train / test quality and persist the result.

    Args:
        hyper_model: Unfitted grid search over a KernelRidge model or a
            pipeline whose KernelRidge step is named ``krr``.
        data: Dict with 'train' and 'test' entries, each holding 'features'
            and 'target' arrays.
        name: File stem; ``<name>.json`` receives the summary and
            ``<name>.pickle`` the fitted grid search object.
        features: Feature names, stored in the JSON summary for reference.

    Returns:
        The best estimator found by the grid search.

    Raises:
        KeyError: If the best parameters contain neither ``alpha`` / ``gamma``
            nor ``krr__alpha`` / ``krr__gamma``.
    """
    # Train the model
    hyper_model.fit(data['train']['features'], data['train']['target'])

    # Use the trained model to predict
    y_pred_test = hyper_model.predict(data['test']['features'])
    y_pred_train = hyper_model.predict(data['train']['features'])

    # Assess the model quality
    test_metrics = _regression_metrics(data['test']['target'], y_pred_test)
    train_metrics = _regression_metrics(data['train']['target'], y_pred_train)

    for title, metrics in (('Test', test_metrics), ('Train', train_metrics)):
        print(f'------- {title} -------')
        print(f'r2: {metrics["r2"]:.3f} | mean absolote error: {metrics["mae"]:.3f} | mean squared error: {metrics["mse"]:.3f}')

    best_params = hyper_model.best_params_
    print('------- Best Params -------')
    print(best_params)

    # Record the parameters and scores (always, regardless of the test r2).
    # Parameter names differ between the bare model ("alpha") and the
    # pipeline ("krr__alpha"); the helper accepts both without a bare except.
    summary = {
        "best_params": {
            "alpha": _best_krr_param(best_params, "alpha"),
            "gamma": _best_krr_param(best_params, "gamma"),
            "kernel": "polynomial",
        },
        "train": train_metrics,
        "test": test_metrics,
        "feature": features,
    }

    with open(f"{name}.json", mode='w') as f:
        json.dump(summary, f, indent=2)

    with open(f"{name}.pickle", mode='wb') as f:
        pickle.dump(hyper_model, f)

    return hyper_model.best_estimator_

In [12]:
# Candidate feature columns. Entries that are commented out are excluded from
# this run; the order of the active entries defines the column positions of
# the feature array returned by get_data.
# NOTE(review): get_pca_pipeline() addresses columns by position (0-1 one-hot
# encoded, 2-7 standardised). With the 10 active entries below, positions 8
# and 9 ("edge_density", "entropy") are assigned to no transformer and are
# dropped by ColumnTransformer's default remainder="drop", although they are
# still written to the JSON summary - confirm this is intended.
features = [
    # "country_code",
    "weather",
    # "road_type",
    # "road_condition",
    "time_of_day",
    # "num_vehicles",
    # "longitude",
    # "latitude",
    "solar_angle_elevation",
    # "focal_length_x",
    # "focal_length_y",
    # "principle_point_x",
    # "principle_point_y",
    # "camera_pose_x",
    # "camera_pose_y",
    # "camera_pose_z",
    # "camera_pose_yaw",
    # "camera_pose_pitch",
    # "camera_pose_roll",
    # "horizontal_fov",
    # "vertical_fov",
    # "ego_pose_x",
    # "ego_pose_y",
    # "ego_pose_z",
    # "ego_pose_yaw",
    # "ego_pose_pitch",
    # "ego_pose_roll",
    # "speed_var",
    # "mean_jerk",
    # "max_jerk",
    # "st_jerk",
    # "mean_angular_acc",
    # "max_angular_acc",
    # "st_angular_acc",
    # "mean_lateral_acc",
    # "max_lateral_acc",
    # "st_lateral_acc",
    "luminance",
    "contrast",
    "saturation",
    "sharpness",
    "temperature",
    "edge_density", 
    "entropy"
]

# Perform training: predict the "accuracy" column from the selected features.
# NOTE(review): the save cell above writes "classification_train.csv" /
# "classification_test.csv", while this cell reads "regression_*.csv" - confirm
# the files loaded here exist and come from the intended export.
data = get_data("regression_train.csv", "regression_test.csv", "accuracy", features)

pca_pipeline, param_grid = get_pca_pipeline()
# Alternative setup: tune the bare KernelRidge model instead of the pipeline.
# model, param_grid = get_krr()
hyper_model = get_gridsearch(pca_pipeline, param_grid)
# Writes krr_polynomial.json and krr_polynomial.pickle as a side effect.
# NOTE(review): the recorded run reports r2 of about 0.01 on the test split and
# 0.02 on the train split, i.e. barely better than predicting the mean.
best_model = train(hyper_model, data, "krr_polynomial", features)

------- Test -------
r2: 0.008 | mean absolote error: 0.149 | mean squared error: 0.034
------- Train -------
r2: 0.024 | mean absolote error: 0.151 | mean squared error: 0.034
------- Best Params -------
{'krr__alpha': 0.1, 'krr__degree': 3, 'krr__gamma': 0.01, 'pca__n_components': 6}
