In [1]:
import json
from typing import TypedDict

import numpy as np
from constants import DATA_DIR
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from astrofit.utils import AsteroidLoader

In [2]:
# JSON file inside the data directory that holds the frequency data loaded in the next cell.
ASTEROIDS_FREQ_DATA_PATH = DATA_DIR / "asteroids_freq_data.json"

# Loader rooted at the same data directory.
asteroid_loader = AsteroidLoader(DATA_DIR)

In [3]:
# Read the whole frequency-data payload into memory.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that decoding does not
# depend on the platform's locale default.
with open(ASTEROIDS_FREQ_DATA_PATH, "r", encoding="utf-8") as f:
    asteroids_freq_data = json.load(f)

In [4]:
# Split the payload into its two top-level sections.
config = asteroids_freq_data["config"]
asteroids_data = asteroids_freq_data["asteroids"]

In [5]:
# Keep only the asteroids that are not flagged as failed, then report the share that was dropped.
filtered_data = {
    asteroid_name: record
    for asteroid_name, record in asteroids_data.items()
    if not record["is_failed"]
}
print(f"Filtered {len(filtered_data)} asteroids ({100*(len(asteroids_data) - len(filtered_data)) / len(asteroids_data):.2f}% failed)")

Filtered 2662 asteroids (40.41% failed)


In [6]:
class AsteroidData(TypedDict):
    """Schema of one per-asteroid record taken from the "asteroids" section of the loaded JSON.

    `freq_features` and `pow_features` are parallel: they hold the same number of sequences,
    and each sequence has the same length in both.
    """
    # True when this asteroid is flagged as failed; such records are filtered out before modelling.
    is_failed: bool
    # NOTE(review): presumably the failure reason, None on success — confirm against the producer.
    reason: str | None
    # Value used as the regression target by the models in this notebook.
    # NOTE(review): the unit is not visible here — confirm.
    period: float
    # NOTE(review): presumably the time spent producing this record; unit not visible here — confirm.
    processing_time: float
    freq_features: list[list[float]]  # 1 - 4 sequences of 50 floats (freqs) from 0 to 12
    pow_features: list[list[float]]  # 1 - 4 sequences of 50 floats (pows) from 0 to 1 (the same shape as freq_features)

In [7]:
# Re-wrap every surviving record as an AsteroidData so later cells work against the typed schema.
# A TypedDict constructor builds a plain dict and performs no runtime validation of keys or values.
filtered_data = {
    asteroid_name: AsteroidData(**record) for asteroid_name, record in filtered_data.items()
}

In [8]:
# Spread of the target over all retained asteroids: min, 5%, quartiles, 95% and max.
np.percentile(
    [record["period"] for record in filtered_data.values()],
    [0, 5, 25, 50, 75, 95, 100],
)

array([ 2.134122 ,  3.5124675,  5.3783405,  8.004365 , 12.93966  ,
       25.9748   , 39.8999   ])

In [9]:
# Hold out 20% of the asteroid names, then cut that hold-out into validation (67%) and test (33%).
# Both splits use the same fixed seed so the partition is reproducible.
train_keys, val_test_keys = train_test_split(
    list(filtered_data),
    test_size=0.2,
    random_state=884288,
)
val_keys, test_keys = train_test_split(
    val_test_keys,
    test_size=0.33,
    random_state=884288,
)

# Report each split's size and its share (in percent) of the retained asteroids.
print(f"Train: {len(train_keys)} asteroids ({100 * len(train_keys) / len(filtered_data):.2f})")
print(f"Validation: {len(val_keys)} asteroids ({100*len(val_keys) / len(filtered_data):.2f})")
print(f"Test: {len(test_keys)} asteroids ({100 * len(test_keys) / len(filtered_data):.2f})")

Train: 2129 asteroids (79.98)
Validation: 357 asteroids (13.41)
Test: 176 asteroids (6.61)


In [10]:
# Materialise each split as a name -> record mapping, keeping the order of its key list.
train_set = {key: filtered_data[key] for key in train_keys}
val_set = {key: filtered_data[key] for key in val_keys}
test_set = {key: filtered_data[key] for key in test_keys}

In [11]:
# Print the same percentile summary of the target for every split so their distributions can be compared.
for heading, subset in (
    ("Train periods:", train_set),
    ("\nValidation periods:", val_set),
    ("\nTest periods:", test_set),
):
    print(heading)
    print(np.percentile([record["period"] for record in subset.values()], [0, 5, 25, 50, 75, 95, 100]))

Train periods:
[ 2.134122  3.54028   5.385282  8.04945  13.0339   26.0133   39.8999  ]

Validation periods:
[ 2.68212    3.4877328  5.363025   7.81773   12.68499   25.74368
 38.7804   ]

Test periods:
[ 2.738963    3.70729275  5.432665    7.776705   12.583375   25.9855
 39.848     ]


In [48]:
def _extract_features(data_set: dict[str, AsteroidData]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Collect the frequency and power features of every record as NumPy arrays.

    Args:
        data_set: Mapping of asteroid name to its record.

    Returns:
        Two lists in the iteration order of ``data_set``: the first holds one array per record
        built from ``"freq_features"``, the second one array per record built from ``"pow_features"``.
    """
    freq_arrays: list[np.ndarray] = []
    pow_arrays: list[np.ndarray] = []
    for record in data_set.values():
        freq_arrays.append(np.array(record["freq_features"]))
        pow_arrays.append(np.array(record["pow_features"]))
    return freq_arrays, pow_arrays


def _standardize_features(features: list[np.ndarray], scaler: StandardScaler) -> list[np.ndarray]:
    """Apply an already-fitted scaler to a list of variable-length arrays.

    All arrays are stacked along the first axis, transformed in a single call and then cut
    back into pieces with the original lengths.

    Args:
        features: Arrays that can be concatenated along axis 0 (same trailing dimensions).
        scaler: Fitted scaler whose ``transform`` is applied to the stacked rows.

    Returns:
        One transformed array per input array, in the same order and with the same row counts.
    """
    scaled = scaler.transform(np.concatenate(features))

    # Cumulative row counts mark where each original array starts and ends in the stacked result.
    bounds = np.cumsum([0] + [len(seq) for seq in features])
    return [scaled[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]  # type: ignore


# Combine freq and pow features
def _combine_features(freq: list[np.ndarray], pow: list[np.ndarray]) -> list[np.ndarray]:
    return [np.stack((f, p), axis=-1) for f, p in zip(freq, pow)]


def standardize_asteroid_data(
    train_set: dict[str, AsteroidData], val_set: dict[str, AsteroidData], test_set: dict[str, AsteroidData]
) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], StandardScaler, StandardScaler]:
    """Standardize the frequency and power features of all three splits.

    Two scalers (one for frequencies, one for powers) are fitted on the training split only
    and then applied to the training, validation and test splits.

    Args:
        train_set: Training records; the only split the scalers are fitted on.
        val_set: Validation records.
        test_set: Test records.

    Returns:
        ``(train_std, val_std, test_std, freq_scaler, pow_scaler)``, where each ``*_std`` is a list
        with one combined frequency/power array per record, followed by the two fitted scalers.
    """
    # (freq, pow) feature lists for train, validation and test, in that order.
    per_split = [_extract_features(subset) for subset in (train_set, val_set, test_set)]

    # Fit on the stacked training rows only, so no statistics leak from validation or test.
    train_freq, train_pow = per_split[0]
    freq_scaler = StandardScaler().fit(np.concatenate(train_freq))
    pow_scaler = StandardScaler().fit(np.concatenate(train_pow))

    train_std, val_std, test_std = [
        _combine_features(
            _standardize_features(split_freq, freq_scaler),
            _standardize_features(split_pow, pow_scaler),
        )
        for split_freq, split_pow in per_split
    ]

    return train_std, val_std, test_std, freq_scaler, pow_scaler


def prepare_for_xgboost(data: list[np.ndarray], max_len: int = 4) -> np.ndarray:
    """Flatten variable-length samples into one fixed-width, NaN-padded matrix.

    Each sample is reshaped into rows of 100 values (one row per sequence), padded with
    NaN rows up to ``max_len`` rows, and flattened into a single feature vector.

    Args:
        data: One array per sample whose size is a multiple of 100; it is reshaped to
            ``(n_sequences, 100)``.
        max_len: Maximum number of sequences per sample; shorter samples are NaN-padded.

    Returns:
        Array of shape ``(len(data), max_len * 100)``.

    Raises:
        ValueError: If a sample holds more than ``max_len`` sequences.
    """
    row_width = 100  # values per sequence after flattening
    n_features = max_len * row_width

    if len(data) == 0:
        return np.empty((0, n_features))

    padded = []
    for sample in data:
        rows = sample.reshape(-1, row_width)
        if len(rows) > max_len:
            raise ValueError(f"sample has {len(rows)} sequences, which exceeds max_len={max_len}")
        padded.append(
            np.pad(rows, ((0, max_len - len(rows)), (0, 0)), mode="constant", constant_values=np.nan)
        )

    # The row width is derived from max_len so that a non-default max_len is honoured.
    return np.stack(padded).reshape(len(data), n_features)

In [56]:
# Standardize all three splits with scalers fitted on the training split only.
train_std, val_std, test_std, freq_scaler, pow_scaler = standardize_asteroid_data(
    train_set, val_set, test_set
)

# Flatten every asteroid into one fixed-width, NaN-padded feature row.
X_train, X_val, X_test = (
    prepare_for_xgboost(split) for split in (train_std, val_std, test_std)
)

# Regression targets, iterated in the same order as the feature rows.
y_train, y_val, y_test = (
    np.array([asteroid["period"] for asteroid in subset.values()])
    for subset in (train_set, val_set, test_set)
)

In [57]:
# Baseline: default-parameter regressor with up to 1000 trees, stopped early once the
# validation metric has not improved for 100 rounds.
model: XGBRegressor = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    early_stopping_rounds=100,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Predict on the held-out test split with the fitted baseline.
predictions = model.predict(X_test)

[0]	validation_0-rmse:7.07857
[1]	validation_0-rmse:6.98654
[2]	validation_0-rmse:6.96357
[3]	validation_0-rmse:7.05208
[4]	validation_0-rmse:6.99748
[5]	validation_0-rmse:7.00773
[6]	validation_0-rmse:7.00307
[7]	validation_0-rmse:6.98369
[8]	validation_0-rmse:6.96335
[9]	validation_0-rmse:6.96571
[10]	validation_0-rmse:6.97979
[11]	validation_0-rmse:6.98959
[12]	validation_0-rmse:7.02614
[13]	validation_0-rmse:7.01322
[14]	validation_0-rmse:7.00392
[15]	validation_0-rmse:7.04181
[16]	validation_0-rmse:7.03805
[17]	validation_0-rmse:7.02167
[18]	validation_0-rmse:7.00932
[19]	validation_0-rmse:7.02791
[20]	validation_0-rmse:7.01104
[21]	validation_0-rmse:7.01789
[22]	validation_0-rmse:7.01972
[23]	validation_0-rmse:7.01291
[24]	validation_0-rmse:6.99187
[25]	validation_0-rmse:6.98453
[26]	validation_0-rmse:6.99433
[27]	validation_0-rmse:7.00461
[28]	validation_0-rmse:7.00834
[29]	validation_0-rmse:7.04024
[30]	validation_0-rmse:7.03474
[31]	validation_0-rmse:7.01991
[32]	validation_0-

In [58]:
# Hyperopt search space for the XGBRegressor that `objective` builds.
# - hp.quniform yields floats, so the integer-valued parameters are cast to int inside `objective`.
# - hp.loguniform takes its bounds in log space, hence the np.log(...) wrappers around the real bounds.
space = {
    "max_depth": hp.quniform("max_depth", 3, 18, 1),
    "gamma": hp.loguniform("gamma", np.log(1e-3), np.log(5)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(1e-3), np.log(100)),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(1e-3), np.log(100)),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "min_child_weight": hp.loguniform("min_child_weight", np.log(1e-3), np.log(10)),
    "n_estimators": hp.quniform("n_estimators", 100, 2000, 1),
    "eta": hp.loguniform("eta", np.log(1e-3), np.log(1)),
    "subsample": hp.uniform("subsample", 0.5, 1),
    # Each option is a (policy, max_leaves) pair; max_leaves is only sampled on the "lossguide" branch.
    "grow_policy": hp.choice("grow_policy", [("depthwise", None), ("lossguide", hp.quniform("max_leaves", 0, 256, 1))]),
}

In [59]:
# NOTE(review): this cell repeats the data preparation already done before the baseline model;
# consider keeping only one copy.
train_std, val_std, test_std, freq_scaler, pow_scaler = standardize_asteroid_data(
    train_set, val_set, test_set
)

# One fixed-width, NaN-padded feature row per asteroid.
X_train, X_val, X_test = map(prepare_for_xgboost, (train_std, val_std, test_std))

# Regression targets, iterated in the same order as the feature rows.
y_train, y_val, y_test = (
    np.array([asteroid["period"] for asteroid in subset.values()])
    for subset in (train_set, val_set, test_set)
)

In [60]:
def objective(space):
    """Score one sampled hyperparameter set with 5-fold cross-validation on the training data.

    Args:
        space: One sample drawn from the hyperopt search space. ``"max_depth"`` and
            ``"n_estimators"`` arrive as floats, and ``"grow_policy"`` as a
            ``(policy, max_leaves)`` pair.

    Returns:
        Dict with ``"loss"`` (mean cross-validated MSE, lower is better) and ``"status"``.
    """
    # Work on a copy: rewriting the caller's dict would make a second call with the same
    # sample fail when unpacking "grow_policy".
    params = dict(space)

    # Quantised samples come back as floats; XGBoost needs real integers here.
    params["max_depth"] = int(params["max_depth"])
    params["n_estimators"] = int(params["n_estimators"])

    # Unpack the (policy, max_leaves) pair into the two separate estimator arguments.
    grow_policy, max_leaves = params["grow_policy"]
    params["grow_policy"] = grow_policy
    # max_leaves is only sampled for "lossguide"; 0 is passed for "depthwise".
    params["max_leaves"] = int(max_leaves) if grow_policy == "lossguide" else 0

    # NOTE(review): X_train is a host NumPy array while the booster runs on CUDA; the captured
    # output of this cell shows a device-mismatch hint from XGBoost — confirm this is intended.
    model = XGBRegressor(
        **params,
        n_jobs=-1,
        random_state=88,
        tree_method="hist",
        device="cuda",
    )

    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error")

    # The scorer returns negated MSE (higher is better); flip the sign to get a loss to minimise.
    mean_cv_score = -cv_scores.mean()

    return {"loss": mean_cv_score, "status": STATUS_OK}


# Record of every evaluated trial; kept so the search history can be inspected afterwards.
trials = Trials()

# TPE search over `space`, capped at 100 evaluations and stopped early once the best loss
# has not improved for 50 consecutive trials.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    early_stop_fn=no_progress_loss(iteration_stop_count=50, percent_increase=0),
)

print("Best hyperparameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





  0%|          | 0/100 [00:27<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
# Refit a model with the best hyperparameters found by the search and score it on the test split.
#
# `best` holds raw sampled values: quantised parameters come back as floats and the
# `hp.choice` parameter as the index of the selected option, so both are decoded here.
grow_policy = ("depthwise", "lossguide")[int(best["grow_policy"])]  # same order as in `space`
# "max_leaves" is only sampled on the "lossguide" branch and may be absent otherwise;
# 0 matches what `objective` passes for "depthwise".
max_leaves = int(best.get("max_leaves", 0))

model = XGBRegressor(
    max_depth=int(best["max_depth"]),
    gamma=best["gamma"],
    # reg_alpha and min_child_weight are continuous; truncating them to int would change the tuned values.
    reg_alpha=best["reg_alpha"],
    reg_lambda=best["reg_lambda"],
    colsample_bytree=best["colsample_bytree"],
    min_child_weight=best["min_child_weight"],
    n_estimators=int(best["n_estimators"]),
    eta=best["eta"],
    subsample=best["subsample"],
    grow_policy=grow_policy,
    max_leaves=max_leaves,
    # Same tree method and seed as the models scored during the search.
    tree_method="hist",
    random_state=88,
    n_jobs=-1,
)

# No eval_set / early stopping: n_estimators was tuned without it, and the test split must not
# influence training.
model.fit(X_train, y_train, verbose=False)

pred_y = model.predict(X_test)
mse = mean_squared_error(y_test, pred_y)

print(f"Mean Squared Error: {mse}\n")

for pred, true in zip(pred_y, y_test):
    print(f"Predicted: {pred:.2f}, True: {true:.2f}")