In [None]:
# Make the repository importable from this notebook by putting the parent of the
# current working directory on sys.path (the `src.*` imports below rely on it).
import sys
from pathlib import Path

# The kernel's working directory is taken as the notebook folder; its parent is
# treated as the project root.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Note: this message is printed on every run, even when the path was already present.
print(f"Added {project_root} to Python path")


# Imports
import mlflow
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.config import ModelConfig, FeatureSelectionConfig, MLFlowConfig, PathConfig,XGBoostFeatureSelectionConfig, OptunaConfig
from src.models.optimizer import OptunaOptimizer
from src.models.xgboost_analyzer import XGBoostAnalyzer
import xgboost as xgb

Added /Users/gregor/Documents/Projects/avm_xgboost to Python path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the path configuration and run its directory setup before any cell reads
# from path_config (setup_dirs is defined in src.config; presumably it creates the
# project's working directories — TODO confirm).
path_config = PathConfig()
path_config.setup_dirs()

# Configuration objects for MLflow tracking, the model, and the Optuna search.
# NOTE(review): model_config is not referenced by any cell visible in this notebook —
# confirm it is still needed.
mlflow_config = MLFlowConfig(tracking_uri="local", experiment_name="model_optimization")
model_config = ModelConfig()
optuna_config = OptunaConfig()

# Setup MLflow tracking
mlflow_config.setup_tracking()

In [3]:

# Read the processed-data bundle from the interim directory.
# NOTE: joblib.load unpickles the file, so only load artifacts produced by this project.
processed_data = joblib.load(path_config.interim_dir / "processed_data.joblib")
# Loading a separate feature-selection result is currently disabled:
# selected_features = joblib.load(path_config.interim_dir / "selected_features.joblib")

# Unpack the bundle into notebook-level names used by the cells below.
X_train, y_train = processed_data["X_train"], processed_data["y_train"]
X_test, y_test = processed_data["X_test"], processed_data["y_test"]
train_years = processed_data["train_years"]


In [4]:
# Build the Optuna optimizer on the training split.
# No separate selected-feature list is loaded in this notebook, so every column of
# X_train is handed over as the feature set.
optimizer = OptunaOptimizer(
    optuna_config,
    X_train,
    y_train,
    train_years,
    selected_features=X_train.columns,
)

In [5]:
# Display the configuration object held by the optimizer as a sanity check before the search.
optimizer.config

OptunaConfig(n_trials=100, timeout=1800, study_name='xgboost_optimization', random_state=42)

In [6]:
# Run optimization
# The search is executed inside an MLflow run context so that anything logged during
# optimize() is attached to that run; the context manager ends the run on exit.
# NOTE(review): the recorded log shows the study being created as "no-name-…" while
# optimizer.config reports study_name='xgboost_optimization' — check that
# OptunaOptimizer forwards study_name (and the seed) to Optuna.
with mlflow.start_run():
    best_params = optimizer.optimize()
    print("\nBest parameters:", best_params)

[I 2025-02-16 22:30:51,557] A new study created in memory with name: no-name-f9457711-e85c-40ac-9b03-dd4cb4784913
[I 2025-02-16 22:30:53,286] Trial 0 finished with value: 701.8364434823491 and parameters: {'max_depth': 9, 'learning_rate': 0.47416099566677206, 'n_estimators': 210, 'min_child_weight': 9, 'subsample': 0.959565529445799, 'colsample_bytree': 0.8401371351894575, 'gamma': 1.2738636185576213, 'reg_alpha': 4.9511276147604155, 'reg_lambda': 4.692908566189995}. Best is trial 0 with value: 701.8364434823491.
[I 2025-02-16 22:30:54,301] Trial 1 finished with value: 693.0471834537739 and parameters: {'max_depth': 3, 'learning_rate': 0.4876587424903872, 'n_estimators': 498, 'min_child_weight': 7, 'subsample': 0.7151463605592163, 'colsample_bytree': 0.8980595331168236, 'gamma': 2.6071093006665254, 'reg_alpha': 3.1989091444645497, 'reg_lambda': 4.396311412498139}. Best is trial 1 with value: 693.0471834537739.
[I 2025-02-16 22:30:55,317] Trial 2 finished with value: 714.2288485561305 a


Best parameters: {'max_depth': 10, 'learning_rate': 0.04362057322423887, 'n_estimators': 347, 'min_child_weight': 15, 'subsample': 0.6522133845117385, 'colsample_bytree': 0.9900094274295523, 'gamma': 1.9287966647376933, 'reg_alpha': 1.0772817412118416, 'reg_lambda': 4.893832846272109}


In [8]:
# NOTE(review): redundant — XGBoostAnalyzer is already imported in the first cell;
# this line can be dropped once the notebook is cleaned up.
from src.models.xgboost_analyzer import XGBoostAnalyzer

In [9]:
 #Initialize and train XGBoost model with best parameters
# Build the analyzer from the tuned hyperparameters. config=None is passed together
# with explicit params; according to the original author's note this overrides the
# analyzer's default parameters (behaviour lives in src.models.xgboost_analyzer).
xgb_model = XGBoostAnalyzer(config=None, params=best_params)

# Fit on the training split, using all columns of X_train.
xgb_model.train(X_train, y_train)

# Score the held-out test split; the result is iterated as a name -> value mapping.
metrics = xgb_model.evaluate(X_test, y_test, dataset_name="test")

# Report each metric to four decimal places.
print("\nTest Set Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Plot feature importance
#xgb_model.plot_feature_importance()



Test Set Metrics:
test_rmse: 652.9936
test_r2: 0.7525


In [14]:
# Call the analyzer's prediction plot for the held-out test split.
xgb_model.plot_predictions(X_test, y_test, dataset_name="test")