# Feature Selection

This notebook performs feature selection with two XGBoost-based strategies (importance ranking and sequential selection), evaluates a model on each resulting feature set, and saves the selected features for model training.

In [1]:
# Make the repository root importable so the `src.*` imports below resolve.
# The root is taken to be the parent of the current working directory.
import sys
from pathlib import Path

project_root = str(Path.cwd().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

print(f"Added {project_root} to Python path")


# Imports
import mlflow
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.config import ModelConfig, FeatureSelectionConfig, MLFlowConfig, PathConfig,XGBoostFeatureSelectionConfig
from src.data.data_loader import DataLoader
from src.features.feature_selector import FeatureSelector

Added /Users/gregor/Documents/Projects/avm_xgboost to Python path


## Initialize Configurations

In [2]:
# Build the configuration objects used by the rest of the notebook.
# Path configuration comes first, and setup_dirs() runs before anything else.
path_config = PathConfig()
path_config.setup_dirs()

# MLflow tracking settings for the "feature_selection" experiment.
mlflow_config = MLFlowConfig(
    tracking_uri="local",
    experiment_name="feature_selection",
)

# Model and feature-selection settings, constructed without arguments.
model_config = ModelConfig()
feature_config = FeatureSelectionConfig()

# Apply the MLflow settings once all configs exist.
mlflow_config.setup_tracking()

## Load and Examine Data

In [3]:
# Point the path config at the raw Excel dataset (relative path).
# NOTE(review): this overrides whatever PathConfig set for raw_data — confirm
# the relative location is valid from the directory the notebook runs in.
path_config.raw_data = '../data/raw/avm_modeling_dataset.xlsx'

In [5]:
# Load the raw dataset and split it into train and test sets.
# Columns listed here are handed to the loader as `exclude_from_scaling`.
exclude_cols = [
    'year',   # temporal feature, don't scale
    'month',  # temporal feature, don't scale
]

# One loader instance is sufficient for the single load_data() call below.
data_loader = DataLoader(model_config)
X_train, X_test, y_train, y_test, train_years = data_loader.load_data(
    path_config.raw_data,
    exclude_from_scaling=exclude_cols,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (47397, 23)
Test set shape: (11823, 23)


## Feature Selection Process

In [8]:
# XGBoost hyper-parameters handed to the feature selector.
xgb_selection_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 10,
    'learning_rate': 0.1,
    'n_estimators': 50,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'random_state': 42,
}

# Feature-selection config: n_features=7, with the XGBoost settings above
# wrapped in their own config object.
feature_config = FeatureSelectionConfig(
    n_features=7,
    xgboost_params=XGBoostFeatureSelectionConfig(params=xgb_selection_params),
)

# Run both selection strategies inside a single MLflow run.
with mlflow.start_run():
    feature_selector = FeatureSelector(feature_config)

    # Strategy 1: importance-based selection.
    selected_features_imp = feature_selector.select_by_importance(
        X_train, y_train, X_test
    )

    # Strategy 2: sequential selection.
    selected_features_seq = feature_selector.select_sequential(
        X_train, y_train, X_test
    )

In [15]:
from src.models.xgboost_analyzer import XGBoostAnalyzer

In [21]:
# Restrict the train/test frames to the importance-based feature subset.
X_train_selected, X_test_selected = (
    X_train[selected_features_imp],
    X_test[selected_features_imp],
)

# Train an XGBoostAnalyzer, configured by feature_config, on the reduced set.
xgb_model = XGBoostAnalyzer(feature_config)
xgb_model.train(X_train_selected, y_train)

# Report the test-set metrics, one per line with four decimals.
metrics = xgb_model.evaluate(X_test_selected, y_test, dataset_name="test")
print("\nTest Set Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Actual-vs-predicted plot for the test set.
xgb_model.plot_predictions(X_test_selected, y_test, dataset_name="test")




Test Set Metrics:
test_rmse: 706.5487
test_r2: 0.7103


In [22]:
# Restrict the train/test frames to the sequentially selected feature subset.
# These are the X_train_selected / X_test_selected frames that later cells save.
X_train_selected, X_test_selected = (
    X_train[selected_features_seq],
    X_test[selected_features_seq],
)

# Train an XGBoostAnalyzer, configured by feature_config, on the reduced set.
xgb_model = XGBoostAnalyzer(feature_config)
xgb_model.train(X_train_selected, y_train)

# Report the test-set metrics, one per line with four decimals.
metrics = xgb_model.evaluate(X_test_selected, y_test, dataset_name="test")
print("\nTest Set Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Actual-vs-predicted plot for the test set.
xgb_model.plot_predictions(X_test_selected, y_test, dataset_name="test")



Test Set Metrics:
test_rmse: 661.8015
test_r2: 0.7458


In [23]:
# Save processed data with selected features
joblib.dump({
    "X_train": X_train_selected,
    "X_test": X_test_selected,
    "y_train": y_train,
    "y_test": y_test,
    "train_years": train_years
}, path_config.interim_dir / "processed_data.joblib")

['data/interim/processed_data.joblib']

In [18]:
# Fetch the evaluation history from the object stored on xgb_model.model.
# NOTE(review): the recorded run of this cell raised AttributeError
# ("'XGBoostAnalyzer' object has no attribute 'evals_result'") — confirm which
# attribute exposes the fitted estimator before relying on this cell.
results = xgb_model.model.evals_result()
        


AttributeError: 'XGBoostAnalyzer' object has no attribute 'evals_result'

# Plot the RMSE series stored in the evaluation history.
# NOTE(review): the labels assume 'validation_0' is the training set and
# 'validation_1' the validation set — confirm against how the model was fitted.
# NOTE(review): no legend or show call follows, so the labels are not rendered.
results = xgb_model.model.evals_result()
        
plt.figure(figsize=(10, 6))
plt.plot(results['validation_0']['rmse'], label='train')
plt.plot(results['validation_1']['rmse'], label='validation')

In [13]:
# Display the feature names returned by feature_selector.select_by_importance().
selected_features_imp

['space_desc_lvl_1',
 'space_desc_lvl_2',
 'longitude',
 'latitude',
 'year',
 'atrium_area',
 'floor_count']

In [14]:
# Display the feature names returned by feature_selector.select_sequential().
selected_features_seq

['space_desc_lvl_2',
 'net_area',
 'age',
 'longitude',
 'latitude',
 'floor_count',
 'year']

In [9]:
# Fetch only the most recent run of the "feature_selection" experiment.
# Runs are sorted newest-first and max_results=1 keeps just the first one, so
# `latest_run` holds a single run rather than the experiment's whole history.
latest_run = mlflow.search_runs(
    experiment_names=["feature_selection"],
    order_by=["start_time DESC"],
    max_results=1,
)

## Analyze Selected Features

In [None]:
# Plot the correlation matrix of the selected features only.
# X_train_selected holds the columns chosen by the most recently run selection
# cell (the sequential selection when the notebook is run top to bottom), so
# the heatmap matches its title instead of covering every column of X_train.
plt.figure(figsize=(12, 8))
correlation_matrix = X_train_selected.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Selected Features')
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the training target grouped by training year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=train_years, y=y_train, ax=ax)
ax.set_title('Distribution of Price/m² by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Price/m²')
plt.xticks(rotation=45)  # tilt the year labels
plt.show()

## Save Selected Features and Processed Data

In [11]:
# Save selected features and transformed data
joblib.dump(selected_features, path_config.interim_dir / "selected_features.joblib")
joblib.dump({
    "X_train": X_train_selected,
    "X_test": X_test_selected,
    "y_train": y_train,
    "y_test": y_test,
    "train_years": train_years
}, path_config.interim_dir / "processed_data.joblib")

print("Saved processed data and selected features to:", path_config.interim_dir)

Saved processed data and selected features to: data/interim
