In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder

def train_xgboost_regression(
    data: pd.DataFrame, 
    target: str,
    param_grid: dict = None,
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train an XGBoost regression model with hyperparameter tuning and advanced preprocessing.
    
    The grid search will tune both the XGBoost hyperparameters and the numeric transformation
    option (applied in the 'preprocessor__num' step). The numeric transformer options include:
      - 'log': applies np.log1p,
      - 'yeo-johnson': applies a clipper then a PowerTransformer,
      - 'none': no transformation (passthrough).
      
    The target is log-transformed before training (and inverted on predictions).

    Parameters
    ----------
    data : pd.DataFrame
        Input dataset.
    target : str
        Name of the target column.
    param_grid : dict, optional
        Parameter grid for GridSearchCV. If None, a default grid will be used.
    test_size : float
        Fraction of data used for testing.
    random_state : int
        Random seed for reproducibility.
    
    Returns
    -------
    dict
        A dictionary containing:
          - 'model': best estimator,
          - 'performance': RMSE and R2 on test set,
          - 'best_params': best found parameters,
          - 'feature_importances': DataFrame with feature importances,
          - 'train_data': training split,
          - 'test_data': test split.
    """
    # --- Define numeric transformation pipelines ---
    def create_numeric_pipeline(choice):
        steps = []
        # Placeholder for potential polynomial features
        steps.append(('poly', 'passthrough'))
        if choice == 'log':
            steps.append(('log', FunctionTransformer(np.log1p, validate=True)))
            steps.append(('scaler', 'passthrough'))
        elif choice == 'yeo-johnson':
            steps.append(('scaler', 'passthrough'))
            steps.append(('clipper', FunctionTransformer(lambda X: np.clip(X, -1e2, 1e2), validate=False)))
            steps.append(('power', PowerTransformer(method='yeo-johnson', standardize=False)))
        else:
            steps.append(('scaler', 'passthrough'))
        return Pipeline(steps)
    
    numeric_pipeline_options = {
        'log': create_numeric_pipeline('log'),
        'yeo-johnson': create_numeric_pipeline('yeo-johnson'),
        'none': create_numeric_pipeline(None)
    }
    
    # --- Define feature groups ---
    X = data.drop(target, axis=1)
    
    ordinal_columns = [
        'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 
        'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ]
    ordinal_columns = [col for col in ordinal_columns if col in X.columns]
    
    nominal_columns = [
        col for col in X.select_dtypes(include=['object', 'category']).columns 
        if col not in ordinal_columns
    ]
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
    # --- Define ordinal mappings and transformers ---
    ordinal_mappings = {
        'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'PoolQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'OverallQual': {i: i for i in range(1, 11)},
        'OverallCond': {i: i for i in range(1, 11)},
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
        'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
    }
    
    ordinal_transformers = []
    for col in ordinal_columns:
        if col in ordinal_mappings:
            ordinal_transformers.append(
                (f'ord_{col}', 
                 OrdinalEncoder(
                     categories=[list(ordinal_mappings[col].keys())],
                     handle_unknown='use_encoded_value',
                     unknown_value=-1
                 ), 
                 [col])
            )
    
    # --- Build the preprocessor ---
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_columns),
            # Default numeric transformer; grid search will override this option.
            ('num', numeric_pipeline_options['none'], numerical_columns)
        ] + ordinal_transformers
    )
    
    # --- Target transformation ---
    y = np.log1p(data[target])
    
    # --- Pre-process the external param_grid ---
    # If the grid includes 'preprocessor__num' as string options, convert them.
    if param_grid is not None and 'preprocessor__num' in param_grid:
        new_values = []
        for val in param_grid['preprocessor__num']:
            if isinstance(val, str):
                if val == 'passthrough':
                    new_values.append(numeric_pipeline_options['none'])
                else:
                    new_values.append(numeric_pipeline_options[val])
            else:
                new_values.append(val)
        param_grid['preprocessor__num'] = new_values

    # --- Define default parameter grid if none is provided ---
    if param_grid is None:
        param_grid = {
            'regressor__n_estimators': [100, 200, 300],
            'regressor__max_depth': [3, 5, 7],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__subsample': [0.8, 1.0],
            'regressor__colsample_bytree': [0.8, 1.0],
            'regressor__min_child_weight': [1, 3, 5],
            'regressor__gamma': [0, 0.1, 0.2],
            'preprocessor__num': [
                numeric_pipeline_options['log'], 
                numeric_pipeline_options['yeo-johnson'], 
                numeric_pipeline_options['none']
            ]
        }
    
    # --- Build the overall pipeline ---
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=random_state
        ))
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)
    
    # Invert the log1p transformation on predictions and true values
    y_pred = np.expm1(y_pred)
    y_test = np.expm1(y_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # --- Extract feature names and importances ---
    feature_names = []
    if nominal_columns:
        feature_names.extend(
            best_model.named_steps['preprocessor']
            .named_transformers_['cat']
            .get_feature_names_out(nominal_columns)
        )
    feature_names.extend(numerical_columns)
    feature_names.extend(ordinal_columns)
    
    importances = best_model.named_steps['regressor'].feature_importances_
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': importances,
        'type': [
            'Categorical' if col in nominal_columns 
            else 'Numerical' if col in numerical_columns 
            else 'Ordinal'
            for col in feature_names
        ]
    }).sort_values('importance', ascending=False)
    
    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': feature_importances,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }


In [2]:
df = pd.read_csv('data/train_rm_OL.csv') # in here outliers are removed
# df = pd.read_csv('data/train_cleaned.csv')
param_grid = {
    'regressor__n_estimators': [800, 1000, 1200],
    'regressor__max_depth': [1, 2, 3, 4],
    'regressor__learning_rate': [0.01, 0.02, 0.05, 0.1],
    'regressor__subsample': [0.9, 1.0],
    'regressor__colsample_bytree': [0.9, 1.0],
    'regressor__min_child_weight': [1, 2, 3, 4],
    'regressor__gamma': [0, 0.01, 0.1],
    'preprocessor__num': ['log']
}


results = train_xgboost_regression(df, 'SalePrice',
                                      param_grid=param_grid,
                                      test_size=0.2,
                                      random_state=42
                                      #    num_transformer=['robust'],
                                      )
print("Best Model Performance:")
for metric, value in results['performance'].items():
    print(f"{metric}: {value}")
    
for parameter, value in results['best_params'].items():
    print(f"{parameter}: {value}")

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best Model Performance:
root_mean_squared_error: 19562.61395750251
r2_score: 0.9366356145525816
preprocessor__num: Pipeline(steps=[('poly', 'passthrough'),
                ('log',
                 FunctionTransformer(func=<ufunc 'log1p'>, validate=True)),
                ('scaler', 'passthrough')])
regressor__colsample_bytree: 0.9
regressor__gamma: 0
regressor__learning_rate: 0.1
regressor__max_depth: 1
regressor__min_child_weight: 4
regressor__n_estimators: 1000
regressor__subsample: 0.9


Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best Model Performance:
root_mean_squared_error: 19562.61395750251
r2_score: 0.9366356145525816
preprocessor__num: Pipeline(steps=[('poly', 'passthrough'),
                ('log',
                 FunctionTransformer(func=<ufunc 'log1p'>, validate=True)),
                ('scaler', 'passthrough')])
regressor__colsample_bytree: 0.9
regressor__gamma: 0
regressor__learning_rate: 0.1
regressor__max_depth: 1
regressor__min_child_weight: 4
regressor__n_estimators: 1000
regressor__subsample: 0.9