In [1]:
!pip install xgboost



You should consider upgrading via the 'C:\Work\Data Science\House Prices - Adv Regression\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb

In [3]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder

def train_xgboost_regression(
    data: pd.DataFrame, 
    target: str, 
    n_estimators: list[int] = [100, 200, 300],
    max_depth: list[int] = [3, 5, 7],
    learning_rate: list[float] = [0.01, 0.1, 0.2],
    subsample: list[float] = [0.8, 1.0],
    colsample_bytree: list[float] = [0.8, 1.0],
    min_child_weight: list[int] = [1, 3, 5],
    gamma: list[float] = [0, 0.1, 0.2],
    num_transformer: str = 'robust',
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train an XGBoost regression model with hyperparameter tuning and advanced preprocessing.
    Supports various numerical transformations and handles categorical features.
    Properly handles ordinal features using ordinal encoding.
    
    Parameters:
    -----------
    data : pd.DataFrame
        Input dataset
    target : str
        Name of the target column
    n_estimators : list
        Number of boosting rounds to test
    max_depth : list
        Maximum tree depth to test
    learning_rate : list
        Step size shrinkage used to prevent overfitting
    subsample : list
        Subsample ratio of the training instances
    colsample_bytree : list
        Subsample ratio of columns when constructing each tree
    min_child_weight : list
        Minimum sum of instance weight needed in a child
    gamma : list
        Minimum loss reduction required to make a further partition
    num_transformer : str
        Numerical feature transformation method
        Options: 'robust', 'standard', 'log', 'log+robust', 'log+standard'
    test_size : float
        Proportion of the dataset to include in the test split
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    dict : A dictionary containing model, performance metrics, 
           best parameters, and feature importances
    """
    # Separate features and target
    X = data.drop(target, axis=1)
    
    # Define feature types
    ordinal_columns = [
        'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 
        'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ]
    
    # Keep only ordinal columns that actually exist in the dataset
    ordinal_columns = [col for col in ordinal_columns if col in X.columns]
    
    # Separate remaining categorical columns (nominal)
    nominal_columns = [col for col in X.select_dtypes(include=['object', 'category']).columns 
                       if col not in ordinal_columns]
    
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    
    # Define ordinal mappings
    ordinal_mappings = {
        # Quality features (Ex, Gd, TA, Fa, Po)
        'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'PoolQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        
        # Overall quality and condition (1-10)
        'OverallQual': {i: i for i in range(1, 11)},
        'OverallCond': {i: i for i in range(1, 11)},
        
        # Basement exposure
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        
        # Basement finish types
        'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
        'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
    }
    
    # Log transformer for skewed features
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    
    # Determine scaling and target transformation
    y = data[target]
    use_log_transform = num_transformer in ['log', 'log+robust', 'log+standard']
    
    # Create numerical preprocessing pipeline
    if num_transformer == 'robust': 
        num_pipeline = RobustScaler()
    elif num_transformer == 'standard': 
        num_pipeline = StandardScaler()
    elif num_transformer == 'log+robust':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', RobustScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log+standard':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log':
        num_pipeline = log_transformer
        y = np.log1p(y)
    else:
        num_pipeline = 'passthrough'

    # Create transformers for each ordinal feature
    ordinal_transformers = []
    for col in ordinal_columns:
        if col in ordinal_mappings:
            ordinal_transformers.append(
                (f'ord_{col}', 
                 OrdinalEncoder(
                     categories=[list(ordinal_mappings[col].keys())],
                     handle_unknown='use_encoded_value',
                     unknown_value=-1
                 ), 
                 [col])
            )

    # Create preprocessor with one-hot encoder for nominal categories
    # and ordinal encoder for ordinal categories
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_columns),
            ('num', num_pipeline, numerical_columns)
        ] + ordinal_transformers
    )

    param_grid = {
        'regressor__n_estimators': n_estimators,
        'regressor__max_depth': max_depth,
        'regressor__learning_rate': learning_rate,
        'regressor__subsample': subsample,
        'regressor__colsample_bytree': colsample_bytree,
        'regressor__min_child_weight': min_child_weight,
        'regressor__gamma': gamma
    }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=random_state
        ))
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    grid_search = GridSearchCV(
        model, 
        param_grid, 
        cv=5, 
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,  # Use all available processors
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)

    if use_log_transform:
        # Inverse transform predictions and actual values
        y_pred = np.expm1(y_pred)
        y_test = np.expm1(y_test)
        
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names from model
    feature_names = []
    
    # Add one-hot encoded nominal features
    if nominal_columns:
        feature_names.extend(
            best_model.named_steps['preprocessor']
            .named_transformers_['cat']
            .get_feature_names_out(nominal_columns)
        )
    
    # Add numerical features
    feature_names.extend(numerical_columns)
    
    # Add ordinal features
    feature_names.extend(ordinal_columns)
    
    # Extract feature importances
    importances = best_model.named_steps['regressor'].feature_importances_
    
    # Create a DataFrame for feature importances
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': importances,
        'type': ['Numerical' if col in numerical_columns else 
                 'Ordinal' if col in ordinal_columns else 
                 'Categorical' for col in feature_names]
    }).sort_values('importance', ascending=False)

    # Create a function for partial dependence plots
    def plot_partial_dependence(feature, num_points=50):
        """
        Create data for partial dependence plots for a given feature
        """
        # Not implemented in this version
        pass

    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': feature_importances,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

In [4]:
# n_estimators: list[int] = [100, 200, 300],
#     max_depth: list[int] = [3, 5, 7],
#     learning_rate: list[float] = [0.01, 0.1, 0.2],
#     subsample: list[float] = [0.8, 1.0],
#     colsample_bytree: list[float] = [0.8, 1.0],
#     min_child_weight: list[int] = [1, 3, 5],
#     gamma: list[float] = [0, 0.1, 0.2],

In [None]:
df = pd.read_csv('data/train_rm_OL.csv') # in here outliers are removed
# df = pd.read_csv('data/train_cleaned.csv')

# Train model with hyperparameter tuning
results = train_xgboost_regression(df, 'SalePrice',
                                   n_estimators=[100, 200, 300],
                                   max_depth=[3, 5],
                                   learning_rate=[0.01, 0.1, 1],
                                   subsample=[0.5, 0.8, 1.0],
                                   colsample_bytree=[0.5, 0.8, 1.0],
                                   min_child_weight=[1, 3, 5],
                                   gamma=[0, 0.1, 0.2],
                                   num_transformer='log+robust',)


Fitting 5 folds for each of 1458 candidates, totalling 7290 fits


In [7]:
print(results['performance'], results['best_params'])

{'root_mean_squared_error': 17973.93913516368, 'r2_score': 0.9465093357529916} {'regressor__colsample_bytree': 0.8, 'regressor__gamma': 0, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__min_child_weight': 3, 'regressor__n_estimators': 400, 'regressor__subsample': 0.5}
