#### More about libraries used in this notebook:

cv-score-predict: 
- [https://www.kaggle.com/discussions/general/666850](https://www.kaggle.com/discussions/general/666850)
- [https://pypi.org/project/cv-score-predict/](https://pypi.org/project/cv-score-predict/)

category-embedding:
 - [https://pypi.org/project/category-embedding/](https://pypi.org/project/category-embedding/)

In [None]:
pip install cv-score-predict category-embedding

In [None]:
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from cv_score_predict import cv_score_predict
from category_embedding import CategoryEmbedding

# Choose the training device: 'gpu' when at least one GPU is reported, otherwise 'cpu'
from catboost.utils import get_gpu_device_count
device = 'cpu' if get_gpu_device_count() <= 0 else 'gpu'
device

In [None]:
# Competition input directory
main_dir = Path('/kaggle/input/house-prices-advanced-regression-techniques')

# Training features, without the 'Id' column
X = pd.read_csv(main_dir / 'train.csv')
X = X.drop(columns='Id')

# Take the target out of the feature frame and model it on the log1p scale
y = np.log1p(X.pop('SalePrice'))

# Test features get the same 'Id' drop; the sample submission is loaded as-is
X_test = pd.read_csv(main_dir / 'test.csv')
X_test = X_test.drop(columns='Id')
submit = pd.read_csv(main_dir / 'sample_submission.csv')

#### Select features

In [None]:
def columns_types(X):
    '''Classify the columns of X by dtype and cardinality.

    Object/category columns are always categorical. Any other column with at
    most 10 distinct values is treated as categorical as well. Every
    categorical column is given an encoding size from its number of distinct
    values: 1 for up to 2, 2 for 3 to 10, and 10 for more than 10.

    Returns a tuple (num_cols, cat_cols, enc_dims_list); enc_dims_list is
    aligned position-by-position with cat_cols.
    '''
    # First split purely by dtype
    text_cols = X.select_dtypes(include=['object', 'category']).columns.to_list()
    other_cols = [col for col in X.columns if col not in text_cols]

    # Number of distinct values in each column
    text_counts = X[text_cols].nunique()
    other_counts = X[other_cols].nunique()

    def _names(counts, mask):
        '''Names of the columns selected by a boolean mask, in frame order'''
        return list(counts.index[mask])

    # Low cardinality (<= 2): encoded into 1 dimension
    low_other = _names(other_counts, other_counts.le(2))  # treated as categorical
    low_card = _names(text_counts, text_counts.le(2)) + low_other

    # Medium cardinality (3..10): encoded into 2 dimensions
    med_other = _names(other_counts, other_counts.gt(2) & other_counts.le(10))
    med_card = _names(text_counts, text_counts.gt(2) & text_counts.le(10)) + med_other

    # High cardinality (> 10): object/category columns only, 10 dimensions
    high_card = _names(text_counts, text_counts.gt(10))

    # Final grouping: low/medium-cardinality non-text columns move to categorical
    cat_cols = low_card + med_card + high_card
    reclassified = set(low_other + med_other)
    num_cols = [col for col in other_cols if col not in reclassified]

    # cat_cols is laid out group by group, so the dims list can be too
    enc_dims_list = [1] * len(low_card) + [2] * len(med_card) + [10] * len(high_card)

    # Sanity check: every column is classified and has an encoding size
    assert len(X.columns) == len(cat_cols) + len(num_cols)
    assert len(cat_cols) == len(enc_dims_list)

    return num_cols, cat_cols, enc_dims_list
    
def feature_selector(X, y):
    '''Run forward sequential feature selection with LightGBM and plot importances.

    Categorical columns (as detected by columns_types) are ordinal-encoded and
    cast to pandas 'category' on a copy of X. A gain-based importance plot of
    a model fitted on all features is drawn as a side effect.

    Returns the list of selected column names.
    '''
    # Work on a copy so the caller's frame is left untouched
    X = X.copy()

    # Only the categorical column names are needed here
    cat_cols = columns_types(X)[1]

    # Integer codes for categories; unknown and missing values both map to -1
    encoder = OrdinalEncoder(
        dtype=int,
        handle_unknown='use_encoded_value',
        unknown_value=-1,
        encoded_missing_value=-1,
    )
    encoder = encoder.set_output(transform='pandas')
    encoded = encoder.fit_transform(X[cat_cols])
    X[cat_cols] = encoded.astype('category')

    # Estimator shared by the selector and the importance plot
    model = lgb.LGBMRegressor(verbosity=-1)

    # Forward selection: stop adding features once the gain drops below tol
    selector = SequentialFeatureSelector(
        model,
        n_features_to_select='auto',
        tol=0.001, # average per feature is ~0.0015 (rmse with all features is ~0.125)
        direction='forward',
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    selector.fit(X, y)

    # Importance plot from a model fitted on the full feature set
    model.fit(X, y)
    _, ax = plt.subplots(figsize=(8, 14))
    lgb.plot_importance(
        model,
        importance_type='gain',
        max_num_features=None,
        height=0.5,
        grid=False,
        precision=0,
        title='LGBM feature importance',
        ax=ax,
    )

    selected_idx = selector.get_support(indices=True)
    return X.columns[selected_idx].to_list()

In [None]:
# Toggle: re-run the feature selection, or reuse the list found previously
select_feats = False

if not select_feats:
    # Previously selected
    best_features = [
        'MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
        'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF', 'CentralAir', 'GrLivArea',
        'Fireplaces', 'GarageCars', 'ScreenPorch',
    ]
else:
    best_features = feature_selector(X, y)

# Restrict train and test to the selected columns, then classify them
X, X_test = X[best_features], X_test[best_features]
num_cols, cat_cols, enc_dims_list = columns_types(X)

# One list per line: numeric columns, categorical columns, encoding sizes
print(num_cols, cat_cols, enc_dims_list, sep='\n')

#### Encoder tuning

In [None]:
def enc_objective(
    trial, X_train, y_train, X_val, y_val, cat_cols=None, num_cols=None,
    random_state=42, embedding_dims=None,
):
    '''Optuna objective for CategoryEmbedding tuning.

    Samples model-capacity, regularization and optimization hyperparameters
    from the trial, fits a CategoryEmbedding regressor on the training split
    and scores it on the validation split.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target.
    cat_cols, num_cols : categorical / numeric column names (default: empty).
    random_state : accepted for call compatibility; not used by this function.
    embedding_dims : per-categorical-column embedding sizes, aligned with
        cat_cols. When None, the notebook-level `enc_dims_list` is used.

    Returns
    -------
    RMSE of the validation predictions against y_val.
    '''
    # None sentinels instead of shared mutable list defaults
    cat_cols = [] if cat_cols is None else list(cat_cols)
    num_cols = [] if num_cols is None else list(num_cols)
    if embedding_dims is None:
        # Fall back to the global computed by columns_types() in the notebook
        embedding_dims = enc_dims_list

    # Model capacity
    hidden_units = trial.suggest_categorical('hidden_units', [32, 64, 128, 256])
    n_blocks = trial.suggest_int('n_blocks', 1, 3)

    # Regularization
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    l2_emb = trial.suggest_float('l2_emb', 1e-7, 1e-5, log=True)
    l2_dense = trial.suggest_float('l2_dense', 1e-7, 1e-5, log=True)

    # Optimization
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    # Build model with these params
    model = CategoryEmbedding(
        task='regression',
        log_target=False, # Target is already transformed
        categorical_cols=cat_cols,
        numeric_cols=num_cols,
        embedding_dims=embedding_dims,
        hidden_units=hidden_units,
        n_blocks=n_blocks,
        dropout_rate=dropout_rate,
        l2_emb=l2_emb,
        l2_dense=l2_dense,
        lr=lr,
        batch_size=batch_size,
        epochs=100,
        verbose=0,
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    return np.sqrt(mean_squared_error(y_val, preds))

In [None]:
%%time

# Tune categorical encoder
tune_encoder = False

if tune_encoder:
    # Hold-out split, seeded so that trial scores are comparable between runs
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

    # Explicit copies: the imputed columns below are written to these frames only
    X_train, X_val = X_train.copy(), X_val.copy()

    # Impute missing values (imputers are fitted on the training split only)
    cat_imp = SimpleImputer(strategy='constant', fill_value='##missing##')
    num_imp = SimpleImputer(strategy='median')

    X_train[cat_cols] = cat_imp.fit_transform(X_train[cat_cols])
    X_train[num_cols] = num_imp.fit_transform(X_train[num_cols])
    X_val[cat_cols] = cat_imp.transform(X_val[cat_cols])
    X_val[num_cols] = num_imp.transform(X_val[num_cols])

    def objective(trial):
        '''Bind the split data and column lists to enc_objective for Optuna'''
        return enc_objective(
            trial, X_train, y_train, X_val, y_val,
            cat_cols=cat_cols, num_cols=num_cols, random_state=42,
        )
    sampler = optuna.samplers.TPESampler(multivariate=True, seed=42)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=50)

    # Show best results
    completed_trials = [t for t in study.trials
                        if t.state == optuna.trial.TrialState.COMPLETE]
    print(f'Number of completed trials: {len(completed_trials)}')
    print(f'Best score: {study.best_trial.value:.3f}')
    print('Best params:')
    for k, v in study.best_trial.params.items():
        print(f"'{k}': {v},")

#### Category encoding

In [None]:
%%time

# Best parameters for the encoder resulted from optuna tuning
enc_params = { # Custom embedding dims - rmse 0.152
    'hidden_units': 128,
    'n_blocks': 2,
    'dropout_rate': 0.0013647988526280241,
    'l2_emb': 2.9022738929868166e-06,
    'l2_dense': 1.1505737606096325e-07,
    'lr': 0.009647280154032394,
    'batch_size': 64,
}
# Encode categorical
enc = CategoryEmbedding(
    task='regression',
    log_target=False, # Target is already log transformed
    categorical_cols=cat_cols,
    numeric_cols=num_cols,
    embedding_dims=enc_dims_list,
    epochs=100,
    scaled_num_out=True,   # return scaled numeric features
    verbose=0,
    **enc_params,
)
# Impute missing values
cat_imp = SimpleImputer(strategy='constant', fill_value='##missing##')
# NOTE(review): the encoder-tuning cell imputes numerics with 'median' — confirm which is intended
num_imp = SimpleImputer(strategy='mean')

# The imputed frames keep their own names so that both transform() and
# predict() can be fed the same imputed inputs
X_imp = X.copy()
X_imp[cat_cols] = cat_imp.fit_transform(X_imp[cat_cols])
X_imp[num_cols] = num_imp.fit_transform(X_imp[num_cols])
X_enc = enc.fit_transform(X_imp, y)

X_test_imp = X_test.copy()
X_test_imp[cat_cols] = cat_imp.transform(X_test_imp[cat_cols])
X_test_imp[num_cols] = num_imp.transform(X_test_imp[num_cols])
X_test_enc = enc.transform(X_test_imp)

# Predict using encoder model to use it as meta-feature.
# Uses the imputed test frame: the raw X_test has not been imputed, unlike
# the data the encoder was fitted on.
# NOTE(review): enc_preds is not referenced again in the visible cells.
enc_preds = enc.predict(X_test_imp)

X_enc.info()

## Model tuning

In [None]:
def model_objective(trial, X, y, model_type='lgb', processor=None, process_categorical=True,
                    n_splits=3, device='cpu', random_state=42):
    '''
    Objective function for GBMs models hyperparameter tuning

    Samples a hyperparameter set for the requested model from the trial,
    cross-validates it with cv_score_predict and scores the out-of-fold
    predictions.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    X, y : features and target passed to cv_score_predict.
    model_type : one of 'lgb', 'xgb' or 'cb'.
    processor : forwarded to cv_score_predict.
    process_categorical : forwarded to cv_score_predict.
    n_splits : number of CV splits.
    device : 'gpu' or 'cpu'; mapped to each library's own device setting.
    random_state : forwarded to cv_score_predict.

    Returns
    -------
    RMSE between y and the row-wise mean of the out-of-fold predictions.

    Raises
    ------
    ValueError : if model_type is not one of the supported names.
    '''
    # Define hyperparameter search space
    if model_type == 'lgb':
        tune_params = {
            'lgb': {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 8, 256),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 200),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
                'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),

                # Fixed settings; the high tree count leaves room for early stopping
                'n_estimators': 5000,
                'metric': 'rmse',
                'objective': 'regression',
                'device': device,
                'verbosity': -1,
                'n_jobs': -1,
                }}
    elif model_type == 'xgb':
        tune_params = {
            'xgb': {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 20),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),

                'n_estimators': 5000,
                'eval_metric': 'rmse',
                'objective': 'reg:squarederror',
                'device': 'cuda' if device == 'gpu' else 'cpu',
                'verbosity': 0,
                'n_jobs': -1,
                }}
    elif model_type == 'cb':
        tune_params = {
            'cb': {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
                'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'rsm': trial.suggest_float('rsm', 0.5, 1.0),

                'iterations': 5000,
                'objective': 'RMSE',
                'eval_metric': 'RMSE',
                'task_type': 'GPU' if device == 'gpu' else 'CPU',
                'border_count': 128 if device == 'gpu' else 254,
                'verbose': 0,
                'thread_count': -1,
            }}
    else:
        # Fail with a clear message instead of an unbound tune_params further down
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'lgb', 'xgb' or 'cb'"
        )
    # Train and predict
    oof_preds_df, _, _ = cv_score_predict(
        X=X,
        y=y,
        X_test=None,
        pred_type='regression',
        processor=processor,
        process_categorical=process_categorical,  # honor the caller's choice
        models=model_type,
        params_dict=tune_params,
        random_state=random_state,
        n_splits=n_splits,
        early_stopping_rounds=50,
        verbose=0,
    )
    return np.sqrt(mean_squared_error(y, oof_preds_df.mean(axis=1)))

In [None]:
%%time

# Which model to tune: 'lgb', 'xgb', 'cb', or None to skip tuning
tune_model = None

if tune_model is not None:
    def objective(trial):
        '''Bind the encoded data and notebook settings to model_objective for Optuna'''
        return model_objective(
            trial,
            X_enc,
            y,
            model_type=tune_model,
            processor=None,
            process_categorical=True,
            n_splits=5,
            device=device,
            random_state=42,
        )

    # Seeded multivariate TPE search, minimizing the objective's RMSE
    sampler = optuna.samplers.TPESampler(multivariate=True, seed=42)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=50, timeout=12 * 60 * 60)  # 12 hours timeout

    # Report the outcome of the search
    completed_trials = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
    ]
    print(f'Number of completed trials: {len(completed_trials)}')
    print(f'Best score: {study.best_trial.value:.3f}')
    print('Best params:')
    for k, v in study.best_trial.params.items():
        print(f"'{k}': {v},")
    

## Train models and cross-validate

In [None]:
# Custom parameters with high iterations number to allow space for early stopping.
# The rmse figures in the comments are the scores recorded by the author.
on_gpu = device == 'gpu'

# LightGBM: Encoded categoricals - rmse 0.127
lgb_params = {
    'learning_rate': 0.010963839052288087,
    'num_leaves': 175,
    'min_data_in_leaf': 12,
    'reg_alpha': 0.00012761285111661527,
    'reg_lambda': 4.682367483456758e-08,
    'feature_fraction': 0.4404415618024636,
    'subsample': 0.6000289421974677,
    'subsample_freq': 2,

    'n_estimators': 5000,
    'metric': 'rmse',
    'objective': 'regression',
    'device': device,
    'verbosity': -1,
    'n_jobs': -1,
}

# XGBoost: Encoded categoricals - rmse 0.126
xgb_params = {
    'learning_rate': 0.01779816671245614,
    'max_depth': 4,
    'min_child_weight': 20,
    'gamma': 0.00044639940791729504,
    'subsample': 0.6027616353743741,
    'colsample_bytree': 0.595895749281706,
    'reg_lambda': 0.2443730806255085,
    'reg_alpha': 5.8430092166515796e-08,

    'n_estimators': 5000,
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'device': 'cuda' if on_gpu else 'cpu',
    'verbosity': 0,
    'n_jobs': -1,
}

# CatBoost: Encoded categoricals - rmse 0.125
cb_params = {
    'learning_rate': 0.03585432324860276,
    'depth': 5,
    'l2_leaf_reg': 0.0014782377549724024,
    'random_strength': 1.1584710121736592e-05,
    'bagging_temperature': 0.15955650079740968,
    'subsample': 0.5189339778585789,
    'rsm': 0.95732561692537,

    'iterations': 5000,
    'objective': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU' if on_gpu else 'CPU',
    'border_count': 128 if on_gpu else 254,
    'verbose': 0,
    'thread_count': -1,
}

params_dict = {'lgb': lgb_params, 'xgb': xgb_params, 'cb': cb_params}

# Run Multi-Seed, Multi-Model CV on the encoded features
cv_outputs = cv_score_predict(
    X=X_enc,
    y=y,
    X_test=X_test_enc,
    pred_type='regression',
    process_categorical=False,
    models=['lgb', 'xgb', 'cb'],
    params_dict=params_dict,
    random_state=[42, 123, 999],     # Repeat CV for stability
    n_splits=5,
    early_stopping_rounds=50,
    return_trained=False,
    verbose=2,
)
oof_preds_df, test_preds_df, trained_pipelines = cv_outputs

print()
test_preds_df.info()

#### Train a meta-model

In [None]:
# Fit a linear meta-model on the out-of-fold predictions
from sklearn.linear_model import LinearRegression
meta_model = LinearRegression()
meta_model.fit(oof_preds_df, y)

# Build test-time meta-features with the same columns as the OOF frame:
# for an OOF column such as 'lgb_seed_42', average every test column whose
# name starts with that column name followed by '_fold_'
test_meta_features = pd.DataFrame(
    {
        col: test_preds_df[
            [c for c in test_preds_df.columns if c.startswith(col + '_fold_')]
        ].mean(axis=1)
        for col in oof_preds_df.columns
    },
    index=test_preds_df.index,
)

# Stacked prediction, mapped back from the log1p scale with expm1
stacked_log_preds = meta_model.predict(test_meta_features)
submit.SalePrice = np.expm1(stacked_log_preds)
submit.to_csv('submission.csv', index=False)
submit.head()