## 1. Pre-processing

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Data Pre-processing for Numerical Features
def numerical_pipeline(strategy='mean'):
    """
    Build the pre-processing pipeline applied to numerical columns.

    Parameters
    ----------
    strategy : str, default 'mean'
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    Pipeline
        Two steps, in order: ``'imputer'`` (a ``SimpleImputer``) and
        ``'scaler'`` (a ``StandardScaler``).
    """
    steps = [
        ('imputer', SimpleImputer(strategy=strategy)),
        ('scaler', StandardScaler()),
    ]
    return Pipeline(steps)

# Data Pre-processing for Categorical Features
def categorical_pipeline(strategy='most_frequent', encoding='onehot'):
    """
    Build the pre-processing pipeline applied to categorical columns.

    Parameters
    ----------
    strategy : str, default 'most_frequent'
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.
    encoding : str, default 'onehot'
        ``'onehot'`` appends a ``OneHotEncoder(handle_unknown='ignore')``.
        Any other value leaves the ``'encoder'`` step set to ``None``, i.e. no
        encoder object is created.

    Returns
    -------
    Pipeline
        Two steps, in order: ``'imputer'`` and ``'encoder'``.
    """
    imputer = SimpleImputer(strategy=strategy)
    if encoding == 'onehot':
        encoder = OneHotEncoder(handle_unknown='ignore')
    else:
        # Only one-hot encoding is implemented; everything else gets no encoder.
        encoder = None
    return Pipeline([('imputer', imputer), ('encoder', encoder)])

# Full Pre-processing Pipeline combining both Numerical and Categorical
def full_preprocessor(num_features, cat_features, num_strategy='mean', cat_strategy='most_frequent', cat_encoding='onehot'):
    """
    Combine the numerical and categorical pipelines into one ColumnTransformer.

    Parameters
    ----------
    num_features
        Column selector handed to the ``'num'`` branch.
    cat_features
        Column selector handed to the ``'cat'`` branch.
    num_strategy : str, default 'mean'
        Imputation strategy passed to ``numerical_pipeline``.
    cat_strategy : str, default 'most_frequent'
        Imputation strategy passed to ``categorical_pipeline``.
    cat_encoding : str, default 'onehot'
        Encoding choice passed to ``categorical_pipeline``.

    Returns
    -------
    ColumnTransformer
        Two transformers, named ``'num'`` and ``'cat'``, in that order.
    """
    numeric_branch = ('num', numerical_pipeline(num_strategy), num_features)
    categorical_branch = ('cat', categorical_pipeline(cat_strategy, cat_encoding), cat_features)
    return ColumnTransformer([numeric_branch, categorical_branch])

## 2. Metrics

In [None]:
# NOTE: scikit-learn has no `adjusted_r2` metric, so importing it raises
# ImportError. Adjusted R2 is derived by hand from `r2_score` in
# `regression_metrics` below.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Metric Functions for Regression including adjusted r2. 
def regression_metrics(y_true, y_pred, n_features):
    """
    Compute common regression metrics: MSE, RMSE, MAE, R2 and Adjusted R2.

    Parameters
    ----------
    y_true
        Ground-truth target values; ``len(y_true)`` is used as the sample count.
    y_pred
        Predicted target values.
    n_features : int
        Number of predictors, used only for the Adjusted R2 correction.

    Returns
    -------
    dict
        Keys ``'MSE'``, ``'RMSE'``, ``'MAE'``, ``'R2'`` and ``'Adjusted R2'``.
        ``'Adjusted R2'`` is ``nan`` when ``n_samples - n_features - 1 <= 0``,
        because the correction is undefined there.
    """
    n_samples = len(y_true)
    mse = mean_squared_error(y_true, y_pred)

    # RMSE is derived from the per-output MSE instead of `squared=False`, which
    # is deprecated/removed in recent scikit-learn releases. Taking the root per
    # output and then averaging matches what `squared=False` computed.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    rmse = (per_output_mse ** 0.5).mean()

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
    residual_dof = n_samples - n_features - 1
    if residual_dof > 0:
        adj_r2 = 1 - ((1 - r2) * (n_samples - 1) / residual_dof)
    else:
        # Too few samples for the number of features: avoid a ZeroDivisionError
        # (or a meaningless value from a negative denominator).
        adj_r2 = float('nan')

    return {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2, 'Adjusted R2': adj_r2}

# Metric Functions for Classification
def classification_metrics(y_true, y_pred, y_prob=None):
    """
    Compute common classification metrics: Accuracy, Precision, Recall, F1 and AUC.

    Precision, recall, F1 and AUC are support-weighted averages over classes.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels.
    y_prob : array-like, optional
        Predicted scores or probabilities used for AUC. For 1-D targets a
        two-column array (such as binary ``predict_proba`` output) is accepted;
        the second column is used as the positive-class score.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1'`` and
        ``'AUC'``. ``'AUC'`` is the string ``'NA'`` when ``y_prob`` is None.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='weighted')

    if y_prob is None:
        auc = 'NA'
    else:
        import numpy as np

        scores = np.asarray(y_prob)
        # For binary targets `roc_auc_score` expects a 1-D score of the class
        # with the greater label; passing both probability columns raises.
        if np.ndim(y_true) == 1 and scores.ndim == 2 and scores.shape[1] == 2:
            scores = scores[:, 1]
        auc = roc_auc_score(y_true, scores, multi_class='ovr', average='weighted')

    return {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1': f1, 'AUC': auc}


## 3. Training and Performance Evaluation

In [None]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
# Ensemble Block, ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
def ensemble_models(models: list, voting='soft', task='classification'):
    """
    Wrap the given base models in a voting ensemble.

    Parameters
    ----------
    models : list
        ``(name, estimator)`` pairs passed as ``estimators``.
    voting : str, default 'soft'
        Voting mode; only used for the classification ensemble.
    task : str, default 'classification'
        ``'classification'`` yields a ``VotingClassifier``; any other value
        yields a ``VotingRegressor``.

    Returns
    -------
    VotingClassifier or VotingRegressor
        The unfitted ensemble.
    """
    if task != 'classification':
        return VotingRegressor(estimators=models)
    return VotingClassifier(estimators=models, voting=voting)

In [None]:
# XGBoost search space (2 * 3 * 1 * 1 * 1 * 2 = 12 combinations)
param_grid_xgb = dict(
    learning_rate=[0.01, 0.05],   # step-size shrinkage candidates
    max_depth=[4, 5, 6],          # depth limit of each tree
    min_child_weight=[1],         # minimum summed instance weight in a child
    subsample=[1.0],              # row sampling ratio per tree
    colsample_bytree=[0.6],       # column sampling ratio per tree
    n_estimators=[100, 200],      # number of boosting rounds
    # Optional extra dimensions, currently disabled:
    #   gamma=[0.1, 0.2]            minimum loss reduction to split further
    #   scale_pos_weight=[1, 3]     balance of positive vs. negative weights
    #   reg_alpha=[0, 0.1]          L1 regularization on weights
    #   reg_lambda=[1, 2]           L2 regularization on weights
)

# LightGBM search space (every dimension below is part of the grid)
param_grid_lgb = dict(
    learning_rate=[0.01, 0.05],     # boosting learning rate
    num_leaves=[31, 63],            # leaf cap for each base learner
    max_depth=[4, 5, 6],            # depth cap for each base learner
    n_estimators=[100, 200],        # number of boosting rounds
    min_split_gain=[0.0, 0.1],      # minimum gain required to split
    min_child_weight=[1e-3, 1],     # minimum summed hessian in a leaf
    subsample=[1.0, 0.8],           # row sampling ratio
    colsample_bytree=[0.6, 1.0],    # column sampling ratio per tree
)
# CatBoost search space (2 * 3 * 2 = 12 combinations)
param_grid_cat = dict(
    learning_rate=[0.01, 0.05],   # boosting learning rate
    depth=[4, 6, 8],              # tree depth
    iterations=[100, 200],        # number of boosting rounds
    # Optional extra dimensions, currently disabled:
    #   l2_leaf_reg=[1, 3]          L2 regularization on weights
    #   border_count=[32, 64]       number of splits for numerical features
    #   bagging_temperature=[0, 1]  intensity of Bayesian bagging
)

In [None]:
# Model workflows without embedding the models inside a function

# Initialize models.
# Each variable name must match the library it holds: the tuning cell pairs
# `lgb_model` with `param_grid_lgb` and `cat_model` with `param_grid_cat`, and
# the ensemble labels them 'lgb' and 'cat'. The two were previously swapped.
xgb_model = XGBRegressor()       # Or XGBClassifier
lgb_model = LGBMRegressor()      # Or LGBMClassifier
cat_model = CatBoostRegressor()  # Or CatBoostClassifier


In [None]:
# Sample data
# `train_test_split` is imported here because none of the import cells above
# bring it into scope.
from sklearn.model_selection import train_test_split

# NOTE(review): DUMMY_DATA is not defined in the visible notebook; presumably a
# placeholder to be replaced with a real (X, y) pair — confirm before running.
X, y = DUMMY_DATA
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Hyperparameter tuning: one 5-fold GridSearchCV per model, each fitted on the
# training split right after it is constructed (`fit` returns the search object).
grid_search_xgb = GridSearchCV(
    estimator=xgb_model, param_grid=param_grid_xgb, cv=5
).fit(X_train, y_train)

grid_search_cat = GridSearchCV(
    estimator=cat_model, param_grid=param_grid_cat, cv=5
).fit(X_train, y_train)

grid_search_lgb = GridSearchCV(
    estimator=lgb_model, param_grid=param_grid_lgb, cv=5
).fit(X_train, y_train)

In [None]:
# Collect the winning parameter combination from each finished search
best_params_xgb, best_params_cat, best_params_lgb = (
    grid_search_xgb.best_params_,
    grid_search_cat.best_params_,
    grid_search_lgb.best_params_,
)

# Push the tuned parameters onto the corresponding model objects
for model, tuned in (
    (xgb_model, best_params_xgb),
    (cat_model, best_params_cat),
    (lgb_model, best_params_lgb),
):
    model.set_params(**tuned)

# Refit every tuned model on the entire training set
for model in (xgb_model, cat_model, lgb_model):
    model.fit(X_train, y_train)

# Assemble the regression ensemble from the three tuned models via `ensemble_models`
base_estimators = [('xgb', xgb_model), ('lgb', lgb_model), ('cat', cat_model)]
ensemble_model = ensemble_models(models=base_estimators, task='regression')

# Train the ensemble on the training split
ensemble_model.fit(X_train, y_train)


In [None]:
# Make predictions on the held-out test split
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lgb = lgb_model.predict(X_test)
y_pred_cat = cat_model.predict(X_test)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate each model with the shared regression metric helper.
# n_features comes from the training matrix's column count and feeds Adjusted R2.
n_features = X_train.shape[1]
metrics_xgb = regression_metrics(y_test, y_pred_xgb, n_features=n_features)
# Score LightGBM on its own predictions (previously scored with y_pred_cat).
metrics_lgb = regression_metrics(y_test, y_pred_lgb, n_features=n_features)
metrics_cat = regression_metrics(y_test, y_pred_cat, n_features=n_features)
metrics_ensemble = regression_metrics(y_test, y_pred_ensemble, n_features=n_features)

# Last expression of the cell: shows all four metric dicts as the cell output
metrics_xgb, metrics_lgb, metrics_cat, metrics_ensemble