In [1]:
# Running counter of experiment batches; incremented each time the model-comparison cell is run.
exp_index = 0
# Registry of results keyed by '<model_name>_exp<exp_index>'; filled by the model-comparison cell.
# Re-running this cell wipes the registry.
experiments = {}

In [27]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import optuna

from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, Normalizer, MaxAbsScaler, FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import SelectKBest, SequentialFeatureSelector, SelectFromModel, RFE, RFECV, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD, FastICA, FactorAnalysis, SparsePCA, NMF, LatentDirichletAllocation

from sklearn.cluster import KMeans, Birch
from sklearn.mixture import GaussianMixture

# import regressors
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, PassiveAggressiveRegressor, Perceptron, RidgeClassifier, LogisticRegression
from sklearn.linear_model import Lasso, ElasticNet, Lars, BayesianRidge, ARDRegression, OrthogonalMatchingPursuit, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.linear_model import LassoLars, LassoLarsIC
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV, LarsCV, OrthogonalMatchingPursuitCV, LassoLarsCV, BayesianRidge, LinearRegression

# Silence every FutureWarning and UserWarning (from any library, not just pandas)
# to keep the cell outputs readable.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# ---- Run configuration ----
SUBMIT = True          # True: predict the real test set; False: hold out 20% of train for evaluation
USE_ORIGINAL = True    # append the original concrete-strength dataset to the training data
SEED = 15
SAMPLE = 1             # fraction of training rows to keep (values < 1 speed up experiments)
TARGET = 'Strength'
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# ---- Load raw data ----
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
orig = pd.read_csv('datasets/ConcreteStrengthData.csv')

# The row identifier carries no signal; drop it from whichever frame has one
for frame in (train, test, orig):
    frame.drop(columns=['id'], inplace=True, errors='ignore')

# Local evaluation mode: carve a labelled hold-out set out of the training data
if not SUBMIT:
    train, test = train_test_split(train, test_size=0.2, random_state=SEED)

# Optionally enlarge the training set with the original dataset
if USE_ORIGINAL:
    train = pd.concat([train, orig], axis=0, ignore_index=True)

del orig

# Optional down-sampling for faster training
if SAMPLE < 1:
    train = train.sample(frac=SAMPLE, random_state=SEED)

# ---- Split features / target ----
X_train = train.copy()
y_train = X_train.pop(TARGET)
X_test = test.copy()

# The hold-out set has labels; the real test set does not
y_test = None if SUBMIT else X_test.pop(TARGET)

# Feature engineering
class FeatureEngineering(TransformerMixin, BaseEstimator):
    """Stateless transformer that appends hand-crafted columns to a concrete-mix DataFrame.

    Parameters
    ----------
    level : int, default=0
        How many feature groups to add (groups are cumulative):

        * ``level >= 1`` - ``water_age_cement_ratio``
        * ``level >= 2`` - pairwise sums, pairwise ratios and ``WCPR``
        * ``level >= 3`` - ``has_*`` indicator flags, ``TotalComponentWeight``,
          ``WCR``, ``AR`` and ``Cement-Age``

    Notes
    -----
    ``transform`` expects a pandas DataFrame containing the raw component
    columns referenced below (``CementComponent``, ``WaterComponent``, ...).
    """

    # (new column, left operand, right operand) -> new = left + right
    _SUM_FEATURES = (
        ('cement_slag', 'CementComponent', 'BlastFurnaceSlag'),
        ('cement_flyash', 'CementComponent', 'FlyAshComponent'),
        ('cement_water', 'CementComponent', 'WaterComponent'),
        ('cement_superplasticizer', 'CementComponent', 'SuperplasticizerComponent'),
        ('cement_coarseaggregate', 'CementComponent', 'CoarseAggregateComponent'),
        ('cement_fineaggregate', 'CementComponent', 'FineAggregateComponent'),
        ('cement_age', 'CementComponent', 'AgeInDays'),
        ('slag_flyash', 'BlastFurnaceSlag', 'FlyAshComponent'),
        ('slag_water', 'BlastFurnaceSlag', 'WaterComponent'),
        ('slag_superplasticizer', 'BlastFurnaceSlag', 'SuperplasticizerComponent'),
        ('slag_coarseaggregate', 'BlastFurnaceSlag', 'CoarseAggregateComponent'),
        ('slag_fineaggregate', 'BlastFurnaceSlag', 'FineAggregateComponent'),
        ('slag_age', 'BlastFurnaceSlag', 'AgeInDays'),
        ('flyash_water', 'FlyAshComponent', 'WaterComponent'),
        ('flyash_superplasticizer', 'FlyAshComponent', 'SuperplasticizerComponent'),
        ('flyash_coarseaggregate', 'FlyAshComponent', 'CoarseAggregateComponent'),
    )

    # (new column, numerator, denominator) -> new = numerator / denominator
    # Ratios whose denominator is slag, fly ash or superplasticizer are deliberately
    # left out: those components can be zero, which produced inf / divide-by-zero values.
    _RATIO_FEATURES = (
        ('cement_water_ratio', 'CementComponent', 'WaterComponent'),
        ('cement_coarseaggregate_ratio', 'CementComponent', 'CoarseAggregateComponent'),
        ('cement_fineaggregate_ratio', 'CementComponent', 'FineAggregateComponent'),
        ('cement_age_ratio', 'CementComponent', 'AgeInDays'),
        ('slag_water_ratio', 'BlastFurnaceSlag', 'WaterComponent'),
        ('slag_coarseaggregate_ratio', 'BlastFurnaceSlag', 'CoarseAggregateComponent'),
        ('slag_fineaggregate_ratio', 'BlastFurnaceSlag', 'FineAggregateComponent'),
        ('slag_age_ratio', 'BlastFurnaceSlag', 'AgeInDays'),
        ('flyash_water_ratio', 'FlyAshComponent', 'WaterComponent'),
        ('flyash_coarseaggregate_ratio', 'FlyAshComponent', 'CoarseAggregateComponent'),
    )

    # (indicator column, source column) -> 1 when the component is present (> 0), else 0
    _PRESENCE_FLAGS = (
        ('has_superplasticizer', 'SuperplasticizerComponent'),
        ('has_flyash', 'FlyAshComponent'),
        ('has_slag', 'BlastFurnaceSlag'),
    )

    def __init__(self, level=0) -> None:
        self.level = level

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered columns for ``self.level`` appended.

        The input frame is never modified. As a side effect the resulting column
        names are stored in ``feature_names_out_``.
        """
        X = X.copy()

        if self.level >= 2:
            # Pairwise sums, then pairwise ratios (order fixes the output column order)
            for new_col, left, right in self._SUM_FEATURES:
                X[new_col] = X[left] + X[right]
            for new_col, numerator, denominator in self._RATIO_FEATURES:
                X[new_col] = X[numerator] / X[denominator]

        # Other features
        if self.level >= 1:
            X['water_age_cement_ratio'] = (X['WaterComponent'] * X['AgeInDays']) / X['CementComponent']

        if self.level >= 3:
            # Vectorised comparison instead of a per-row Python lambda; int64 matches
            # the dtype the row-wise version produced
            for new_col, source in self._PRESENCE_FLAGS:
                X[new_col] = (X[source] > 0).astype('int64')

            # Taken from PHONG NGUYEN: Detailed feature description and feature engineering by ChatGPT
            X['TotalComponentWeight'] = (
                X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent']
                + X['WaterComponent'] + X['SuperplasticizerComponent']
                + X['CoarseAggregateComponent'] + X['FineAggregateComponent']
            )
            # Water-Cement-Ratio (WCR)
            X['WCR'] = X['WaterComponent'] / X['CementComponent']
            # Aggregate-Ratio (AR)
            X['AR'] = (X['CoarseAggregateComponent'] + X['FineAggregateComponent']) / X['CementComponent']
            # Cement-Age interaction
            X['Cement-Age'] = X['CementComponent'] * X['AgeInDays']

        if self.level >= 2:
            # Water-Cement-Plus-Pozzolan-Ratio (WCPR); kept last to preserve column order
            X['WCPR'] = X['WaterComponent'] / (X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent'])

        self.feature_names_out_ = X.columns.tolist()
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the column names produced by the most recent ``transform`` call.

        ``input_features`` is accepted (and ignored) because scikit-learn passes it
        when chaining feature names, e.g. in ``Pipeline.get_feature_names_out``;
        without the parameter that call raises ``TypeError``.

        Raises ``AttributeError`` if ``transform`` has not been called yet.
        """
        return self.feature_names_out_
    

class ClusterFeatures(BaseEstimator, TransformerMixin):
    """Append cluster-membership features from KMeans, Birch and GaussianMixture.

    Parameters
    ----------
    n_clusters : int, default=5
        Number of clusters / mixture components requested from every model.
    random_state : int or None, default=None
        Seed forwarded to KMeans and GaussianMixture (Birch takes none).
    ohe : bool, default=True
        If True the cluster labels are one-hot encoded, otherwise one integer
        label column per model is appended.

    Notes
    -----
    ``transform`` expects a pandas DataFrame (columns are added by name).
    """

    def __init__(self, n_clusters=5, random_state=None, ohe=True):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.ohe = ohe
        self.cluster_models = self._build_cluster_models()
        self.feature_names_out_ = None

    def _build_cluster_models(self):
        """Create fresh, unfitted clustering models from the current hyper-parameters."""
        return {
            'KMeans': KMeans(n_clusters=self.n_clusters, random_state=self.random_state),
            'Birch': Birch(n_clusters=self.n_clusters),
            # 'MiniBatchKMeans': MiniBatchKMeans(n_clusters=self.n_clusters, random_state=self.random_state),
            'GaussianMixture': GaussianMixture(n_components=self.n_clusters, random_state=self.random_state),
        }

    def fit(self, X, y=None):
        """Fit every clustering model on ``X`` and return ``self``."""
        # Rebuild the models so hyper-parameters changed through set_params()
        # after construction are honoured instead of silently ignored.
        self.cluster_models = self._build_cluster_models()
        for model in self.cluster_models.values():
            model.fit(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one label column (or one-hot block) per model."""
        X_out = X.copy()
        label_columns = []
        for model_name, model in self.cluster_models.items():
            column = f'{model_name}_{self.n_clusters}'
            labels = model.predict(X)
            if self.ohe:
                # Fix the category set to every possible cluster id so get_dummies
                # always emits the same columns, even when some cluster has no
                # member in this particular frame (e.g. a CV fold or the test set).
                labels = pd.Categorical(labels, categories=list(range(self.n_clusters)))
            X_out[column] = labels
            label_columns.append(column)
        if self.ohe:
            X_out = pd.get_dummies(X_out, columns=label_columns)
        self.feature_names_out_ = X_out.columns
        return X_out

    def get_feature_names_out(self, input_features=None):
        """Return the columns of the most recent ``transform`` output (None before that).

        ``input_features`` is accepted (and ignored) for compatibility with the
        scikit-learn feature-name API, which passes it when chaining pipeline steps.
        """
        return self.feature_names_out_



In [25]:
def get_pipeline(regressor, model_name):
    """Wrap ``regressor`` in a scikit-learn pipeline.

    Every optional preprocessing step is currently commented out, so the
    pipeline consists of the regressor alone. ``model_name`` is accepted but
    not used.
    """
    steps = [
        # --- feature generation ---
        # PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
        # FeatureEngineering(level=2),
        # ClusterFeatures(n_clusters=7, random_state=SEED, ohe=True),

        # --- scaling ---
        # StandardScaler(),
        # PowerTransformer(method='yeo-johnson', standardize=True, copy=True),
        # QuantileTransformer(output_distribution='normal', random_state=SEED),
        # RobustScaler(quantile_range=(5.0, 95.0), with_centering=True, with_scaling=True),
        # MinMaxScaler(feature_range=(0, 1)),

        # --- dimensionality reduction ---
        # PCA(n_components=20, random_state=SEED),
        # KernelPCA(n_components=20, kernel='rbf', random_state=SEED),

        # --- feature selection ---
        # SelectKBest(score_func=f_regression, k=30),
        # SelectFromModel(regressor, max_features=20),
        # SequentialFeatureSelector(regressor, n_features_to_select=20),

        # --- final estimator ---
        regressor,
        # TransformedTargetRegressor(regressor=regressor, transformer=QuantileTransformer(output_distribution='normal', random_state=SEED))
    ]
    return make_pipeline(*steps)

In [23]:
# Make scikit-learn transformers return DataFrames while the experiments run;
# the setting is switched back to "default" at the end of this cell.
set_config(transform_output="pandas")  # set to pandas

# Set categorical features for catboost
# NOTE(review): cat_features is not referenced anywhere else in the visible notebook.
cat_features = [col for col in X_train.columns if X_train[col].dtype == 'category']

# Candidate models keyed by display name. Commented-out entries are variants that are
# currently disabled; uncomment an entry to include it in the next experiment run.
regressors = {
    'LGBMRegressor1': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt'),
    # 'LGBMRegressor2': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart'),
    # 'LGBMRegressor3': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='goss'),
    'LGBMRegressor4': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='rf', subsample=.632, subsample_freq=1),
    # 'LGBMRegressor5': LGBMRegressor(random_state=SEED, n_jobs=-1, class_weight='balanced'),
    # 'LGBMRegressor6': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'LGBMRegressor7': LGBMRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'LGBMRegressor8': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7, colsample_bytree=0.7),
    # 'LGBMRegressor9': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart', colsample_bytree=0.7),
    # 'LGBMRegressor10': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240),
    # 'LGBMRegressor11': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240, subsample=0.7, colsample_bytree=0.6),
    'LGBMRegressor12': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=250, max_depth=3, learning_rate=0.015, n_estimators=600, subsample=0.65, colsample_bytree=0.9, reg_alpha=50),
    'XGBRegressor1': XGBRegressor(random_state=SEED, n_jobs=-1),
    # 'XGBRegressor2': XGBRegressor(random_state=SEED, n_jobs=-1, booster='dart'),
    # 'XGBRegressor3': XGBRegressor(random_state=SEED, n_jobs=-1, booster='gblinear'),
    # 'XGBRegressor4': XGBRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'XGBRegressor5': XGBRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'XGBRegressor6': XGBRegressor(random_state=SEED, 
    #                               n_jobs=-1, 
    #                               learning_rate=0.055, 
    #                               n_estimators=200, 
    #                               max_depth=8, 
    #                               min_child_weight=1, 
    #                               gamma=0.07, 
    #                               colsample_bytree=0.67, 
    #                               colsample_bylevel=0.67, 
    #                               colsample_bynode=0.8,
    #                               subsample=0.7, 
    #                               objective='reg:squarederror'),
    # 'XGBRegressor7': XGBRegressor(random_state=SEED, n_jobs=-1, objective='reg:squarederror'),
    'XGBRandomForestRegressor': XGBRFRegressor(random_state=SEED, n_jobs=-1),
    'CatBoostRegressor': CatBoostRegressor(random_state=SEED, silent=True),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=SEED),
    # 'HistGradientBoostingRegressor2': HistGradientBoostingRegressor(random_state=SEED, max_iter=200, 
    #                                                                 max_depth=6, learning_rate=0.1, 
    #                                                                 l2_regularization=0.0006754828207682132, max_leaf_nodes=42, 
    #                                                                 min_samples_leaf=9, max_bins=255),
    # 'HistGradientBoostingRegressor3': HistGradientBoostingRegressor(random_state=SEED, max_iter=1000, 
    #                                                                 max_depth=10, learning_rate=0.1, 
    #                                                                 l2_regularization=0.1, max_leaf_nodes=100, 
    #                                                                 min_samples_leaf=20, max_bins=255),
    'RandomForestRegressor': RandomForestRegressor(random_state=SEED, n_jobs=-1),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
    # 'GradientBoostingRegressor': GradientBoostingRegressor(random_state=SEED),
    # 'BaggingRegressor': BaggingRegressor(random_state=SEED, n_jobs=-1),
    # 'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    # # 'DecisionTreeRegressor': DecisionTreeRegressor(random_state=SEED),
    # # 'GaussianProcessRegressor': GaussianProcessRegressor(random_state=SEED),
    # 'MLPRegressor1': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='adam'),
    # # 'MLPRegressor2': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='lbfgs'),
    # 'MLPRegressor3': MLPRegressor(random_state=SEED, max_iter=5000, activation='tanh', solver='adam'),
    # # 'MLPRegressor4': MLPRegressor(random_state=SEED, max_iter=1000, activation='tanh', solver='lbfgs'),
    # 'MLPRegressor5': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='adam'),
    # # 'MLPRegressor6': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='lbfgs'),
    # 'MLPRegressor7': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='adam'),
    # 'MLPRegressor8': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='lbfgs'),
    # # 'Ridge': Ridge(random_state=SEED),
    # # 'RidgeCV': RidgeCV(alphas=np.logspace(-3, 3, 7), cv=cv),
    # # 'SGDRegressor': SGDRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # # 'PassiveAggressiveRegressor': PassiveAggressiveRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # # 'LinearRegression': LinearRegression(n_jobs=-1),
    # # 'Lasso': Lasso(random_state=SEED),
    # # 'ElasticNet': ElasticNet(random_state=SEED, max_iter=1000000),
    # 'ElasticNetCV': ElasticNetCV(alphas=np.logspace(-3, 3, 7), cv=cv, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=1000000),
    # # 'HuberRegressor': HuberRegressor(max_iter=1000),
    # 'BayesianRidge': BayesianRidge(),
    # # 'ARDRegression': ARDRegression(),
    # # 'TheilSenRegressor': TheilSenRegressor(random_state=SEED),
    # # 'RANSACRegressor': RANSACRegressor(random_state=SEED),
    # # 'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(normalize=False),
    # # 'Lars': Lars(),
    # # 'LassoLars': LassoLars(),
    # # 'LassoLarsIC': LassoLarsIC(normalize=False),
    # # 'VotingRegressor': VotingRegressor(
    # #         estimators=[
    # #             ('Ridge', Ridge(random_state=SEED)),
    # #             ('LGBMRegressor4', LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1)),
    # #             ('XGBRegressor1', XGBRegressor(random_state=SEED, n_jobs=1)),
    # #             ], 
    # #         n_jobs=-1,
    # #         verbose=0,
    # #         ),
    # # 'BaggingRegressor2': BaggingRegressor(base_estimator=LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1),
    # #                                       random_state=SEED, 
    # #                                       n_jobs=-1,
    # #                                       max_samples=0.632,
    # #                                       max_features=0.632,
    # #                                       bootstrap=True,
    # #                                       ),
    # # 'StackingRegressor': StackingRegressor(
    # #         estimators=[
    # #             ('Ridge', Ridge(random_state=SEED)),
    # #             ('LGBMRegressor4', LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1)),
    # #             ('XGBRegressor1', XGBRegressor(random_state=SEED, n_jobs=1)),
    # #             ], 
    # #         final_estimator=Ridge(random_state=SEED),
    # #         cv=cv,
    # #         n_jobs=-1,
    # #         verbose=0,
    # #         )
}

SCALING_MODELS = []  # TODO remove when pipeline is fixed
exp_index += 1
print(f'Experiment {exp_index}: {len(regressors)} models')

# The scorer does not depend on the model, so build it once instead of on every iteration.
# It reports plain (positive) RMSE per fold.
scoring = {'rmse': make_scorer(mean_squared_error, squared=False)}

# try/finally guarantees that the global scikit-learn output setting is restored even
# when a model raises or the cell is interrupted; otherwise every later cell would
# keep running with transform_output="pandas".
try:
    for model_name, regressor in regressors.items():
        t0 = time.time()
        feature_importances = pd.DataFrame()  # accumulator for the optional block below

        pipeline = get_pipeline(regressor, model_name)

        # Cross-validated RMSE on the training data
        results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1, return_estimator=True, verbose=0)
        scores = results['test_rmse']

        # # get feature importances
        # for i, estimator in enumerate(results['estimator']):
        #     regressor_ = estimator._final_estimator
        #     try:
        #         feature_importance = pd.Series(regressor_.feature_importances_, index=estimator[-2].get_feature_names_out(), name=f'fold{i}')
        #     except:
        #         feature_importance = pd.Series(regressor_.coef_, index=estimator[-2].get_feature_names_out(), name=f'fold{i}')
        #     feature_importances = pd.concat([feature_importances, feature_importance], axis=1)
        # feature_importances['mean'] = feature_importances.mean(axis=1)

        # fit on all train data and predict on test
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Hold-out RMSE is only available when the test set carries labels (SUBMIT=False)
        if not SUBMIT:
            score_eval = mean_squared_error(y_test, y_pred, squared=False)
        else:
            score_eval = np.nan

        # Measure once so the printed and the stored duration are the same number
        elapsed = time.time() - t0
        print(f'{model_name}: {np.mean(scores):.4f} ± {np.std(scores):.4f}, Time: {elapsed:.2f} seconds, RMSE: {score_eval:.4f}')
        # print(feature_importances.sort_values('mean', ascending=False))

        # Save experiment (pipeline, CV mean, CV std, elapsed time, hold-out RMSE)
        experiments[f'{model_name}_exp{exp_index}'] = (pipeline, np.mean(scores), np.std(scores), elapsed, score_eval)
finally:
    set_config(transform_output="default")  # reset to default

Experiment 7: 10 models
LGBMRegressor1: 11.7274 ± 0.3469, Time: 0.31 seconds, RMSE: 12.2460
LGBMRegressor4: 11.6288 ± 0.3589, Time: 0.37 seconds, RMSE: 12.2732
LGBMRegressor12: 11.5190 ± 0.3446, Time: 0.63 seconds, RMSE: 12.2278
XGBRegressor1: 12.2989 ± 0.2335, Time: 0.69 seconds, RMSE: 12.5667
XGBRandomForestRegressor: 11.6417 ± 0.3470, Time: 0.65 seconds, RMSE: 12.2416
CatBoostRegressor: 11.7459 ± 0.3348, Time: 9.35 seconds, RMSE: 12.1723
HistGradientBoostingRegressor: 11.7242 ± 0.3150, Time: 1.27 seconds, RMSE: 12.2510
RandomForestRegressor: 12.3896 ± 0.3105, Time: 3.23 seconds, RMSE: 12.8623
ExtraTreesRegressor: 12.7894 ± 0.3564, Time: 2.26 seconds, RMSE: 13.1214
AdaBoostRegressor: 12.4829 ± 0.3110, Time: 0.40 seconds, RMSE: 13.0075


In [50]:
# Timestamp used in the export file name (underscores keep it filesystem-safe)
now = time.strftime("%Y-%m-%d %H_%M_%S")

# One row per experiment, best (lowest) mean CV RMSE first
result_fields = ['pipeline', 'mean', 'std', 'fit_time', 'RMSE']
experiments_df = pd.DataFrame(experiments, index=result_fields).T
experiments_df = experiments_df.sort_values('mean')

# Pessimistic score: mean CV RMSE plus one standard deviation
experiments_df = experiments_df.assign(**{'mean+std': lambda df: df['mean'] + df['std']})

experiments_df.to_excel(f'experiments/experiments_df_{now}.xlsx')
experiments_df.head(30)

Unnamed: 0,pipeline,mean,std,fit_time,RMSE,mean+std
MLPRegressor5_exp3,"(FeatureEngineering(level=2), PowerTransformer...",11.617699,0.241854,35.578532,12.313494,11.859553
XGBRandomForestRegressor_exp5,"(PolynomialFeatures(include_bias=False, intera...",11.623963,0.279596,4.648613,12.300664,11.903559
LGBMRegressor4_exp9,"(PolynomialFeatures(include_bias=False, intera...",11.624913,0.269644,1.202784,12.289081,11.894557
LGBMRegressor4_exp14,"(PolynomialFeatures(include_bias=False, intera...",11.632079,0.26927,1.094087,12.284708,11.901349
LGBMRegressor4_exp10,"(PolynomialFeatures(include_bias=False, intera...",11.6323,0.269152,1.104064,12.28405,11.901452
MLPRegressor3_exp14,"(PolynomialFeatures(include_bias=False, intera...",11.632948,0.233214,22.545405,12.363947,11.866162
MLPRegressor5_exp14,"(PolynomialFeatures(include_bias=False, intera...",11.633,0.25903,30.053209,12.384187,11.89203
LGBMRegressor4_exp2,"(FeatureEngineering(level=2), ClusterFeatures(...",11.633405,0.218871,6.781884,12.353572,11.852276
LGBMRegressor4_exp3,"(FeatureEngineering(level=2), PowerTransformer...",11.633911,0.219354,1.194807,12.357485,11.853264
LGBMRegressor4_exp4,"(PolynomialFeatures(include_bias=False, intera...",11.634799,0.267967,1.87698,12.285713,11.902765


# Optuna

In [58]:
def objective(trial):
    """Optuna objective: mean cross-validated RMSE of an LGBMRegressor.

    Hyper-parameters are sampled from ``trial``; the model is evaluated on the
    notebook-level ``X_train`` / ``y_train`` with the shared ``cv`` splitter.
    Returns the mean RMSE over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True gives the same log-uniform sampling as suggest_loguniform.
    regressor = LGBMRegressor(
        random_state=SEED,
        n_jobs=-1,
        # boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'rf']),
        num_leaves=trial.suggest_int('num_leaves', 2, 256),
        max_depth=trial.suggest_int('max_depth', 2, 16),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        subsample=trial.suggest_float('subsample', 0.632, 1.0),
        subsample_freq=trial.suggest_int('subsample_freq', 1, 10),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.632, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 1e3, log=True),
    )

    scoring = {'rmse': make_scorer(mean_squared_error, squared=False)}
    # Only the fold scores are needed, so the fitted estimators are not returned
    results = cross_validate(regressor, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1, verbose=0)
    return np.mean(results['test_rmse'])

# Seed the sampler so the hyper-parameter search proposes the same trials on every run
# (TPESampler is Optuna's default sampler; only the seed is new).
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=1000)

# Persist the full trial log for later inspection
df_study = study.trials_dataframe()
df_study.to_excel('optuna/study_LGBM.xlsx')

print(f'Best trial: {study.best_trial.value:.4f}')
print(f'Best params: {study.best_params}')


[32m[I 2023-03-07 15:54:10,282][0m A new study created in memory with name: no-name-ffb75f2e-b366-4ced-80ba-fa8397a9a9a8[0m
[32m[I 2023-03-07 15:54:14,910][0m Trial 0 finished with value: 11.768324990100746 and parameters: {'num_leaves': 39, 'max_depth': 5, 'learning_rate': 0.0034289256194983187, 'n_estimators': 567, 'subsample': 0.725273305176492, 'subsample_freq': 8, 'colsample_bytree': 0.908178600670366, 'reg_alpha': 0.0017408354672141368}. Best is trial 0 with value: 11.768324990100746.[0m
[32m[I 2023-03-07 15:54:17,004][0m Trial 1 finished with value: 11.640155068753629 and parameters: {'num_leaves': 222, 'max_depth': 3, 'learning_rate': 0.006106331129189935, 'n_estimators': 585, 'subsample': 0.7880300703062415, 'subsample_freq': 7, 'colsample_bytree': 0.8944051096737153, 'reg_alpha': 5.509290285176471}. Best is trial 1 with value: 11.640155068753629.[0m
[32m[I 2023-03-07 15:54:19,171][0m Trial 2 finished with value: 11.754300000537338 and parameters: {'num_leaves': 17,

In [59]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice

In [60]:
# Optimization history of all trials in `study` (whichever Optuna study cell was run last)
plot_optimization_history(study)

In [61]:
# Hyper-parameter importances with respect to the objective value
plot_param_importances(study)

In [62]:
# Same importance plot, but with trial duration (seconds) as the target instead of the
# objective value, i.e. which hyper-parameters drive the run time of a trial
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

In [63]:
# Slice plot of `study`: one panel per hyper-parameter
plot_slice(study)

In [26]:
def objective(trial):
    """Optuna objective: mean cross-validated RMSE of an XGBRegressor.

    Hyper-parameters are sampled from ``trial``; the model is evaluated on the
    notebook-level ``X_train`` / ``y_train`` with the shared ``cv`` splitter.
    Returns the mean RMSE over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True gives the same log-uniform sampling as suggest_loguniform.
    regressor = XGBRegressor(
        random_state=SEED,
        n_jobs=-1,
        booster=trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        max_depth=trial.suggest_int('max_depth', 2, 16),
        learning_rate=trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        subsample=trial.suggest_float('subsample', 0.632, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.632, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 1e3, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-3, 1e3, log=True),
        eval_metric='rmse',
        objective='reg:squarederror',
    )

    scoring = {'rmse': make_scorer(mean_squared_error, squared=False)}
    # Only the fold scores are needed, so the fitted estimators are not returned
    results = cross_validate(regressor, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1, verbose=0)
    return np.mean(results['test_rmse'])

# Seed the sampler so the hyper-parameter search proposes the same trials on every run
# (TPESampler is Optuna's default sampler; only the seed is new).
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=100)

# Persist the full trial log for later inspection
df_study = study.trials_dataframe()
df_study.to_excel('optuna/study_XGBRegressor.xlsx')



[32m[I 2023-03-07 16:23:04,870][0m A new study created in memory with name: no-name-5c218b7e-7e88-49c6-be2d-ed9f829760de[0m
[32m[I 2023-03-07 16:23:10,701][0m Trial 0 finished with value: 12.295104945343521 and parameters: {'booster': 'dart', 'max_depth': 14, 'learning_rate': 0.016823760176195798, 'n_estimators': 218, 'subsample': 0.9550383127192095, 'colsample_bytree': 0.8925463657221309, 'reg_alpha': 0.7416478562000499, 'reg_lambda': 380.16060170087337}. Best is trial 0 with value: 12.295104945343521.[0m
[32m[I 2023-03-07 16:23:20,856][0m Trial 1 finished with value: 11.752954120525814 and parameters: {'booster': 'dart', 'max_depth': 5, 'learning_rate': 0.0131360550295664, 'n_estimators': 312, 'subsample': 0.6446791711000307, 'colsample_bytree': 0.7974861474804971, 'reg_alpha': 389.28052769820687, 'reg_lambda': 0.01296143022925734}. Best is trial 1 with value: 11.752954120525814.[0m
[32m[I 2023-03-07 16:23:23,522][0m Trial 2 finished with value: 12.262273158605108 and para

In [29]:
# Final model: LightGBM with fixed hyper-parameters, trained on the full training set
final_params = dict(
    boosting_type='gbdt',
    num_leaves=250,
    max_depth=3,
    learning_rate=0.015,
    n_estimators=600,
    subsample=0.65,
    colsample_bytree=0.9,
    reg_alpha=50,
)
model = LGBMRegressor(random_state=SEED, n_jobs=-1, **final_params)
model.fit(X_train, y_train)

# Predictions for the test set, used by the submission cell
y_pred = model.predict(X_test)

In [31]:
# Write the predictions into the sample-submission template and save a timestamped copy
now = time.strftime("%Y-%m-%d %H_%M_%S")
sub = pd.read_csv('submissions/sample_submission.csv')
sub = sub.assign(**{TARGET: y_pred})
sub.to_csv(f'submissions/submission_{now}.csv', index=False)