In [87]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.compose import TransformedTargetRegressor
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch, MeanShift, SpectralClustering, AffinityPropagation, FeatureAgglomeration

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn import config_context
from sklearn.base import BaseEstimator, TransformerMixin

# import regressors
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, PassiveAggressiveRegressor, Perceptron, RidgeClassifier, LogisticRegression
from sklearn.linear_model import Lasso, ElasticNet, Lars, BayesianRidge, ARDRegression, OrthogonalMatchingPursuit, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.linear_model import LassoLars, LassoLarsIC
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV, LarsCV, OrthogonalMatchingPursuitCV, LassoLarsCV, BayesianRidge, LinearRegression

# pandas deactivate future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

SUBMIT = True
USE_ORIGINAL = True
SEED = 15
SAMPLE = 1
TARGET = 'Strength'

train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
orig = pd.read_csv('datasets/ConcreteStrengthData.csv')

for i, X in enumerate([train, test, orig]):
    if 'id' in X.columns:
        X.drop(['id'], axis=1, inplace=True)

# Define test set with original data
if not SUBMIT:
    train, test = train_test_split(train, test_size=0.2, random_state=SEED) 

# Add original data to training set
if USE_ORIGINAL:
    train = pd.concat([train, orig], axis=0)
    train.reset_index(inplace=True, drop=True)

del orig

# Sampling for faster training
if SAMPLE < 1:
    train = train.sample(frac=SAMPLE, random_state=SEED)

# set training data
X_train = train.copy()
y_train = X_train.pop(TARGET)
X_test = test.copy()

if not SUBMIT:
    y_test = X_test.pop(TARGET)
else:
    y_test = None
    
base_cols = X_train.columns

# Feature engineering
class FeatureEngineering(TransformerMixin, BaseEstimator):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """ Add new features to the dataset"""
        X = X.copy()

        # Sum of features
        # X['cement_slag'] = X['CementComponent'] + X['BlastFurnaceSlag']
        # X['cement_flyash'] = X['CementComponent'] + X['FlyAshComponent']
        # X['cement_water'] = X['CementComponent'] + X['WaterComponent']
        # X['cement_superplasticizer'] = X['CementComponent'] + X['SuperplasticizerComponent']
        # X['cement_coarseaggregate'] = X['CementComponent'] + X['CoarseAggregateComponent']
        # X['cement_fineaggregate'] = X['CementComponent'] + X['FineAggregateComponent']
        # X['cement_age'] = X['CementComponent'] + X['AgeInDays']
        # X['slag_flyash'] = X['BlastFurnaceSlag'] + X['FlyAshComponent']
        # X['slag_water'] = X['BlastFurnaceSlag'] + X['WaterComponent']
        # X['slag_superplasticizer'] = X['BlastFurnaceSlag'] + X['SuperplasticizerComponent']
        # X['slag_coarseaggregate'] = X['BlastFurnaceSlag'] + X['CoarseAggregateComponent']
        # X['slag_fineaggregate'] = X['BlastFurnaceSlag'] + X['FineAggregateComponent']
        # X['slag_age'] = X['BlastFurnaceSlag'] + X['AgeInDays']
        # X['flyash_water'] = X['FlyAshComponent'] + X['WaterComponent']
        # X['flyash_superplasticizer'] = X['FlyAshComponent'] + X['SuperplasticizerComponent']
        # X['flyash_coarseaggregate'] = X['FlyAshComponent'] + X['CoarseAggregateComponent']

        # Ratio of features
        # X['cement_slag_ratio'] = X['CementComponent'] / X['BlastFurnaceSlag']
        # X['cement_flyash_ratio'] = X['CementComponent'] / X['FlyAshComponent']
        # X['cement_water_ratio'] = X['CementComponent'] / X['WaterComponent']
        # X['cement_superplasticizer_ratio'] = X['CementComponent'] / X['SuperplasticizerComponent']
        # X['cement_coarseaggregate_ratio'] = X['CementComponent'] / X['CoarseAggregateComponent']
        # X['cement_fineaggregate_ratio'] = X['CementComponent'] / X['FineAggregateComponent']
        # X['cement_age_ratio'] = X['CementComponent'] / X['AgeInDays']
        # X['slag_flyash_ratio'] = X['BlastFurnaceSlag'] / X['FlyAshComponent']
        # X['slag_water_ratio'] = X['BlastFurnaceSlag'] / X['WaterComponent']
        # X['slag_superplasticizer_ratio'] = X['BlastFurnaceSlag'] / X['SuperplasticizerComponent']
        # X['slag_coarseaggregate_ratio'] = X['BlastFurnaceSlag'] / X['CoarseAggregateComponent']
        # X['slag_fineaggregate_ratio'] = X['BlastFurnaceSlag'] / X['FineAggregateComponent']
        # X['slag_age_ratio'] = X['BlastFurnaceSlag'] / X['AgeInDays']
        # X['flyash_water_ratio'] = X['FlyAshComponent'] / X['WaterComponent']
        # X['flyash_superplasticizer_ratio'] = X['FlyAshComponent'] / X['SuperplasticizerComponent']
        # X['flyash_coarseaggregate_ratio'] = X['FlyAshComponent'] / X['CoarseAggregateComponent']

        # Other features
        X['water_age_cement_ratio'] = (X['WaterComponent'] * X['AgeInDays']) / X['CementComponent']
        # X['has_superplasticizer'] = X['SuperplasticizerComponent'].apply(lambda x: 1 if x > 0 else 0)
        # X['has_flyash'] = X['FlyAshComponent'].apply(lambda x: 1 if x > 0 else 0)
        # X['has_slag'] = X['BlastFurnaceSlag'].apply(lambda x: 1 if x > 0 else 0)

        # Taken from PHONG NGUYEN: Detailed feature description and feature engineering by ChatGPT
        # # Calculate TotalComponentWeight
        # X['TotalComponentWeight'] = X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent'] + X['WaterComponent'] + X['SuperplasticizerComponent'] + X['CoarseAggregateComponent'] + X['FineAggregateComponent']
        # # Calculate Water-Cement-Ratio (WCR)
        # X['WCR'] = X['WaterComponent'] / X['CementComponent']
        # # Calculate Aggregate-Ratio (AR)
        # X['AR'] = (X['CoarseAggregateComponent'] + X['FineAggregateComponent']) / X['CementComponent']
        # # Calculate Water-Cement-Plus-Pozzolan-Ratio (WCPR)
        # X['WCPR'] = X['WaterComponent'] / (X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent'])
        # # Calculate Cement-Age
        # X['Cement-Age'] = X['CementComponent'] * X['AgeInDays']

        return X
    

set_config(transform_output="pandas")  # set to pandas

clf = make_pipeline(
    # PolynomialFeatures(2), 
    FeatureEngineering()
    )
clf.fit(X_train, y_train)

with config_context(transform_output="pandas"):
    # the output of transform will be a Pandas DataFrame
    X_train= clf.transform(X_train)
    X_test = clf.transform(X_test)

set_config(transform_output="default")  # reset to default
X_train.head()

Unnamed: 0,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,water_age_cement_ratio
0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,1.062857
1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,37.398601
2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,17.991696
3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,273.75
4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,110.063694


In [88]:
X_train_cv

Unnamed: 0,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,water_age_cement_ratio,Strength
0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,1.062857,10.38
1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,37.398601,23.52
3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,273.750000,39.05
4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,110.063694,74.19
6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,38.316875,35.10
...,...,...,...,...,...,...,...,...,...,...
6431,166.0,259.7,0.0,183.2,12.7,858.8,826.8,28,30.901205,37.92
6432,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,18.193922,44.28
6433,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,17.032899,31.18
6434,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,36.334007,23.70


In [99]:
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Set categorical features for catboost
cat_features = [col for col in X_train.columns if X_train[col].dtype == 'category']

regressors = {
    # 'LGBMRegressor1': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt'),
    # 'LGBMRegressor2': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart'),
    # 'LGBMRegressor3': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='goss'),
    'LGBMRegressor4': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='rf', subsample=.632, subsample_freq=1),
    # 'LGBMRegressor5': LGBMRegressor(random_state=SEED, n_jobs=-1, class_weight='balanced'),
    # 'LGBMRegressor6': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'LGBMRegressor7': LGBMRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'LGBMRegressor8': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7, colsample_bytree=0.7),
    # 'LGBMRegressor9': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart', colsample_bytree=0.7),
    # 'LGBMRegressor10': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240),
    # 'LGBMRegressor11': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240, subsample=0.7, colsample_bytree=0.6),
    # 'XGBRegressor1': XGBRegressor(random_state=SEED, n_jobs=-1),
    # 'XGBRegressor2': XGBRegressor(random_state=SEED, n_jobs=-1, booster='dart'),
    # 'XGBRegressor3': XGBRegressor(random_state=SEED, n_jobs=-1, booster='gblinear'),
    # 'XGBRegressor4': XGBRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'XGBRegressor5': XGBRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'XGBRegressor6': XGBRegressor(random_state=SEED, 
    #                               n_jobs=-1, 
    #                               learning_rate=0.055, 
    #                               n_estimators=200, 
    #                               max_depth=8, 
    #                               min_child_weight=1, 
    #                               gamma=0.07, 
    #                               colsample_bytree=0.67, 
    #                               colsample_bylevel=0.67, 
    #                               colsample_bynode=0.8,
    #                               subsample=0.7, 
    #                               objective='reg:squarederror'),
    # 'XGBRegressor7': XGBRegressor(random_state=SEED, n_jobs=-1, objective='reg:squarederror'),
    # 'XGBRandomForestRegressor': XGBRFRegressor(random_state=SEED, n_jobs=-1),
    # 'CatBoostRegressor': CatBoostRegressor(random_state=SEED, silent=True),
    # 'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=SEED),
    # 'HistGradientBoostingRegressor2': HistGradientBoostingRegressor(random_state=SEED, max_iter=200, 
    #                                                                 max_depth=6, learning_rate=0.1, 
    #                                                                 l2_regularization=0.0006754828207682132, max_leaf_nodes=42, 
    #                                                                 min_samples_leaf=9, max_bins=255),
    # 'HistGradientBoostingRegressor3': HistGradientBoostingRegressor(random_state=SEED, max_iter=1000, 
    #                                                                 max_depth=10, learning_rate=0.1, 
    #                                                                 l2_regularization=0.1, max_leaf_nodes=100, 
    #                                                                 min_samples_leaf=20, max_bins=255),
    # 'RandomForestRegressor': RandomForestRegressor(random_state=SEED, n_jobs=-1),
    # 'ExtraTreesRegressor': ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    # 'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
    # 'GradientBoostingRegressor': GradientBoostingRegressor(random_state=SEED),
    # 'BaggingRegressor': BaggingRegressor(random_state=SEED, n_jobs=-1),
    # 'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    # 'DecisionTreeRegressor': DecisionTreeRegressor(random_state=SEED),
    # 'GaussianProcessRegressor': GaussianProcessRegressor(random_state=SEED),
    # 'MLPRegressor1': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='adam'),
    # 'MLPRegressor2': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='lbfgs'),
    # 'MLPRegressor3': MLPRegressor(random_state=SEED, max_iter=5000, activation='tanh', solver='adam'),
    # 'MLPRegressor4': MLPRegressor(random_state=SEED, max_iter=1000, activation='tanh', solver='lbfgs'),
    # 'MLPRegressor5': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='adam'),
    # 'MLPRegressor6': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='lbfgs'),
    # 'MLPRegressor7': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='adam'),
    # 'MLPRegressor8': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='lbfgs'),
    # 'Ridge': Ridge(random_state=SEED),
    # 'SGDRegressor': SGDRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # 'PassiveAggressiveRegressor': PassiveAggressiveRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # 'LinearRegression': LinearRegression(),
    # 'Lasso': Lasso(random_state=SEED),
    # 'ElasticNet': ElasticNet(random_state=SEED, max_iter=1e6),
    # 'HuberRegressor': HuberRegressor(max_iter=1000),
    # 'BayesianRidge': BayesianRidge(),
    # 'ARDRegression': ARDRegression(),
    # 'TheilSenRegressor': TheilSenRegressor(random_state=SEED),
    # 'RANSACRegressor': RANSACRegressor(random_state=SEED),
    # 'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(normalize=False),
    # 'Lars': Lars(),
    # 'LassoLars': LassoLars(),
    # 'LassoLarsIC': LassoLarsIC(normalize=False),
    # 'StackingRegressor': StackingRegressor(
    #         estimators=[
    #             ('LGBMRegressor11', LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, 
    #                                             max_depth=14, learning_rate=0.08, n_estimators=240, subsample=0.7, colsample_bytree=0.6)),
    #             ('XGBRegressor6', XGBRegressor(random_state=SEED, n_jobs=-1, learning_rate=0.055, n_estimators=200,  
    #                                         max_depth=8,  min_child_weight=1, gamma=0.07,  colsample_bytree=0.67, 
    #                                         colsample_bylevel=0.67, colsample_bynode=0.8, subsample=0.7, 
    #                                         objective='reg:squarederror')),
    #             ('CatBoostRegressor', CatBoostRegressor(random_state=SEED, silent=True, cat_features=cat_features)),
    #             # ('ExtraTreesRegressor', ExtraTreesRegressor(random_state=SEED, n_jobs=-1))
    #             ], 
    #         final_estimator=Ridge(random_state=SEED),
    #         cv=cv,
    #         # n_jobs=-1,
    #         verbose=1
    #         )
}

for model_name, regressor in regressors.items():
    t0 = time.time()
    scores = []
    feature_importances = pd.DataFrame()
    
    for i, (train_index, test_index) in tqdm(enumerate(cv.split(X_train))):
        
        X_train_cv, X_test_cv = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
        y_train_cv, y_test_cv = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()

        # Groupby duplicated rows and take mean of target column
        # cols = X_train_cv.columns.tolist()
        # X_train_cv = pd.concat([X_train_cv, y_train_cv], axis=1)
        # X_train_cv = X_train_cv.groupby(cols, as_index=False).median()
        # y_train_cv = X_train_cv.pop(TARGET)

        print(X_train_cv.shape)

        # Align y with X
        

        # Models that need scaling and no missing value
        if model_name in ['MLPRegressor1', 'MLPRegressor2', 'MLPRegressor3', 'MLPRegressor4', 'MLPRegressor5', 'MLPRegressor6', 'MLPRegressor7', 
                          'MLPRegressor8', 'SGDRegressor', 'PassiveAggressiveRegressor', 'Perceptron', 'Ridge', 'Lasso', 'ElasticNet', 
                          'HuberRegressor', 'BayesianRidge', 'ARDRegression', 'TheilSenRegressor', 'RANSACRegressor', 'OrthogonalMatchingPursuit', 
                          'Lars', 'LassoLars', 'LassoLarsIC']:
            scaler = StandardScaler()
            X_train_cv = pd.DataFrame(scaler.fit_transform(X_train_cv), columns=X_train_cv.columns)
            X_test_cv = pd.DataFrame(scaler.transform(X_test_cv), columns=X_test_cv.columns)

            # imputer = KNNImputer(n_neighbors=5)
            # X_train_cv = pd.DataFrame(imputer.fit_transform(X_train_cv), columns=X_train_cv.columns)
            # X_test_cv = pd.DataFrame(imputer.transform(X_test_cv), columns=X_test_cv.columns)

            
        regressor.fit(X_train_cv, y_train_cv)        
        y_pred = regressor.predict(X_test_cv)
        score_eval = mean_squared_error(y_test_cv, y_pred, squared=False)
        scores.append(score_eval)
        
        try:
            feature_importance = pd.Series(regressor.feature_importances_, index=X_train_cv.columns, name=f'fold{i}')
        except:
            try:
                feature_importance = pd.Series(regressor.coef_, index=X_train_cv.columns, name=f'fold{i}')
            except:
                feature_importance = pd.Series(np.zeros(X_train_cv.shape[1]), index=X_train_cv.columns, name=f'fold{i}')
        feature_importances = pd.concat([feature_importances, feature_importance], axis=1)
    
    feature_importances['mean'] = feature_importances.mean(axis=1)
    
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    
    if not SUBMIT:
        score_eval = mean_squared_error(y_test, y_pred, squared=False)
    
    print(f'{model_name}: {np.mean(scores):.4f} ± {np.std(scores):.4f}, Time: {time.time() - t0:.2f} seconds, RMSE: {score_eval:.4f}')
    print(feature_importances.sort_values('mean', ascending=False))

1it [00:00,  5.80it/s]

(5149, 9)
(5149, 9)


3it [00:00,  8.31it/s]

(5150, 9)
(5150, 9)
(5150, 9)


5it [00:00,  8.69it/s]

LGBMRegressor4: 11.7352 ± 0.2269, Time: 0.67 seconds, RMSE: 11.9373
                           fold0  fold1  fold2  fold3  fold4   mean
AgeInDays                    571    520    535    540    550  543.2
CementComponent              544    478    472    439    504  487.4
SuperplasticizerComponent    403    450    462    529    442  457.2
water_age_cement_ratio       357    386    391    395    358  377.4
BlastFurnaceSlag             294    308    304    267    236  281.8
WaterComponent               222    323    284    252    262  268.6
FineAggregateComponent       293    237    254    292    259  267.0
CoarseAggregateComponent     158    165    183    153    242  180.2
FlyAshComponent              158    133    115    133    147  137.2





In [None]:
# LGBMRegressor4
# 12.1279 ± 0.2388, Time: 0.54 seconds, RMSE: 12.2312 Benchmark
# 12.1491 ± 0.2170, Time: 0.61 seconds, RMSE: 12.2304 Benchmark + PHONG NGUYEN features
# 12.1461 ± 0.2530, Time: 0.75 seconds, RMSE: 12.2729 Benchmark + other features
# 12.1461 ± 0.2530, Time: 0.54 seconds, RMSE: 12.2729 Benchmark + water_age_cement_ratio
# 11.7352 ± 0.2269, Time: 0.56 seconds, RMSE: 11.9373 Benchmark + water_age_cement_ratio + origin data
# 11.7387 ± 0.2246, Time: 0.63 seconds, RMSE: 11.9454 Benchmark + origin data
# 11.7221 ± 0.2309, Time: 1.24 seconds, RMSE: 11.9058 Benchmark + water_age_cement_ratio + origin data + PolynomialFeatures
# 11.7941 ± 0.2258, Time: 0.73 seconds, RMSE: 11.9935 Benchmark + water_age_cement_ratio + origin data + mean of duplicated rows

