In [None]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.compose import TransformedTargetRegressor
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch, MeanShift, SpectralClustering, AffinityPropagation, FeatureAgglomeration
from sklearn.preprocessing import QuantileTransformer

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn import config_context
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans, Birch, MiniBatchKMeans
from sklearn.mixture import GaussianMixture

# import regressors
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, PassiveAggressiveRegressor, Perceptron, RidgeClassifier, LogisticRegression
from sklearn.linear_model import Lasso, ElasticNet, Lars, BayesianRidge, ARDRegression, OrthogonalMatchingPursuit, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.linear_model import LassoLars, LassoLarsIC
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV, LarsCV, OrthogonalMatchingPursuitCV, LassoLarsCV, BayesianRidge, LinearRegression

# pandas deactivate future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
SEED = 15                 # random_state handed to splitters, samplers and estimators
TARGET = 'Strength'       # name of the target column popped from the data frames

SUBMIT = False            # False: hold out part of train.csv and score on it
USE_ORIGINAL = True       # True: append the original dataset rows to the training set
SAMPLE = 1                # fraction of training rows to keep; only values < 1 subsample

# Models whose names are listed here get the Yeo-Johnson power transform in
# data_preparation and (if TARGET_TRANSFORM) a transformed target in model_preparation.
TARGET_TRANSFORM = True
SCALING_MODELS = [
    'Ridge',
    'RidgeCV',
    'SGDRegressor',
    'PassiveAggressiveRegressor',
    'LinearRegression',
    'Lasso',
    'ElasticNet',
    'ElasticNetCV',
    'HuberRegressor',
    'BayesianRidge',
    'ARDRegression',
    'TheilSenRegressor',
    'RANSACRegressor',
    'OrthogonalMatchingPursuit',
    'Lars',
    'LassoLars',
    'LassoLarsIC',
]

# Optional post-processing steps of data_preparation
CLUSTER_FEATURES = False          # add cluster-label columns
CLUSTER_RANGE = range(2, 11)      # cluster counts iterated by cluster_features
DIMENSIONALITY_REDUCTION = False  # run PCA on the prepared features
PCA_N_COMPONENTS = 0.9            # n_components argument passed to PCA

# Load the three CSV files. Header names are stripped of surrounding whitespace
# so that the same logical column cannot appear under two spellings; otherwise
# pd.concat below would silently create extra, NaN-filled columns. The optional
# 'id' column is dropped wherever it is present.
train, test, orig = (
    pd.read_csv(path).rename(columns=str.strip).drop(columns=['id'], errors='ignore')
    for path in ('datasets/train.csv', 'datasets/test.csv', 'datasets/ConcreteStrengthData.csv')
)

# Local validation mode: replace the test set by a 20% hold-out of train.csv
# (the hold-out keeps its TARGET column so it can be scored)
if not SUBMIT:
    train, test = train_test_split(train, test_size=0.2, random_state=SEED)

# Add original data to training set (index is renumbered after stacking)
if USE_ORIGINAL:
    train = pd.concat([train, orig], axis=0, ignore_index=True)

del orig

# Sampling for faster training
if SAMPLE < 1:
    train = train.sample(frac=SAMPLE, random_state=SEED)

# Split features / target; copies keep `train` and `test` themselves untouched
X_train = train.copy()
y_train = X_train.pop(TARGET)
X_test = test.copy()

# Only the local hold-out carries labels; in SUBMIT mode there is nothing to score
y_test = X_test.pop(TARGET) if not SUBMIT else None

# Feature engineering
class FeatureEngineering(TransformerMixin, BaseEstimator):
    """Stateless transformer that appends hand-crafted feature columns.

    Only ``water_age_cement_ratio`` is currently active. The other candidates
    that were tried are kept as commented-out code inside ``transform``.
    """

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered columns added.

        ``X`` is indexed by column name, so it must provide the columns
        'WaterComponent', 'AgeInDays' and 'CementComponent'. ``y`` is ignored.
        """
        # # Sum of features
        # X['cement_slag'] = X['CementComponent'] + X['BlastFurnaceSlag']
        # X['cement_flyash'] = X['CementComponent'] + X['FlyAshComponent']
        # X['cement_water'] = X['CementComponent'] + X['WaterComponent']
        # X['cement_superplasticizer'] = X['CementComponent'] + X['SuperplasticizerComponent']
        # X['cement_coarseaggregate'] = X['CementComponent'] + X['CoarseAggregateComponent']
        # X['cement_fineaggregate'] = X['CementComponent'] + X['FineAggregateComponent']
        # X['cement_age'] = X['CementComponent'] + X['AgeInDays']
        # X['slag_flyash'] = X['BlastFurnaceSlag'] + X['FlyAshComponent']
        # X['slag_water'] = X['BlastFurnaceSlag'] + X['WaterComponent']
        # X['slag_superplasticizer'] = X['BlastFurnaceSlag'] + X['SuperplasticizerComponent']
        # X['slag_coarseaggregate'] = X['BlastFurnaceSlag'] + X['CoarseAggregateComponent']
        # X['slag_fineaggregate'] = X['BlastFurnaceSlag'] + X['FineAggregateComponent']
        # X['slag_age'] = X['BlastFurnaceSlag'] + X['AgeInDays']
        # X['flyash_water'] = X['FlyAshComponent'] + X['WaterComponent']
        # X['flyash_superplasticizer'] = X['FlyAshComponent'] + X['SuperplasticizerComponent']
        # X['flyash_coarseaggregate'] = X['FlyAshComponent'] + X['CoarseAggregateComponent']

        # # Ratio of features
        # # X['cement_slag_ratio'] = X['CementComponent'] / X['BlastFurnaceSlag']  # Inf values
        # # X['cement_flyash_ratio'] = X['CementComponent'] / X['FlyAshComponent']  # Inf values
        # X['cement_water_ratio'] = X['CementComponent'] / X['WaterComponent']
        # # X['cement_superplasticizer_ratio'] = X['CementComponent'] / X['SuperplasticizerComponent']  # Inf values
        # X['cement_coarseaggregate_ratio'] = X['CementComponent'] / X['CoarseAggregateComponent']
        # X['cement_fineaggregate_ratio'] = X['CementComponent'] / X['FineAggregateComponent']
        # X['cement_age_ratio'] = X['CementComponent'] / X['AgeInDays']
        # # X['slag_flyash_ratio'] = X['BlastFurnaceSlag'] / X['FlyAshComponent'] # Divide by zero
        # X['slag_water_ratio'] = X['BlastFurnaceSlag'] / X['WaterComponent']
        # # X['slag_superplasticizer_ratio'] = X['BlastFurnaceSlag'] / X['SuperplasticizerComponent']  # Divide by zero
        # X['slag_coarseaggregate_ratio'] = X['BlastFurnaceSlag'] / X['CoarseAggregateComponent']
        # X['slag_fineaggregate_ratio'] = X['BlastFurnaceSlag'] / X['FineAggregateComponent']
        # X['slag_age_ratio'] = X['BlastFurnaceSlag'] / X['AgeInDays']
        # X['flyash_water_ratio'] = X['FlyAshComponent'] / X['WaterComponent']
        # # X['flyash_superplasticizer_ratio'] = X['FlyAshComponent'] / X['SuperplasticizerComponent'] # Divide by zero
        # X['flyash_coarseaggregate_ratio'] = X['FlyAshComponent'] / X['CoarseAggregateComponent']

        # Other features
        # X['has_superplasticizer'] = X['SuperplasticizerComponent'].apply(lambda x: 1 if x > 0 else 0)
        # X['has_flyash'] = X['FlyAshComponent'].apply(lambda x: 1 if x > 0 else 0)
        # X['has_slag'] = X['BlastFurnaceSlag'].apply(lambda x: 1 if x > 0 else 0)

        # Taken from PHONG NGUYEN: Detailed feature description and feature engineering by ChatGPT
        # Calculate TotalComponentWeight
        # X['TotalComponentWeight'] = X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent'] + X['WaterComponent'] + X['SuperplasticizerComponent'] + X['CoarseAggregateComponent'] + X['FineAggregateComponent']
        # # Calculate Water-Cement-Ratio (WCR)
        # X['WCR'] = X['WaterComponent'] / X['CementComponent']
        # # Calculate Aggregate-Ratio (AR)
        # X['AR'] = (X['CoarseAggregateComponent'] + X['FineAggregateComponent']) / X['CementComponent']
        # # Calculate Water-Cement-Plus-Pozzolan-Ratio (WCPR)
        # X['WCPR'] = X['WaterComponent'] / (X['CementComponent'] + X['BlastFurnaceSlag'] + X['FlyAshComponent'])
        # # Calculate Cement-Age
        # X['Cement-Age'] = X['CementComponent'] * X['AgeInDays']

        # Active feature: water amount scaled by age, relative to cement amount
        water_times_age = X['WaterComponent'] * X['AgeInDays']
        engineered = {'water_age_cement_ratio': water_times_age / X['CementComponent']}

        # assign() works on a copy, so the caller's frame is left unmodified
        return X.assign(**engineered)


In [None]:
def cluster_features(X_train, X_test):
    """Append cluster-label columns from several clustering models.

    For every cluster count in ``CLUSTER_RANGE`` a KMeans, a Birch and a
    GaussianMixture model is fitted on ``X_train``; its labels for the train
    and test rows are stored in a column named ``'<model>_<n_clusters>'``.

    Every model is fitted on the features that were passed in. Previously each
    new label column was written into ``X_train`` immediately, so all later
    models also clustered on the arbitrary integer labels of the earlier ones,
    and the caller's frames were modified in place.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames with identical columns. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(X_train, X_test)`` frames with the label columns appended in
        the same order as before (by cluster count, then by model).
    """
    train_labels = {}
    test_labels = {}

    for n_clusters in CLUSTER_RANGE:
        cluster_models = {
            'KMeans': KMeans(n_clusters=n_clusters, random_state=SEED),
            'Birch': Birch(n_clusters=n_clusters),
            # 'MiniBatchKMeans': MiniBatchKMeans(n_clusters=n_clusters, random_state=SEED),
            'GaussianMixture': GaussianMixture(n_components=n_clusters, random_state=SEED),
        }
        for model_name, model in cluster_models.items():
            column = f'{model_name}_{n_clusters}'
            # X_train / X_test are never written to inside the loop, so every
            # model sees exactly the original feature set
            train_labels[column] = model.fit_predict(X_train)
            test_labels[column] = model.predict(X_test)

    # Attach all label columns at once; assign() returns new frames
    return X_train.assign(**train_labels), X_test.assign(**test_labels)

def data_preparation(X_train, X_test, model_name):
    """Fit the preprocessing pipeline on ``X_train`` and apply it to both sets.

    Pipeline steps:

    1. ``PolynomialFeatures`` of degree 2 with ``interaction_only=True`` and no
       bias column (adds pairwise interaction terms),
    2. ``FeatureEngineering``,
    3. only if ``model_name`` is listed in ``SCALING_MODELS``: a standardizing
       Yeo-Johnson ``PowerTransformer``.

    If ``DIMENSIONALITY_REDUCTION`` is set, a PCA with ``PCA_N_COMPONENTS`` is
    fitted on the transformed training data and applied to both sets. If
    ``CLUSTER_FEATURES`` is set, ``cluster_features`` is applied last.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw feature frames. ``FeatureEngineering`` looks columns up by name,
        so the transformers are run with pandas output enabled.
    model_name : str
        Key of the model the data is prepared for; compared to ``SCALING_MODELS``.

    Returns
    -------
    tuple
        The transformed ``(X_train, X_test)``.

    Notes
    -----
    On successful return the global scikit-learn ``transform_output`` option is
    set to ``"default"``, exactly as before. If fitting or transforming raises,
    the caller's previous configuration is restored instead of being left at
    ``"pandas"``.
    """
    steps = [
        PolynomialFeatures(2, include_bias=False, interaction_only=True),
        FeatureEngineering(),
    ]

    # Add power transformer to the pipeline for linear models
    if model_name in SCALING_MODELS:
        steps.append(PowerTransformer(method='yeo-johnson', standardize=True))

    clf = make_pipeline(*steps)

    # The context manager covers fitting too: FeatureEngineering needs named
    # columns from PolynomialFeatures while the pipeline is being fitted.
    with config_context(transform_output="pandas"):
        # None of the steps uses a target, so no dummy y is passed
        X_train = clf.fit_transform(X_train)
        X_test = clf.transform(X_test)
        if DIMENSIONALITY_REDUCTION:
            pca = PCA(n_components=PCA_N_COMPONENTS, random_state=SEED)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)

    # Kept for backward compatibility: callers so far always continued with the
    # default (non-pandas) output configuration after this function returned.
    set_config(transform_output="default")

    # Cluster features
    if CLUSTER_FEATURES:
        X_train, X_test = cluster_features(X_train, X_test)

    return X_train, X_test


def model_preparation(regressor, model_name):
    """Optionally wrap ``regressor`` so that it is trained on a transformed target.

    When ``TARGET_TRANSFORM`` is enabled and ``model_name`` is listed in
    ``SCALING_MODELS``, the regressor is wrapped in a ``TransformedTargetRegressor``
    that maps the target to a normal distribution with a ``QuantileTransformer``.
    In every other case the regressor is returned unchanged.
    """
    wants_target_transform = TARGET_TRANSFORM and model_name in SCALING_MODELS
    if not wants_target_transform:
        return regressor

    target_transformer = QuantileTransformer(output_distribution='normal', random_state=SEED)
    return TransformedTargetRegressor(regressor=regressor, transformer=target_transformer)

In [None]:
# Switch scikit-learn's global transform output to pandas DataFrames
set_config(transform_output="pandas")

# Shuffled, seeded 5-fold splitter shared by the evaluation loop below
cv = KFold(n_splits=5, random_state=SEED, shuffle=True)

# Set categorical features for catboost: every column whose dtype is 'category'
cat_features = [name for name, dtype in X_train.dtypes.items() if dtype == 'category']

# Candidate models, keyed by display name. The key is also the `model_name`
# passed to data_preparation / model_preparation, which compare it against
# SCALING_MODELS, so a key must match a SCALING_MODELS entry exactly for the
# power transform and the target transform to be applied.
# Commented-out entries are alternatives kept for quick re-enabling.
regressors = {
    # 'LGBMRegressor1': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt'),
    # 'LGBMRegressor2': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart'),
    # 'LGBMRegressor3': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='goss'),
    'LGBMRegressor4': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='rf', subsample=.632, subsample_freq=1),
    # 'LGBMRegressor5': LGBMRegressor(random_state=SEED, n_jobs=-1, class_weight='balanced'),
    # 'LGBMRegressor6': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'LGBMRegressor7': LGBMRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'LGBMRegressor8': LGBMRegressor(random_state=SEED, n_jobs=-1, subsample=0.7, colsample_bytree=0.7),
    # 'LGBMRegressor9': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='dart', colsample_bytree=0.7),
    # 'LGBMRegressor10': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240),
    # 'LGBMRegressor11': LGBMRegressor(random_state=SEED, n_jobs=-1, boosting_type='gbdt', num_leaves=48, max_depth=14, learning_rate=0.08, n_estimators=240, subsample=0.7, colsample_bytree=0.6),
    # 'XGBRegressor1': XGBRegressor(random_state=SEED, n_jobs=-1),
    # 'XGBRegressor2': XGBRegressor(random_state=SEED, n_jobs=-1, booster='dart'),
    # 'XGBRegressor3': XGBRegressor(random_state=SEED, n_jobs=-1, booster='gblinear'),
    # 'XGBRegressor4': XGBRegressor(random_state=SEED, n_jobs=-1, colsample_bytree=0.7),
    # 'XGBRegressor5': XGBRegressor(random_state=SEED, n_jobs=-1, subsample=0.7),
    # 'XGBRegressor6': XGBRegressor(random_state=SEED, 
    #                               n_jobs=-1, 
    #                               learning_rate=0.055, 
    #                               n_estimators=200, 
    #                               max_depth=8, 
    #                               min_child_weight=1, 
    #                               gamma=0.07, 
    #                               colsample_bytree=0.67, 
    #                               colsample_bylevel=0.67, 
    #                               colsample_bynode=0.8,
    #                               subsample=0.7, 
    #                               objective='reg:squarederror'),
    # 'XGBRegressor7': XGBRegressor(random_state=SEED, n_jobs=-1, objective='reg:squarederror'),
    # 'XGBRandomForestRegressor': XGBRFRegressor(random_state=SEED, n_jobs=-1),
    # 'CatBoostRegressor': CatBoostRegressor(random_state=SEED, silent=True),
    # 'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=SEED),
    # 'HistGradientBoostingRegressor2': HistGradientBoostingRegressor(random_state=SEED, max_iter=200, 
    #                                                                 max_depth=6, learning_rate=0.1, 
    #                                                                 l2_regularization=0.0006754828207682132, max_leaf_nodes=42, 
    #                                                                 min_samples_leaf=9, max_bins=255),
    # 'HistGradientBoostingRegressor3': HistGradientBoostingRegressor(random_state=SEED, max_iter=1000, 
    #                                                                 max_depth=10, learning_rate=0.1, 
    #                                                                 l2_regularization=0.1, max_leaf_nodes=100, 
    #                                                                 min_samples_leaf=20, max_bins=255),
    # 'RandomForestRegressor': RandomForestRegressor(random_state=SEED, n_jobs=-1),
    # 'ExtraTreesRegressor': ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    # 'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
    # 'GradientBoostingRegressor': GradientBoostingRegressor(random_state=SEED),
    # 'BaggingRegressor': BaggingRegressor(random_state=SEED, n_jobs=-1),
    # 'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    # 'DecisionTreeRegressor': DecisionTreeRegressor(random_state=SEED),
    # 'GaussianProcessRegressor': GaussianProcessRegressor(random_state=SEED),
    # 'MLPRegressor1': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='adam'),
    # 'MLPRegressor2': MLPRegressor(random_state=SEED, max_iter=1000, activation='relu', solver='lbfgs'),
    # 'MLPRegressor3': MLPRegressor(random_state=SEED, max_iter=5000, activation='tanh', solver='adam'),
    # 'MLPRegressor4': MLPRegressor(random_state=SEED, max_iter=1000, activation='tanh', solver='lbfgs'),
    # 'MLPRegressor5': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='adam'),
    # 'MLPRegressor6': MLPRegressor(random_state=SEED, max_iter=1000, activation='logistic', solver='lbfgs'),
    # 'MLPRegressor7': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='adam'),
    # 'MLPRegressor8': MLPRegressor(random_state=SEED, max_iter=1000, activation='identity', solver='lbfgs'),
    # 'Ridge': Ridge(random_state=SEED),
    # 'RidgeCV': RidgeCV(alphas=np.logspace(-3, 3, 7), cv=cv),
    # 'SGDRegressor': SGDRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # 'PassiveAggressiveRegressor': PassiveAggressiveRegressor(random_state=SEED, max_iter=1000, tol=1e-3),
    # 'LinearRegression': LinearRegression(n_jobs=-1),
    # 'Lasso': Lasso(random_state=SEED),
    # 'ElasticNet': ElasticNet(random_state=SEED, max_iter=1000000),
    # 'ElasticNetCV': ElasticNetCV(alphas=np.logspace(-3, 3, 7), cv=cv, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=1000000),
    # 'HuberRegressor': HuberRegressor(max_iter=1000),
    'BayesianRidge': BayesianRidge(),
    # 'ARDRegression': ARDRegression(),
    # 'TheilSenRegressor': TheilSenRegressor(random_state=SEED),
    # 'RANSACRegressor': RANSACRegressor(random_state=SEED),
    # 'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(normalize=False),
    # 'Lars': Lars(),
    # 'LassoLars': LassoLars(),
    # 'LassoLarsIC': LassoLarsIC(normalize=False),
    # 'VotingRegressor': VotingRegressor(
    #         estimators=[
    #             ('Ridge', Ridge(random_state=SEED)),
    #             ('LGBMRegressor4', LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1)),
    #             ('XGBRegressor1', XGBRegressor(random_state=SEED, n_jobs=1)),
    #             ], 
    #         n_jobs=-1,
    #         verbose=0,
    #         ),
    # 'BaggingRegressor2': BaggingRegressor(base_estimator=LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1),
    #                                       random_state=SEED, 
    #                                       n_jobs=-1,
    #                                       max_samples=0.632,
    #                                       max_features=0.632,
    #                                       bootstrap=True,
    #                                       ),
    # 'StackingRegressor': StackingRegressor(
    #         estimators=[
    #             ('Ridge', Ridge(random_state=SEED)),
    #             ('LGBMRegressor4', LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1)),
    #             ('XGBRegressor1', XGBRegressor(random_state=SEED, n_jobs=1)),
    #             ], 
    #         final_estimator=Ridge(random_state=SEED),
    #         cv=cv,
    #         n_jobs=-1,
    #         verbose=0,
    #         )
}

# Evaluate every configured model: 5-fold CV on the training data (preprocessing
# is re-fitted inside each fold), then one fit on the full training data that is
# scored on the hold-out set when labels are available.
# for PCA_N_COMPONENTS in [None, 0.2, 0.5, 0.9, 0.95, 0.99]:
for model_name, regressor in regressors.items():
    t0 = time.time()
    scores = []
    fold_importances = []  # one Series per fold, combined once after the loop

    # tqdm wraps the split iterator (not enumerate) and is told the total,
    # so the progress bar can show how many folds remain
    folds = tqdm(cv.split(X_train), total=cv.get_n_splits(), desc=model_name)
    for i, (train_index, test_index) in enumerate(folds):

        X_train_cv, X_test_cv = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
        y_train_cv, y_test_cv = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()

        X_train_cv, X_test_cv = data_preparation(X_train_cv, X_test_cv, model_name)
        regressor_ = model_preparation(regressor, model_name)

        regressor_.fit(X_train_cv, y_train_cv)
        y_pred = regressor_.predict(X_test_cv)

        # RMSE is computed explicitly: the `squared=False` shortcut is
        # deprecated and absent from newer scikit-learn releases
        scores.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))

        # Look at the inner estimator when the target was transformed
        if isinstance(regressor_, TransformedTargetRegressor):
            regressor_ = regressor_.regressor_

        # Best-effort importances: tree importances first, then linear
        # coefficients, otherwise zeros. getattr() only absorbs a missing
        # attribute; any other error is no longer silently swallowed.
        n_features = X_train_cv.shape[1]
        importance = np.zeros(n_features)
        for attr in ('feature_importances_', 'coef_'):
            values = getattr(regressor_, attr, None)
            if values is not None and np.size(values) == n_features:
                importance = np.ravel(values)
                break
        fold_importances.append(pd.Series(importance, index=X_train_cv.columns, name=f'fold{i}'))

    feature_importances = pd.concat(fold_importances, axis=1)
    feature_importances['mean'] = feature_importances.mean(axis=1)

    # Refit on the full training data and predict the test set
    X_train_, X_test_ = data_preparation(X_train, X_test, model_name)
    regressor_ = model_preparation(regressor, model_name)
    regressor_.fit(X_train_, y_train)
    y_pred = regressor_.predict(X_test_)

    # Hold-out RMSE exists only when the test set has labels. In SUBMIT mode
    # report 'n/a' instead of re-printing the score of the last CV fold.
    if not SUBMIT:
        score_eval = np.sqrt(mean_squared_error(y_test, y_pred))
        holdout_text = f'{score_eval:.4f}'
    else:
        score_eval = None
        holdout_text = 'n/a'

    print(f'{model_name}: {np.mean(scores):.4f} ± {np.std(scores):.4f}, Time: {time.time() - t0:.2f} seconds, RMSE: {holdout_text}')
    # print(feature_importances.sort_values('mean', ascending=False))
set_config(transform_output="default")  # reset to default

In [None]:
# LGBMRegressor4
# 12.1279 ± 0.2388, Time: 0.54 seconds, RMSE: 12.2312 Benchmark
# 12.1491 ± 0.2170, Time: 0.61 seconds, RMSE: 12.2304 Benchmark + PHONG NGUYEN features
# 12.1461 ± 0.2530, Time: 0.75 seconds, RMSE: 12.2729 Benchmark + other features
# 12.1461 ± 0.2530, Time: 0.54 seconds, RMSE: 12.2729 Benchmark + water_age_cement_ratio
# 11.7352 ± 0.2269, Time: 0.56 seconds, RMSE: 11.9373 Benchmark + water_age_cement_ratio + origin data
# 11.7387 ± 0.2246, Time: 0.63 seconds, RMSE: 11.9454 Benchmark + origin data
# 11.7221 ± 0.2309, Time: 1.24 seconds, RMSE: 11.9058 Benchmark + water_age_cement_ratio + origin data + PolynomialFeatures
# 11.7941 ± 0.2258, Time: 0.73 seconds, RMSE: 11.9935 Benchmark + water_age_cement_ratio + origin data + mean of duplicated rows
# 11.7221 ± 0.2309, Time: 1.66 seconds, RMSE: 11.9058 Benchmark + water_age_cement_ratio + origin data + PolynomialFeatures + water_age_cement_ratio
# 11.6205 ± 0.2677, Time: 3.17 seconds, RMSE: 12.3050 Benchmark + water_age_cement_ratio + origin data + PolynomialFeatures + water_age_cement_ratio + drop ratios with inf or div-by-zero errors ← BEST

In [None]:
# %% Drop one feature at a time and evaluate
# from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score, StratifiedKFold
from lightgbm import LGBMRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

model = LGBMRegressor(random_state=SEED, n_jobs=1, boosting_type='rf', subsample=.632, subsample_freq=1)
X_train_prep, X_test_prep = data_preparation(X_train, X_test, 'LGBMRegressor4')

# drop one feature at a time and evaluate
cols = X_train.columns
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Sequential *backward* selection (forward=False): start from all prepared
# features and remove one per step. k_features='best' makes the selector keep
# the subset with the best cross-validated score; with the default
# (k_features=1) the reported "best combination" was always a single feature.
sfs = SFS(model, 
          k_features='best',
          forward=False, 
          floating=False, 
          scoring='neg_root_mean_squared_error',
          cv=cv,
          verbose=2,
          n_jobs=-1,
          )

sfs = sfs.fit(X_train_prep, y_train)

# One plot of the CV score against the number of retained features
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Backward Selection (w. StdErr)')
plt.grid()
plt.show()

# The score is the negative RMSE (higher is better), not an accuracy
print('best combination (neg. RMSE: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
print('all subsets:\n', sfs.subsets_)

In [None]:
# The selector was fitted on the prepared features (X_train_prep), so it must be
# applied to the prepared frames; the raw frames have a different column set.
X_train_sfs = sfs.transform(X_train_prep)
X_test_sfs = sfs.transform(X_test_prep)

model.fit(X_train_sfs, y_train)
y_pred = model.predict(X_test_sfs)

# Hold-out RMSE of the model restricted to the selected features.
# y_test is None in SUBMIT mode, in which case there is nothing to score.
np.sqrt(mean_squared_error(y_test, y_pred)) if y_test is not None else None

In [None]:
# Make cluster
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS, Birch, MeanShift, SpectralClustering, AffinityPropagation
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

# Evaluation metrics for clustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


In [None]:
# Sweep cluster counts and clustering models on the power-transformed features
# and record, per model/count, the labels and three internal quality metrics.
X_train_prep, X_test_prep = data_preparation(X_train, X_test, 'LinearRegression')

y_train_preds = {}  # '<model>_<n_clusters>' -> cluster labels of the training rows
y_test_preds = {}   # '<model>_<n_clusters>' -> cluster labels of the test rows
scores = {}         # '<model>_<n_clusters>' -> [silhouette, Calinski-Harabasz, Davies-Bouldin]

for n_clusters in range(3,21):
    cluster_models = {
        'KMeans': KMeans(n_clusters=n_clusters, random_state=SEED),
        'Birch': Birch(n_clusters=n_clusters),
        # 'MeanShift': MeanShift(n_jobs=-1),
        # 'AffinityPropagation': AffinityPropagation(max_iter=1000, random_state=SEED),
        'GaussianMixture': GaussianMixture(n_components=n_clusters, random_state=SEED),
    }

    # Loop variables use cluster-specific names so that the fitted regressor
    # `model` and the hold-out predictions `y_pred` from the earlier cells are
    # not overwritten by this sweep.
    for cluster_name, cluster_model in cluster_models.items():
        name = f'{cluster_name}_{n_clusters}'
        cluster_model.fit(X_train_prep)
        train_labels = cluster_model.predict(X_train_prep)
        y_train_preds[name] = train_labels
        y_test_preds[name] = cluster_model.predict(X_test_prep)

        # The quality metrics need at least two distinct labels
        if train_labels.max() != train_labels.min():
            scores[name] = [silhouette_score(X_train_prep, train_labels, random_state=SEED), calinski_harabasz_score(X_train_prep, train_labels), davies_bouldin_score(X_train_prep, train_labels)]
            print(f'{cluster_name} with {n_clusters} clusters: {scores[name]}')
        else:
            print(f'{name}: All data in one cluster')

In [None]:
# One column per '<model>_<n_clusters>' entry, one row per training sample
pd.DataFrame.from_dict(y_train_preds)