In [44]:
import numpy as np
import pandas as pd

from scipy.stats import norm, skew, kurtosis

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.graphics.gofplots as sm

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import BayesianRidge, LinearRegression, Lars
from sklearn.ensemble import StackingRegressor, BaggingRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from catboost import CatBoostRegressor

import pycaret.regression as pr

import optuna

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# 1) Loading Data

In [45]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [46]:
df.drop(['Id'], axis=1, inplace=True)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [47]:
df1 = df.copy()

# 2) Missing Values

In [48]:
def missing_vals(df):
    
    missing = df.isna().sum()[df.isna().sum() > 0].sort_values(ascending=False).values
    
    percentage = (df.isna().mean()*100)[df.isna().mean()*100 > 0].sort_values(ascending=False).values
    
    names = df.isna().sum()[df.isna().sum() > 0].sort_values(ascending=False).index
    
    dtypes = df[names].dtypes.values
    
    data = np.array([dtypes, missing, percentage]).T
    
    return pd.DataFrame(data=data, index=names, columns=['Dtypes', '#Missing Values', '%Missing Values'])

In [49]:
missing_vals(df1)

Unnamed: 0,Dtypes,#Missing Values,%Missing Values
PoolQC,object,1453,99.520548
MiscFeature,object,1406,96.30137
Alley,object,1369,93.767123
Fence,object,1179,80.753425
FireplaceQu,object,690,47.260274
LotFrontage,float64,259,17.739726
GarageType,object,81,5.547945
GarageYrBlt,float64,81,5.547945
GarageFinish,object,81,5.547945
GarageQual,object,81,5.547945


In [50]:
"""plt.figure(figsize=(20, 7))
sns.heatmap(df1.isna(), cbar=False)
plt.show()"""

'plt.figure(figsize=(20, 7))\nsns.heatmap(df1.isna(), cbar=False)\nplt.show()'

In [51]:
fill_zero = ['MasVnrArea', 'GarageArea', 'GarageYrBlt']
df1[fill_zero] = SimpleImputer(strategy='constant', fill_value=0).fit_transform(df1[fill_zero])

change_cat = ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold', 'OverallQual', 'OverallCond']
df1[change_cat] = df1[change_cat].astype(object)

fill_none = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']
df1[fill_none] = SimpleImputer(strategy='constant', fill_value='None').fit_transform(df1[fill_none])

delete_rows = ['Electrical']
df1.dropna(axis=0, subset=delete_rows, inplace=True)

fill_num = ['LotFrontage']
knn_imputer = KNNImputer(n_neighbors=5)
df1[fill_num] = knn_imputer.fit_transform(df1[fill_num])

In [52]:
missing_vals(df1)

Unnamed: 0,Dtypes,#Missing Values,%Missing Values


In [53]:
df2 = df1.copy()

# 3) Feature Engineering

In [54]:
# Square per Room
df2["SqFtPerRoom"] = df2["GrLivArea"] / (df2["TotRmsAbvGrd"] + df2["FullBath"] + df2["HalfBath"] + df2["KitchenAbvGr"])

# Total Home Quality
df2['Total_Home_Quality'] = df2['OverallQual'] + df2['OverallCond']

# Total Bathrooms
df2['Total_Bathrooms'] = (df2['FullBath'] + (0.5*df2['HalfBath']) + df2['BsmtFullBath'] + (0.5*df2['BsmtHalfBath']))

# HighQualSF
df2["HighQualSF"] = df2["1stFlrSF"] + df2["2ndFlrSF"]

In [55]:
df3 = df2.copy()

# 4) Target Transformation

In [56]:
def skew_kurtosis(df):
    
    numeric_features = df.dtypes[df.dtypes != 'object'].index
    
    skewness_vals = df[numeric_features].apply(axis=0, func=lambda x: skew(x)).values
    
    kurtosis_vals = df[numeric_features].apply(axis=0, func=lambda x: kurtosis(x)).values
    
    data = np.array([skewness_vals, kurtosis_vals]).T
    
    return pd.DataFrame(data=data, index=numeric_features, columns=['Skewness', 'Kurtosis'])

In [57]:
skew_kurtosis(df3[['SalePrice']])

Unnamed: 0,Skewness,Kurtosis
SalePrice,1.880008,6.502799


In [58]:
"""fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20, 7))

sns.histplot(df3['SalePrice'], stat='density', color='orange', ax=ax1)
mu, std = norm.fit(df3['SalePrice'])
xx = np.linspace(*ax1.get_xlim(),100)
ax1.set_title('Sales Price Distribution')
sns.lineplot(x=xx, y=norm.pdf(xx, mu, std), ax=ax1)

sm.ProbPlot(df3['SalePrice']).qqplot(line='s', ax=ax2)
ax1.set_title('Normal Probability Plot of Sales Price')

plt.show()"""

"fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20, 7))\n\nsns.histplot(df3['SalePrice'], stat='density', color='orange', ax=ax1)\nmu, std = norm.fit(df3['SalePrice'])\nxx = np.linspace(*ax1.get_xlim(),100)\nax1.set_title('Sales Price Distribution')\nsns.lineplot(x=xx, y=norm.pdf(xx, mu, std), ax=ax1)\n\nsm.ProbPlot(df3['SalePrice']).qqplot(line='s', ax=ax2)\nax1.set_title('Normal Probability Plot of Sales Price')\n\nplt.show()"

In [59]:
target_transformer = PowerTransformer(method='yeo-johnson', standardize=False)

df3['Transformed_SalePrice'] = target_transformer.fit_transform(df3[['SalePrice']]).T[0]

"""fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20, 7))

sns.histplot(df3['Transformed_SalePrice'], stat='density', color='orange', ax=ax1)
mu, std = norm.fit(df3['Transformed_SalePrice'])
xx = np.linspace(*ax1.get_xlim(),100)
ax1.set_title('Transformed Sales Price Distribution')
sns.lineplot(x=xx, y=norm.pdf(xx, mu, std), ax=ax1)

sm.ProbPlot(df3['Transformed_SalePrice']).qqplot(line='s', ax=ax2)
ax1.set_title('Normal Probability Plot of Transformed Sales Price')

plt.show()"""

"fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20, 7))\n\nsns.histplot(df3['Transformed_SalePrice'], stat='density', color='orange', ax=ax1)\nmu, std = norm.fit(df3['Transformed_SalePrice'])\nxx = np.linspace(*ax1.get_xlim(),100)\nax1.set_title('Transformed Sales Price Distribution')\nsns.lineplot(x=xx, y=norm.pdf(xx, mu, std), ax=ax1)\n\nsm.ProbPlot(df3['Transformed_SalePrice']).qqplot(line='s', ax=ax2)\nax1.set_title('Normal Probability Plot of Transformed Sales Price')\n\nplt.show()"

In [60]:
df3.drop(['SalePrice'], axis=1, inplace=True)
df4 = df3.copy()

# 5) Features Transformation

In [61]:
skew_kurtosis(df4.drop(['Transformed_SalePrice'], axis=1))

Unnamed: 0,Skewness,Kurtosis
LotFrontage,2.38206,21.754015
LotArea,12.190881,202.40212
MasVnrArea,2.673798,10.09523
BsmtFinSF1,1.683465,11.079615
BsmtFinSF2,4.249219,20.023898
BsmtUnfSF,0.918367,0.466639
TotalBsmtSF,1.52519,13.232154
1stFlrSF,1.375089,5.724629
2ndFlrSF,0.813466,-0.554484
LowQualFinSF,8.998885,82.885802


In [62]:
skewed_values = skew_kurtosis(df4.drop(['Transformed_SalePrice'], axis=1))

threshold = (np.abs(skewed_values['Skewness']) < 2) | (np.abs(skewed_values['Kurtosis']) < 7)

skewed_values[threshold]

Unnamed: 0,Skewness,Kurtosis
BsmtFinSF1,1.683465,11.079615
BsmtUnfSF,0.918367,0.466639
TotalBsmtSF,1.52519,13.232154
1stFlrSF,1.375089,5.724629
2ndFlrSF,0.813466,-0.554484
GrLivArea,1.364297,4.868582
BsmtFullBath,0.594354,-0.84147
FullBath,0.037821,-0.85804
HalfBath,0.677275,-1.073973
BedroomAbvGr,0.211839,2.215847


In [63]:
skewed_features = skewed_values[threshold].index
skewed_features

parameter_transformer = PowerTransformer(method='yeo-johnson', standardize=False)

df4[skewed_features] = parameter_transformer.fit_transform(df4[skewed_features])

In [64]:
df5 = df4.copy()

# 6) Encoding

## I) Ordinal Encoding

In [65]:
ordinal_features = ['MSSubClass', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Functional', 'Fence', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'YrSold', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'Total_Home_Quality']
ordinal_encoder = OrdinalEncoder()
df5[ordinal_features] = ordinal_encoder.fit_transform(df5[ordinal_features])

In [66]:
standardize_features = df5.dtypes[df5.dtypes != 'object'].index
standardize_features = standardize_features[:-1]

In [67]:
df6 = df5.copy()

## II) OHE

In [68]:
ohe_features = df6[df6.dtypes[df6.dtypes == 'object'].index].columns
ohe_encoder = OneHotEncoder(sparse=False, drop=None)
ohe_encoded = ohe_encoder.fit_transform(df6[ohe_features])

In [69]:
ohe_categories = []
counter = 0

for i in ohe_encoder.categories_:
    for j in i:
        counter += 1
        ohe_categories.append(j + str(counter)) 

df6.drop(ohe_features, axis=1, inplace=True)
other_features = df6.columns.values

In [70]:
concatenated_data = np.concatenate((df6.values, ohe_encoded), axis=1)

transformed_data = pd.DataFrame(data=concatenated_data, columns=[*other_features, *ohe_categories])

In [71]:
df7 = transformed_data.copy()

# 7) Scaling

In [72]:
standard_scaler = StandardScaler()

df7[standardize_features] = standard_scaler.fit_transform(df7[standardize_features])

In [73]:
df8 =df7.copy()

# 10) Splitting Data

In [74]:
X = df8.drop(['Transformed_SalePrice'], axis=1)
y = df8['Transformed_SalePrice']

X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8, random_state=12345)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=12345)

# 11) Model Selection

In [75]:
_ = pr.setup(data=df8, target='Transformed_SalePrice', session_id=12345)

Unnamed: 0,Description,Value
0,session_id,12345
1,Target,Transformed_SalePrice
2,Original Data,"(1459, 227)"
3,Missing Values,False
4,Numeric Features,63
5,Categorical Features,163
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1021, 224)"


## I) Model Selection

In [76]:
top3 = pr.compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0314,0.0024,0.0483,0.9019,0.0055,0.004,2.618
gbr,Gradient Boosting Regressor,0.0356,0.0029,0.0532,0.8813,0.006,0.0046,0.122
lightgbm,Light Gradient Boosting Machine,0.0358,0.0029,0.0533,0.8811,0.0061,0.0046,0.045
xgboost,Extreme Gradient Boosting,0.038,0.0031,0.0548,0.8756,0.0062,0.0049,0.278
et,Extra Trees Regressor,0.0378,0.0031,0.0552,0.8743,0.0063,0.0048,0.309
br,Bayesian Ridge,0.0347,0.0032,0.0534,0.8703,0.006,0.0044,0.016
rf,Random Forest Regressor,0.0382,0.0033,0.0562,0.8684,0.0064,0.0049,0.332
omp,Orthogonal Matching Pursuit,0.0381,0.0033,0.0557,0.8606,0.0063,0.0049,0.255
ridge,Ridge Regression,0.0359,0.0034,0.0554,0.8602,0.0063,0.0046,0.008
huber,Huber Regressor,0.0386,0.0036,0.0587,0.8533,0.0067,0.0049,0.103


In [77]:
top3

[<catboost.core.CatBoostRegressor at 0x1f420404c10>,
 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=12345, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0, warm_start=False),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_job

## II) Finding 'Optimal' Hyper Parameters for Future HyperParameter Optimization

### i) Catboost Regressor

In [78]:
catboost = pr.create_model('catboost')
tuned_catboost = pr.tune_model(catboost, search_library='optuna')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0372,0.0029,0.0539,0.8777,0.0061,0.0048
1,0.033,0.0022,0.0473,0.8778,0.0054,0.0042
2,0.027,0.0014,0.0373,0.9448,0.0042,0.0034
3,0.0369,0.0031,0.0555,0.8783,0.0064,0.0047
4,0.042,0.0037,0.0609,0.8194,0.0069,0.0054
5,0.0398,0.0047,0.0682,0.8026,0.0077,0.0051
6,0.0391,0.0033,0.0575,0.8924,0.0064,0.005
7,0.0407,0.0045,0.0674,0.8655,0.0078,0.0053
8,0.0342,0.0023,0.0476,0.9097,0.0054,0.0044
9,0.0338,0.0024,0.0489,0.9076,0.0056,0.0043


In [79]:
tuned_catboost.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 184,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 1,
 'random_strength': 0.3686745166778565,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 12345,
 'depth': 9,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.029326684772968292,

### ii) Gradient Boosting Regressor

In [80]:
gbr = pr.create_model('gbr')
tuned_gbr = pr.tune_model(gbr, search_library='optuna')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0334,0.002,0.0449,0.9153,0.0051,0.0043
1,0.03,0.0022,0.0471,0.8787,0.0053,0.0038
2,0.0303,0.0014,0.0377,0.9434,0.0043,0.0039
3,0.0328,0.0027,0.0515,0.8952,0.0059,0.0042
4,0.0439,0.0042,0.065,0.7949,0.0074,0.0056
5,0.0391,0.0055,0.0743,0.7656,0.0083,0.005
6,0.0354,0.0026,0.0512,0.9146,0.0058,0.0045
7,0.0376,0.0036,0.06,0.8934,0.007,0.0048
8,0.0295,0.0014,0.0379,0.9428,0.0043,0.0038
9,0.0312,0.0018,0.0422,0.9311,0.0048,0.004


In [81]:
"""tuned_gbr"""

'tuned_gbr'

### iii) Light Gradient Boosting Machine Regressor

In [82]:
lightgbm = pr.create_model('lightgbm')
tuned_lightgbm = pr.tune_model(lightgbm, search_library='optuna')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0494,0.0045,0.0671,0.8107,0.0076,0.0063
1,0.0427,0.0038,0.0619,0.7909,0.007,0.0055
2,0.0414,0.0031,0.0556,0.877,0.0063,0.0053
3,0.0472,0.0052,0.0722,0.7942,0.0082,0.006
4,0.0506,0.0047,0.0688,0.7696,0.0078,0.0065
5,0.0554,0.006,0.0778,0.7435,0.0088,0.0071
6,0.0508,0.0056,0.0751,0.8163,0.0084,0.0064
7,0.0541,0.0079,0.0889,0.7664,0.0103,0.007
8,0.0417,0.0038,0.0613,0.8506,0.0069,0.0053
9,0.0456,0.0047,0.0688,0.8171,0.0078,0.0058


In [83]:
tuned_lightgbm

LGBMRegressor(bagging_fraction=0.5739315405737122, bagging_freq=4,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.8113993142957511, importance_type='split',
              learning_rate=0.058407053453446586, max_depth=-1,
              min_child_samples=17, min_child_weight=0.001,
              min_split_gain=0.24533996795787216, n_estimators=235, n_jobs=-1,
              num_leaves=99, objective=None, random_state=12345,
              reg_alpha=1.6379043949015576e-05,
              reg_lambda=2.8109267438304045e-10, silent='warn', subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

# 12) HyperParameter Optimization

In [84]:
kf = KFold(n_splits=10, shuffle=True, random_state=12345)

scores = {'Regressor':[], 'RMSE':[]}

## I) Catboost Regressor

In [121]:
def catboost_optimizer(trial):
    
    iterations = trial.suggest_int('iterations', 110, 190)
    learning_rate = trial.suggest_float('learning_rate', 1e-1, 1)
    depth = trial.suggest_int('depth', 4, 10)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-3, 1e1)
    '''eval_metric = trial.suggest_categorical('eval_metric', ['RMSE', 'MAE'])
    loss_function = trial.suggest_categorical('loss_function', ['RMSE', 'MAE'])'''
    
    parameters = {'learning_rate':learning_rate, 'depth':depth, 'l2_leaf_reg':l2_leaf_reg, 'random_seed':12345, 'loss_function':'RMSE', 'use_best_model':True, 'eval_metric':'RMSE', 'od_type':'Iter', 'od_wait':20, 'task_type':'GPU', 'iterations':iterations}
    
    model = CatBoostRegressor(**parameters)
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    predictions = model.predict(X_test)
    
    """cv_scores = target_transformer.inverse_transform(np.sqrt(-cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)).reshape(-1, 1))
    cv_score = np.mean(cv_scores)"""
    
    test_score = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_test, predictions))]])[0][0]
    
    return test_score

In [122]:
study = optuna.create_study(direction='minimize')
study.optimize(catboost_optimizer, n_trials=1000)

0:	learn: 0.0865090	test: 0.1050727	best: 0.1050727 (0)	total: 7.56ms	remaining: 1.29s
1:	learn: 0.0664115	test: 0.0899095	best: 0.0899095 (1)	total: 13.4ms	remaining: 1.14s
2:	learn: 0.0583938	test: 0.0826103	best: 0.0826103 (2)	total: 19.6ms	remaining: 1.1s
3:	learn: 0.0536245	test: 0.0791149	best: 0.0791149 (3)	total: 26.1ms	remaining: 1.09s
4:	learn: 0.0493953	test: 0.0771258	best: 0.0771258 (4)	total: 32.1ms	remaining: 1.07s
5:	learn: 0.0458333	test: 0.0768377	best: 0.0768377 (5)	total: 38.4ms	remaining: 1.06s
6:	learn: 0.0433172	test: 0.0752884	best: 0.0752884 (6)	total: 44.5ms	remaining: 1.05s
7:	learn: 0.0413583	test: 0.0760924	best: 0.0752884 (6)	total: 50.6ms	remaining: 1.04s
8:	learn: 0.0395042	test: 0.0747666	best: 0.0747666 (8)	total: 56.9ms	remaining: 1.03s
9:	learn: 0.0375283	test: 0.0742586	best: 0.0742586 (9)	total: 63.3ms	remaining: 1.02s
10:	learn: 0.0354336	test: 0.0743023	best: 0.0742586 (9)	total: 69.8ms	remaining: 1.02s
11:	learn: 0.0330406	test: 0.0729217	best: 

In [126]:
catboost_params = study.best_params
catboost_params['random_seed'] = 12345
catboost_params['loss_function'] = 'RMSE'
catboost_params['eval_metric'] = 'RMSE'
catboost_params['task_type'] = 'GPU'
catboost_params['use_best_model'] = True
catboost_params['od_type'] = 'Iter'
catboost_params['od_wait'] = 20

pd.DataFrame(data=catboost_params.values(), index=catboost_params.keys(), columns=['Value']) 

Unnamed: 0,Value
iterations,117
learning_rate,0.376567
depth,4
l2_leaf_reg,0.870004
random_seed,12345
loss_function,RMSE
eval_metric,RMSE
task_type,GPU
use_best_model,True
od_type,Iter


In [127]:
scores['Regressor'].append('CatBoostRegressor')
scores['RMSE'].append(study.best_value)
pd.DataFrame(data=scores)

Unnamed: 0,Regressor,RMSE
0,CatBoostRegressor,0.052995
1,CatBoostRegressor,0.052969
2,CatBoostRegressor,0.052969
3,CatBoostRegressor,0.051307
4,CatBoostRegressor,0.051617
5,CatBoostRegressor,0.049897
6,CatBoostRegressor,0.049897
7,CatBoostRegressor,0.05094
8,CatBoostRegressor,0.05094


In [125]:
optuna.visualization.plot_optimization_history(study)

## II) Gradient Boosting Regressor

In [None]:
def gbr_optimizer(trial):

    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.2)
    n_estimators = trial.suggest_int('n_estimators', 102, 106)
    subsample = trial.suggest_float('subsample', 0.2, 0.4)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    
    parameters = {'loss':'ls', 'learning_rate':learning_rate, 'n_estimators':n_estimators, 'subsample':subsample, 'criterion':'friedman_mse', 'max_depth':max_depth, 'max_features':'auto', 'random_state':12345}
    
    model = GradientBoostingRegressor(**parameters)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    """cv_scores = target_transformer.inverse_transform(np.sqrt(-cross_val_score(model, X_test, y_test, scoring='neg_mean_squared_error', cv=kf)).reshape(-1, 1))
    cv_score = np.mean(cv_scores)"""
    
    test_score = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_test, predictions))]])[0][0]
    
    return test_score

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(gbr_optimizer, n_trials=200)

In [None]:
gbr_params = study.best_params
gbr_params['random_state'] = 12345
gbr_params['loss'] = 'ls'
gbr_params['max_features'] = 'auto'
gbr_params['criterion'] = 'friedman_mse'
pd.DataFrame(data=gbr_params.values(), index=gbr_params.keys(), columns=['Value'])

In [None]:
gbr_params

In [None]:
scores['Regressor'].append('GradientBosstingRegressor')
scores['RMSE'].append(study.best_value)
pd.DataFrame(data=scores)

In [None]:
optuna.visualization.plot_optimization_history(study)

## III) Light Gradient Boosting Machine Regressor

In [None]:
def lar_optimizer(trial):

    fit_intercept = trial.suggest_categorical('fit_intercept', [False, True])
    
    parameters = {'fit_intercept':fit_intercept, 'normalize':False, 'n_nonzero_coefs':np.inf}
    
    model = Lars(**parameters)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    """cv_scores = target_transformer.inverse_transform(np.sqrt(-cross_val_score(model, X_test, y_test, scoring='neg_mean_squared_error', cv=kf)).reshape(-1, 1))
    cv_score = np.mean(cv_scores)"""
    
    test_score = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_test, predictions))]])[0][0]
    
    return test_score

In [None]:
dummy_params = {'fit_intercept':[False, True]}
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.GridSampler(dummy_params))
study.optimize(lar_optimizer, n_trials=2)

In [None]:
lar_params = study.best_params
lar_params['normalize'] = False
lar_params['n_nonzero_coefs'] = np.inf
pd.DataFrame(data=lar_params.values(), index=lar_params.keys(), columns=['Value'])

In [None]:
scores['Regressor'].append('LeastAngleRegression')
scores['RMSE'].append(study.best_value)
pd.DataFrame(data=scores)

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
pd.DataFrame(scores)[pd.DataFrame(scores)['RMSE'] == np.min(pd.DataFrame(scores)['RMSE'])]

In [None]:
pd.DataFrame(data=scores)

# Feature Selection

# 13) Model Training & Results

In [None]:
def train_results(model, X_train, y_train, X_test, y_test, X_valid, y_valid):
    
    model = model.fit(X_train, y_train)
    
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    valid_pred = model.predict(X_valid)
    
    train_RMSE = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_train, train_pred))]])[0][0]
    train_R2 = r2_score(y_train, train_pred)
    
    test_RMSE = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_test, test_pred))]])[0][0]
    test_R2 = r2_score(y_test, test_pred)
    
    valid_RMSE = target_transformer.inverse_transform([[np.sqrt(mean_squared_error(y_valid, valid_pred))]])[0][0]
    valid_R2 = r2_score(y_valid, valid_pred)
    
    scores = {'Data':['Train', 'Test', 'Validation'], 'RMSE':[train_RMSE, test_RMSE, valid_RMSE], 'R2':[train_R2, test_R2, valid_R2]}
    
    return pd.DataFrame(data=scores)

## I) Catboost Regressor

In [None]:
train_results(CatBoostRegressor(**catboost_params), X_train, y_train, X_test, y_test, X_valid, y_valid)

## II) Linear Regression

In [None]:
train_results(LinearRegression(**lr_params), X_train, y_train, X_test, y_test, X_valid, y_valid)

## III) Least Angle Regression

In [None]:
train_results(Lars(**lar_params), X_train, y_train, X_test, y_test, X_valid, y_valid)