In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

sns.set(style="whitegrid", context="talk")

house_data_train = pd.read_csv('data/final.csv')

# TODO: Remove addition of _ord features from analysis.ipynb - they're added in housing pipeline
# Build `df` as a new frame without the "_ord" columns; `house_data_train` keeps every column as loaded.
ord_columns = [col for col in house_data_train.columns if "_ord" in col]
df = house_data_train.drop(columns=ord_columns)


In [2]:
df.shape

(1440, 80)

In [3]:
df.columns

Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'YearBuilt', 'YearRemodAdd',
       'GarageYrBlt', 'MoSold', 'YrSold', 'SalePrice', 'OverallQual',
       'OverallCond', 'KitchenQual', 'ExterQual', 'ExterCond', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
       'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'MSSubClass',
       'MSZoning', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd'

In [4]:
df.shape

(1440, 80)

In [5]:
# Feature matrix: every column of `df` except the target.
X = df.drop("SalePrice", axis=1)
# Target on the log1p scale; predictions are mapped back with np.expm1 before submission.
# NOTE: despite its name this holds the target for ALL rows of `df`; it is split into
# y_train / y_val by train_test_split afterwards.
y_train_log = np.log1p(df["SalePrice"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is seeded (same seed as the
# KFold splitters and model random_state values used in this notebook) so that
# Restart & Run All reproduces the same train/validation rows and scores.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_train_log, test_size=0.2, random_state=42
)

# Pipelines

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from housing_pipeline import (
	Log1pFeatureImputer,
	LotFrontageNeighborhoodImputer,
	MeaningfullNAImputer,
	BooleanFeaturesImputer,
	SFImputer,
	GarageFeaturesImputer,
	BsmtBathImputer,
	MasVnrAreaImputer,
	HousingOrdinalEncoder,
	HousingNominalOneHotEncoder,
	PolyFeaturesImputer
)


# Columns handed to MeaningfullNAImputer.
# NOTE(review): presumably columns where a missing value means "feature not present" rather than "unknown" - confirm against housing_pipeline.
impute_NA_features = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature", "MasVnrType", "Electrical"]
# Ordinal columns mapped to the ordered list of levels handed to HousingOrdinalEncoder.
ord_cat_features = {
	'OverallQual': [1,2,3,4,5,6,7,8,9,10],
	'OverallCond': [1,2,3,4,5,6,7,8,9,10],
	'KitchenQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'ExterQual': ["Po","Fa","TA","Gd","Ex"],
	'ExterCond': ["Po","Fa","TA","Gd","Ex"],
	'BsmtQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'BsmtCond': ["NA","Po","Fa","TA","Gd","Ex"],
	'BsmtExposure': ["NA","No","Mn","Av","Gd"],
	'BsmtFinType1': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
	'BsmtFinType2': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
	'HeatingQC': ["Po","Fa","TA","Gd","Ex"],
	'FireplaceQu': ["NA","Po","Fa","TA","Gd","Ex"],
	'GarageQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'GarageCond': ["NA","Po","Fa","TA","Gd","Ex"],
	'PoolQC': ["NA","Po", "Fa","TA","Gd","Ex"],
}
# Nominal (unordered) columns handed to HousingNominalOneHotEncoder.
nom_cat_features = ["MSSubClass", "MSZoning", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical", "Functional", "GarageType", "PavedDrive", "Fence", "SaleType", "SaleCondition", "Street", "GarageFinish", "MiscFeature"]

# Full preprocessing chain; the steps run in the order listed, each one receiving
# the output of the previous step.
custom_pipelines = Pipeline([
	("Log1pFeatureImputer", Log1pFeatureImputer(["LotArea", "GrLivArea"])),
	("LotFrontageNeighborhoodImputer", LotFrontageNeighborhoodImputer()),
	("MeaningfullNAImputer", MeaningfullNAImputer(impute_NA_features)),
	("BooleanFeaturesImputer", BooleanFeaturesImputer()),
	("TotalSFImputer", SFImputer()),
	("GarageFeaturesImputer", GarageFeaturesImputer()),
	("BsmtBathImputer", BsmtBathImputer()),
	("MasVnrAreaImputer", MasVnrAreaImputer()),
	# NOTE(review): FloorTotalSF, TotalSF, GarageAreaPerCar and TotalBsmtBath are presumably
	# created by the earlier custom steps, which is why this step comes after them - confirm in housing_pipeline.
	("PolyFeaturesImputer", PolyFeaturesImputer(
        features=["OverallCond", "OverallQual", "YearBuilt", "FloorTotalSF", "TotalBsmtSF", "TotalSF", "GarageAreaPerCar", "TotalBsmtBath"],
        degree=2
	)),
	("HousingOrdinalEncoder", HousingOrdinalEncoder(ord_cat_features)),
	("HousingNominalOneHotEncoder", HousingNominalOneHotEncoder(nom_cat_features)),
    ('simple_imputer', SimpleImputer(strategy="median")), # To avoid errors on test set (3 missing values for basement)
	# The tree-model grids later in the notebook replace this step with "passthrough" via `prep__scaler`.
	("scaler", RobustScaler())
])

# Checking the dataset after preprocessing pipeline

In [7]:
def null_mask(X):
	"""Return a boolean mask, shaped like ``X``, that marks its missing entries.

	pandas objects keep their type (a ``DataFrame`` / ``Series`` of booleans);
	anything else is converted with ``np.asarray`` and yields a boolean ndarray.
	"""
	if not isinstance(X, (pd.DataFrame, pd.Series)):
		values = np.asarray(X)
		# np.isnan only accepts numeric dtypes; object / mixed arrays go through pd.isna
		is_numeric = np.issubdtype(values.dtype, np.number)
		return np.isnan(values) if is_numeric else pd.isna(values)
	return X.isna()

def null_count(mask):
	"""Return the number of True entries in a boolean mask as a plain ``int``."""
	if isinstance(mask, (pd.DataFrame, pd.Series)):
		flags = mask.to_numpy()
	else:
		flags = np.asarray(mask)
	return int(flags.sum())

# Scratch copy of the feature matrix; no cell visible in this part of the notebook reads X_cur.
X_cur = X.copy()
# X_cur = X_test.copy()

def check_pipeline(X: pd.DataFrame, pipelines, is_fit=False):
	"""Run ``X`` through a pipeline one step at a time and report nulls after each step.

	Parameters
	----------
	X : pd.DataFrame
		Input features fed to the first step.
	pipelines : object exposing ``steps`` as ``(name, transformer)`` pairs
		Typically an ``sklearn.pipeline.Pipeline``.
	is_fit : bool, default False
		If True the steps are treated as already fitted and only ``transform``
		is called. If False every step is (re)fitted through ``fit_transform``,
		which refits the pipeline's own step objects.

	Returns
	-------
	None
		The report is printed: one line per step with the output shape and the
		total null count, plus up to ten of the worst columns when the step
		output is a DataFrame.
	"""
	for name, step in pipelines.steps:
		# sklearn pipelines allow a step to be disabled with None / "passthrough"
		# (the model grids in this notebook do that for the scaler). Such a step
		# has no transform method and leaves the data untouched.
		if step is None or (isinstance(step, str) and step == "passthrough"):
			print(f"{name:>20} | passthrough (skipped)")
			continue

		X = step.transform(X) if is_fit else step.fit_transform(X)

		mask = null_mask(X)
		print(f"{name:>20} | shape={np.asarray(X).shape} | nulls={null_count(mask)}")

		# Per-column detail is only available while the data is still a DataFrame
		if isinstance(X, pd.DataFrame):
			per_col = X.isna().sum()
			bad_cols = per_col[per_col > 0].sort_values(ascending=False)
			if len(bad_cols):
				print("   columns with nulls:", bad_cols.head(10).to_dict())



In [None]:
custom_pipelines.fit(X)
check_pipeline(X, custom_pipelines, is_fit=True)

In [None]:
df_test = pd.read_csv('data/test.csv')
id_col = df_test.pop('Id')
X_test = df_test.reindex(columns=X.columns)


In [None]:
check_pipeline(X_test, custom_pipelines, is_fit=True)

No null values remain after the preprocessing pipeline, so it is safe to train models.

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler

def rmse(y_true, y_pred):
	"""Root mean squared error between ``y_true`` and ``y_pred``."""
	mse = mean_squared_error(y_true, y_pred)
	return np.sqrt(mse)

# greater_is_better=False makes the scorer report -RMSE, so "higher is better" holds
# for GridSearchCV; the searches in this notebook read it back as -best_score_.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
# 5-fold splitter; shuffling is seeded so the folds are the same on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:

# Compare model families: every family gets its own grid search over the same
# preprocessing + model pipeline.
# The Ridge() below is only a placeholder; each grid swaps in its own estimator
# through the "model" key.
base = Pipeline([
	("prep", custom_pipelines),
	("model", Ridge())
])

# One param grid per family. "prep__scaler" keeps RobustScaler for the linear
# models and replaces it with "passthrough" for the tree ensembles.
models_and_grids = {
	"ridge": {
		"model": [Ridge(random_state=42, max_iter=20000)],
		"model__alpha": [1.0, 3.0, 10.0, 30.0, 100.0],
		"prep__scaler": [RobustScaler()],
	},
	"lasso": {
		"model": [Lasso(random_state=42, max_iter=20000)],
		"model__alpha": [1e-4, 3e-4, 1e-3, 3e-3, 1e-2],
		"prep__scaler": [RobustScaler()],
	},
	"elasticnet": {
		"model": [ElasticNet(random_state=42, max_iter=20000)],
		"model__alpha": [1e-4, 1e-3, 1e-2],
		"model__l1_ratio": [0.1, 0.5, 0.9],
		"prep__scaler": [RobustScaler()],
	},
	"gbr": {
		"model": [GradientBoostingRegressor(random_state=42)],
		"model__n_estimators": [1000, 2000],
		"model__learning_rate": [0.03, 0.05],
		"model__max_depth": [2, 3],
		"model__subsample": [0.7, 1.0],
		"prep__scaler": ["passthrough"],
	},
	"rf": {
		"model": [RandomForestRegressor(random_state=42, n_jobs=-1)],
		"model__n_estimators": [500, 1000],
		"model__max_depth": [None, 10, 20],
		"model__min_samples_leaf": [1, 2, 5],
		"prep__scaler": ["passthrough"],
	},

	# # Cannot be used - too many features - use GD algorithm
	# "svr": {
	# 	"model": [SVR()],
	# 	"model__C": [3, 10, 30],
	# 	"model__gamma": ["scale", 0.01, 0.03],
	# 	"model__epsilon": [0.05, 0.1],
	# 	"prep__scaler": [RobustScaler()],
	# },
}

# (name, cv_rmse, val_rmse, best_params) rows, one per model family
results = []
# Refitted best pipeline per family, keyed by family name
best_estimators = {}

for name, grid in models_and_grids.items():
	print(f"Training {name} with {grid}")
	gs = GridSearchCV(
		estimator=base,
		param_grid=grid,
		scoring=rmse_scorer,
		cv=cv,
		n_jobs=-1,
		verbose=0
	)
	gs.fit(X_train, y_train)
	# rmse_scorer reports -RMSE, so flip the sign back
	cv_rmse = -gs.best_score_

	# X_val predict
	val_pred = gs.predict(X_val)
	val_rmse = rmse(y_val, val_pred)

	results.append((name, cv_rmse, val_rmse, gs.best_params_))
	best_estimators[name] = gs.best_estimator_

# Best hold-out RMSE first
results_df = pd.DataFrame(results, columns=["model", "cv_rmse", "val_rmse", "best_params"]).sort_values("val_rmse")
results_df

In [None]:
model = best_estimators['ridge']
val_pred = model.predict(X_val)

# Predicted vs. actual target (log1p scale) on the hold-out split
ax = sns.scatterplot(x=y_val, y=val_pred)
ax.set_xlabel('y_val')
ax.set_ylabel('y_pred')

In [None]:
model = best_estimators['elasticnet']
val_pred = model.predict(X_val)

# Predicted vs. actual target (log1p scale) on the hold-out split
ax = sns.scatterplot(x=y_val, y=val_pred)
ax.set_xlabel('y_val')
ax.set_ylabel('y_pred')

Most of the predictions lie close to the diagonal, which is good. However, there are a few outliers that need further investigation.

# Param tuning

Ridge is selected for hyperparameter tuning because it appears to be the best-performing model.

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge

# 10-fold CV for the Ridge tuning run; shuffling is seeded so folds are reproducible.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

base = Pipeline([
	("prep", custom_pipelines),
	("model", Ridge(random_state=42))
])

# `model__tol` and `model__max_iter` are intentionally NOT searched: per the
# scikit-learn Ridge documentation, tol has no impact for the "svd" and
# "cholesky" solvers and max_iter only applies to the iterative solvers.
# "auto" picks a direct solver for dense input such as the output of `prep`.
# Searching them only multiplied the number of fits by 4 without changing any score.
param_grid = {
	"model__alpha": [0.1, 0.5, 1.0, 3.0, 10.0, 30.0, 100.0, 200, 500, 1000],
	"model__solver": ["svd", "cholesky", "auto"],
}

grid = GridSearchCV(
	estimator=base,
	param_grid=param_grid,
	scoring=rmse_scorer,  # reports -RMSE; read back as -grid.best_score_
	cv=cv,
	n_jobs=-1,
	refit=True,
	return_train_score=True
)

grid.fit(X_train, y_train)

In [None]:
print(f"Best cv score: {-grid.best_score_}")
print(f"Best params: {grid.best_params_}")


In [None]:
val_pred = grid.predict(X_val)
rmse(y_val, val_pred)

# Making first model for submission

In [None]:
# Ridge with hard-coded hyperparameters (they are not read from a search object here).
model = Ridge(random_state=42, alpha=10, max_iter=20000, solver='cholesky', tol=0.0001)

# `custom_pipelines` is passed as the same object used elsewhere in the notebook,
# so fitting `base` also refits that shared preprocessing pipeline.
base = Pipeline([
	("prep", custom_pipelines),
	("model", model)
])

# Fitting on entire train dataset
base.fit(X, y_train_log)

In [None]:
y_pred = base.predict(X)
rmse(y_train_log, y_pred)

# Test dataset predictions

In [None]:
from pathlib import Path

df_test = pd.read_csv('data/test.csv')
id_col = df_test.pop('Id')
# Align the test columns to the training feature order; a training column that is
# absent from the test file becomes an all-NaN column.
X_test = df_test.reindex(columns=X.columns)
# Predictions are on the log1p(SalePrice) scale
y_test = base.predict(X_test)

submission = pd.DataFrame(
    {
        "Id": id_col,
        "SalePrice": np.expm1(y_test)  # back to the original price scale
})

# to_csv does not create missing directories, so make sure the output folder
# exists (e.g. on a fresh checkout) before writing.
submission_path = Path('submissions/first_attempt.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

# Ensemble of GradientBoostingRegressor and Ridge

## Ridge

In [None]:
from sklearn.model_selection import KFold

ridge_grid = {
    "model__alpha": [0.1, 1.0, 10.0, 20.0],
    "model__fit_intercept": [True, False],
    "model__max_iter": [5000, 10000, 20000],
    "model__solver": ["auto", "svd", "cholesky", "lsqr", "sag", "saga"],
}

# The grid includes the stochastic "sag" / "saga" solvers, which shuffle the data
# using random_state. Seed the estimator so the search gives the same scores and
# best_params_ on every run, as with the other Ridge models in this notebook.
ridge_pipe = Pipeline([
    ("prep", custom_pipelines),
    ("model", Ridge(random_state=42))
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)

ridge_gs = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    refit=True,
    return_train_score=True,
)
ridge_gs.fit(X_train, y_train)

# Scores are negated RMSE; flip the sign to report plain RMSE for the best candidate.
best_idx = ridge_gs.best_index_
train_rmse = -float(ridge_gs.cv_results_["mean_train_score"][best_idx])
val_rmse   = -float(ridge_gs.cv_results_["mean_test_score"][best_idx])

In [None]:

# Summary of the Ridge grid search (mean train / validation RMSE over the CV folds).
# The header is a plain string: it has no placeholders, so it needs no f-prefix.
print("\n=== Ridge ===")
print("Best params:", ridge_gs.best_params_)
print(f"CV train RMSE: {train_rmse:.6f}")
print(f"CV valid RMSE: {val_rmse:.6f}")

## GradientBoostingRegressor 

In [None]:
grid_param = {
    "model__n_estimators": [400, 800, 1000],
    # "model__learning_rate": [0.05, 0.1, 0.5],
    "model__max_depth": [2, 3, 4],
    "model__min_samples_leaf": [10, 20],
    "model__min_samples_split": [20, 50],
    "model__min_impurity_decrease": [1e-4, 1e-3],
    "prep__scaler": ["passthrough"],
}


In [None]:
from sklearn.model_selection import RandomizedSearchCV

cv = KFold(n_splits=5, shuffle=True, random_state=42)

gbr_pipe = Pipeline([
    ("prep", custom_pipelines),
    ("model", GradientBoostingRegressor(random_state=42))
])

# Exhaustive grid search over `grid_param` (every combination is evaluated).
gbr_gs = GridSearchCV(
    gbr_pipe,
    param_grid=grid_param,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    refit=True,
    return_train_score=True
)

gbr_gs.fit(X_train, y_train)

# Scores are negated RMSE; flip the sign to report plain RMSE for the best candidate.
best_idx = gbr_gs.best_index_
train_rmse = -float(gbr_gs.cv_results_["mean_train_score"][best_idx])
val_rmse   = -float(gbr_gs.cv_results_["mean_test_score"][best_idx])

# The header names the search that is actually run above (GridSearchCV).
print("\n=== GBR GridSearch ===\n")
print("Best params:", gbr_gs.best_params_)
print(f"\nCV train RMSE: {train_rmse:.6f}")
print(f"\nCV valid RMSE: {val_rmse:.6f}")

In [None]:
gbr_gs

## Stacking regressors

In [10]:
# Ridge
# Ridge base learner (hard-coded hyperparameters)
ridge_pipe = Pipeline([
    ("prep", custom_pipelines),
    ("model", Ridge(alpha=10.0, fit_intercept=True, solver='auto'))
])
# GBR base learner (hard-coded hyperparameters). It is seeded like every other
# GradientBoostingRegressor in this notebook so the stacked model and the
# submission built from it are reproducible across runs.
gbr_pipe = Pipeline([
    ("prep", custom_pipelines),
    ("model", GradientBoostingRegressor(
        n_estimators=400,
        max_depth=3,
        min_impurity_decrease=0.0001,
        min_samples_leaf=20,
        min_samples_split=20,
        random_state=42
    ))
])


In [11]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

# Base learners as (name, estimator) pairs
estimators = [
    ('ridge', ridge_pipe),
    ('gbr', gbr_pipe)
]

# final_estimator and cv are left at their defaults; the rendered estimator
# diagram for this object shows a RidgeCV block as the final stage.
stacking_regressor = StackingRegressor(
    estimators=estimators
)

stacking_regressor

0,1,2
,estimators,"[('ridge', ...), ('gbr', ...)]"
,final_estimator,
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,steps,"[('Log1pFeatureImputer', ...), ('LotFrontageNeighborhoodImputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['LotArea', 'GrLivArea']"
,drop_original,True
,clip_negative,True
,prefix,'log1p_'

0,1,2
,features,"['Alley', 'BsmtQual', ...]"

0,1,2
,features,"['OverallCond', 'OverallQual', ...]"
,degree,2

0,1,2
,categories,"{'BsmtCond': ['NA', 'Po', ...], 'BsmtExposure': ['NA', 'No', ...], 'BsmtFinType1': ['NA', 'Unf', ...], 'BsmtFinType2': ['NA', 'Unf', ...], ...}"
,drop_base_features,True
,fill_value,'NA'

0,1,2
,features,"['MSSubClass', 'MSZoning', ...]"
,drop_base_features,True
,fill_value,'NA'
,prefix_sep,'__'
,sparse_output,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,alpha,10.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,

0,1,2
,steps,"[('Log1pFeatureImputer', ...), ('LotFrontageNeighborhoodImputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['LotArea', 'GrLivArea']"
,drop_original,True
,clip_negative,True
,prefix,'log1p_'

0,1,2
,features,"['Alley', 'BsmtQual', ...]"

0,1,2
,features,"['OverallCond', 'OverallQual', ...]"
,degree,2

0,1,2
,categories,"{'BsmtCond': ['NA', 'Po', ...], 'BsmtExposure': ['NA', 'No', ...], 'BsmtFinType1': ['NA', 'Unf', ...], 'BsmtFinType2': ['NA', 'Unf', ...], ...}"
,drop_base_features,True
,fill_value,'NA'

0,1,2
,features,"['MSSubClass', 'MSZoning', ...]"
,drop_base_features,True
,fill_value,'NA'
,prefix_sep,'__'
,sparse_output,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,400
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,20
,min_samples_leaf,20
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0001

0,1,2
,alphas,"(0.1, ...)"
,fit_intercept,True
,scoring,
,cv,
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [12]:
stacking_regressor.fit(X, y_train_log)

0,1,2
,estimators,"[('ridge', ...), ('gbr', ...)]"
,final_estimator,
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,steps,"[('Log1pFeatureImputer', ...), ('LotFrontageNeighborhoodImputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['LotArea', 'GrLivArea']"
,drop_original,True
,clip_negative,True
,prefix,'log1p_'

0,1,2
,features,"['Alley', 'BsmtQual', ...]"

0,1,2
,features,"['OverallCond', 'OverallQual', ...]"
,degree,2

0,1,2
,categories,"{'BsmtCond': ['NA', 'Po', ...], 'BsmtExposure': ['NA', 'No', ...], 'BsmtFinType1': ['NA', 'Unf', ...], 'BsmtFinType2': ['NA', 'Unf', ...], ...}"
,drop_base_features,True
,fill_value,'NA'

0,1,2
,features,"['MSSubClass', 'MSZoning', ...]"
,drop_base_features,True
,fill_value,'NA'
,prefix_sep,'__'
,sparse_output,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,alpha,10.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,

0,1,2
,steps,"[('Log1pFeatureImputer', ...), ('LotFrontageNeighborhoodImputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['LotArea', 'GrLivArea']"
,drop_original,True
,clip_negative,True
,prefix,'log1p_'

0,1,2
,features,"['Alley', 'BsmtQual', ...]"

0,1,2
,features,"['OverallCond', 'OverallQual', ...]"
,degree,2

0,1,2
,categories,"{'BsmtCond': ['NA', 'Po', ...], 'BsmtExposure': ['NA', 'No', ...], 'BsmtFinType1': ['NA', 'Unf', ...], 'BsmtFinType2': ['NA', 'Unf', ...], ...}"
,drop_base_features,True
,fill_value,'NA'

0,1,2
,features,"['MSSubClass', 'MSZoning', ...]"
,drop_base_features,True
,fill_value,'NA'
,prefix_sep,'__'
,sparse_output,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,400
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,20
,min_samples_leaf,20
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0001

0,1,2
,alphas,"(0.1, ...)"
,fit_intercept,True
,scoring,
,cv,
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [13]:
y_pred = stacking_regressor.predict(X)
rmse(y_train_log, y_pred)

np.float64(0.07556555556829088)

In [14]:
df_test = pd.read_csv('data/test.csv')
id_col = df_test.pop('Id')
X_test = df_test.reindex(columns=X.columns)


In [15]:
X_test.columns

Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'YearBuilt', 'YearRemodAdd',
       'GarageYrBlt', 'MoSold', 'YrSold', 'OverallQual', 'OverallCond',
       'KitchenQual', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
       'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'MSSubClass',
       'MSZoning', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType

In [16]:
from pathlib import Path

# Predictions are on the log1p(SalePrice) scale
y_test = stacking_regressor.predict(X_test)

submission = pd.DataFrame(
    {
        "Id": id_col,
        "SalePrice": np.expm1(y_test)  # back to the original price scale
})

# to_csv does not create missing directories, so make sure the output folder
# exists (e.g. on a fresh checkout) before writing.
submission_path = Path('submissions/second_attempt.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)


# Future steps:

- Deep error analysis
- Additional and more advanced feature engineering
- Add additional models to the ensemble
