In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.style as style
style.use('fivethirtyeight')
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import warnings
import plotly.express as px
%matplotlib inline
warnings.filterwarnings('ignore')
import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


# 1-Introduction

## 1.1 Description

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad.<br>
But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.<br>
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

## 1.2 Goal

Estimate the selling price for each Id in the test set based on the house characteristics.

## 1.3 Metric

Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.<br>(Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

## 1.4 About Data

<b><a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=data_description.txt">House Prices - Advanced Regression Techniques</a></b>

# 2-Loading Data 

In [None]:
# Read the competition's test and train CSV files from the Kaggle input directory.
test=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
train=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
test.shape,train.shape

In [None]:
train.head()

In [None]:
# 'Id' is not used as a feature. Keep the test ids for the submission file,
# then remove the column from both frames.
# The guard and errors='ignore' make the cell safe to re-run: the original
# in-place drop raised KeyError once 'Id' was already gone.
if 'Id' in test.columns:
    ids=test['Id']
train=train.drop(columns='Id',errors='ignore')
test=test.drop(columns='Id',errors='ignore')

# 3.Exploring Important Features

The target variable is right-skewed. Regression models work better with normally distributed data, so we'll deal with this before modelling.

In [None]:
from scipy.stats import norm
plt.figure(figsize=(12,6))
# Fit a normal distribution to SalePrice so its mean and std can be shown in the legend.
mu, sigma = norm.fit(train['SalePrice'])
sns.distplot(train['SalePrice'],fit=norm,color='b',rug=True,kde_kws={'shade':True,'color':'b','alpha':.2})
# Raw string: '\m' and '\s' are not valid escape sequences in a normal string literal.
plt.legend([r'$\mu=$ {:.3f} and $\sigma=$ {:.3f}'.format(mu, sigma)],fontsize=14)
plt.title('Distribution of Sale Price')
plt.tight_layout();

In [None]:
train['SalePrice'].describe()

Exploring the features that are highly correlated with the target value is important, because they affect model performance more than the other features.

In [None]:
# Correlation of every numeric predictor with SalePrice, strongest positive first.
# Non-numeric columns are filtered out explicitly: newer pandas raises on them
# instead of silently skipping them.
corrs=(train.drop('SalePrice',axis=1)
            .select_dtypes(include='number')
            .corrwith(train['SalePrice'])
            .sort_values(ascending=False))

In [None]:
fig,axes=plt.subplots(1,1,figsize=(12,9))
# Dotted reference lines at the mean of the positive and of the negative correlations.
pos_mean=corrs[corrs>0].mean()
neg_mean=corrs[corrs<0].mean()
axes.axhline(pos_mean, ls=':',color='black',linewidth=2)
axes.text(25.5,pos_mean+.015, "Average = {:.3f}".format(pos_mean),color='black',size=14)
axes.axhline(neg_mean, ls=':',color='black',linewidth=2)
axes.text(25.5,neg_mean-.035, "Average = {:.3f}".format(neg_mean),color='black',size=14)
sns.barplot(y=corrs,x=corrs.index,palette='Spectral')
plt.title('Correlation of Sale Price to other Features',size=20,color='black',y=1.03)
plt.xticks(rotation=90)
# Print the value above every bar whose correlation exceeds 0.5.
for bar in axes.patches:
    height = bar.get_height()
    if height <=.5:
        continue
    label_x = bar.get_x() + bar.get_width()-.9
    label_y = bar.get_y() + height+(.02*height)
    axes.text(label_x, label_y, str(height)[1:5], ha="left",fontsize=12,color='#000000')
plt.tight_layout();

In [None]:
corrs.head()

In [None]:
# Box plot of SalePrice for each OverallQual level.
plt.figure(figsize=(11,6))
sns.boxplot(data=train, y='SalePrice',x='OverallQual')
# Note: tight_layout() runs before the title is set.
plt.tight_layout()
plt.title('Overall Quality');

There are 2 outliers with a huge GrLivArea and an extremely low SalePrice compared to the others: index 523 and 1298.

In [None]:
plt.figure(figsize=(12,6))
p1=sns.regplot(x='GrLivArea',y='SalePrice',data=train,line_kws={'color':'#252525','linewidth':2},ci=0,marker='o')
# Label the two outlier rows with their GrLivArea value, just right of the point.
for index in [523,1298]:
    area=train.loc[index,'GrLivArea']
    price=train.loc[index,'SalePrice']
    p1.text(area+50, price,s=area ,size=14, color='red')
plt.title('GrLivArea',size=20)
plt.tight_layout();

In [None]:
train[train['GrLivArea']>4675][['OverallQual','GarageCars','GarageArea','SalePrice']]

In [None]:
# Box plot of SalePrice for each GarageCars value.
plt.figure(figsize=(12,6))
sns.boxplot(data=train, y='SalePrice',x='GarageCars')
# Note: tight_layout() runs before the title is set.
plt.tight_layout()
plt.title('GarageCars');

In [None]:
plt.figure(figsize=(12,6))
p1=sns.regplot(x='GarageArea',y='SalePrice',data=train,line_kws={'color':'#252525','linewidth':2},ci=0,marker='o')

# Label three rows with their GarageArea; each entry is (row label, x offset, y offset) for the text.
for idx,dx,dy in ((691,15,0),(1182,-65,-35000),(1061,-65,-35000)):
    garage_area=train.loc[idx,'GarageArea']
    p1.text(garage_area+dx, train.loc[idx,'SalePrice']+dy,s=garage_area ,size=14, color='red')
plt.title('GarageArea',size=20)
plt.tight_layout();

In [None]:
train[((train['SalePrice']>700000) & (train['GarageArea']>800)) | ((train['SalePrice']<100000) & (train['GarageArea']>1200)) ][['OverallQual','GarageCars','GarageArea','SalePrice']]

In [None]:
# Scatter of TotalBsmtSF against SalePrice with a fitted regression line.
plt.figure(figsize=(12,6))
p1=sns.regplot(x='TotalBsmtSF',y='SalePrice',data=train,line_kws={'color':'#252525','linewidth':2},ci=0,marker='o')
# Label row 1298 with its TotalBsmtSF value, offset below-left of the point.
p1.text(train['TotalBsmtSF'][1298]-65, train['SalePrice'][1298]-35000,s=train['TotalBsmtSF'][1298] ,size=14, color='red')
plt.tight_layout()
plt.title('TotalBsmtSF',size=20);

In [None]:
train[((train['SalePrice']<200000) & (train['TotalBsmtSF']>6000))][['OverallQual','GarageCars','GarageArea','SalePrice']]

In [None]:
plt.figure(figsize=(12,9))
# Correlate numeric columns only: newer pandas raises on string columns
# instead of silently dropping them.
corrmat = train.select_dtypes(include='number').corr()
# Keep the features whose absolute correlation with SalePrice exceeds 0.5 (SalePrice itself included).
top_corr_features = corrmat.index[abs(corrmat["SalePrice"])>0.5]
sns.heatmap(train[top_corr_features].corr(),annot=True,cmap='inferno',square=True,linewidths=1)
plt.title('Correlations',size=25,y=1.03)
plt.tight_layout();


## 3.1 Dropping Outliers

The test data may contain outliers like the ones we found in the train data, so deleting these outliers could hurt the model.<br>
However, these outliers show up in more than one feature (GrLivArea, TotalBsmtSF and the features we create later).<br>
I believe deleting them improves the model.


In [None]:
# Remove the two GrLivArea outliers identified above.
# errors='ignore' keeps the cell re-runnable: the original in-place drop raised
# KeyError on a second run because the labels were already gone.
train=train.drop(index=[523,1298],errors='ignore')

In [None]:
# Row counts of each frame, recorded before they are concatenated.
ntrain = train.shape[0] # used later to split the combined frame back into train and test rows
ntest = test.shape[0] # number of test rows

In [None]:
# Stack train on top of test and renumber the rows 0..n-1.
all_df = pd.concat([train, test], ignore_index=True)

# 4.Missing data

In [None]:
# Heatmap of the null mask: one column per feature, one row per observation.
plt.figure(figsize=(12,6))
sns.heatmap(all_df.isnull(),yticklabels=False,cbar=False,cmap='cividis')
plt.xticks(ha='center')
plt.tight_layout();

In [None]:
# Number of missing values for each column that has any, largest first.
null_counts=all_df.isnull().sum()
na_sum=null_counts[null_counts>0].sort_values(ascending=False)
# The same counts as a percentage of all rows.
na_ratio=na_sum/len(all_df)*100

In [None]:
pd.DataFrame(na_ratio,index=na_ratio.index,columns=['Missing Value Ratio']).head(5)

In [None]:
# Bar chart of the missing-value percentage per column.
fig,axes=plt.subplots(1,1,figsize=(12,6))
# axes.grid(color='#909090',linestyle=':',linewidth=2)
plt.xticks(rotation=90)
sns.barplot(x=na_ratio.index,y=na_ratio,palette='coolwarm_r')
plt.title('Missing Value Ratio',color=('#000000'),y=1.03)
plt.tight_layout();

In [None]:
fig,axes=plt.subplots(1,1,figsize=(12,12))
sns.barplot(x=na_sum,y=na_sum.index,palette='brg')
# Write the missing-value count in a small rounded box to the right of each horizontal bar.
for bar in axes.patches:
    count = bar.get_width()
    label_x = bar.get_x() + count+30
    label_y = bar.get_y() + bar.get_height()-.2
    axes.text(label_x, label_y, int(count), ha="left",fontsize=11,color='#000000',bbox=dict(facecolor='#dddddd', edgecolor='black',boxstyle='round', linewidth=.5))
plt.title('Total Missing Values',color=('#000000'),y=1.03)
plt.tight_layout();

## 4.1 Imputing missing values

I tried to impute missing values according to the <b><a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=data_description.txt">description</a></b> as best I could.

### Alley,PoolQC,Fence,MiscFeature,FireplaceQu

Data description says:<br>
-Alley : NA means "no alley access"<br>
-PoolQC : NA means "no Pool".<br>
-Fence : NA means "no fence"<br>
-MiscFeature : NA means "no misc feature"<br>
-FireplaceQu : NA means "no fireplace"<br>
So we can fill all NA values of these features with 'None'.

In [None]:
# For these features NA means "feature absent", so fill it with the explicit category 'None'.
# fillna replaces the per-element lambda that compared str(x) with 'nan'.
for i in ['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']:
    all_df[i]=all_df[i].fillna('None')

### LotFrontage

LotFrontage is the linear feet of street connected to the property. Imputing missing values with the mean LotFrontage of the neighborhood seems the best option.

In [None]:
fig,axes=plt.subplots(1,1,figsize=(12,6))

plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel('LotFrontage',size=15)
plt.xlabel('Neighborhood',size=15)
# Select the column before aggregating: averaging every column fails on the
# string columns in newer pandas and does needless work.
mean_lot=all_df.groupby("Neighborhood")['LotFrontage'].mean()
sns.barplot(x=mean_lot.index,y=mean_lot)
# Print the integer mean near the top of each bar.
for p in axes.patches:
            value = p.get_height()
            x = p.get_x() +.12
            y = p.get_y() + p.get_height()-5
            axes.text(x, y, int(value), ha="left",fontsize=15,color='#000000')
plt.xticks(rotation=90)
plt.title('Mean LotFrontage',color='black')
plt.tight_layout();

In [None]:
# Neighborhood -> mean LotFrontage lookup used for imputation.
# The column is selected before .mean() so the string columns are never averaged
# (newer pandas raises on them).
means=dict(all_df.groupby("Neighborhood")['LotFrontage'].mean())

In [None]:
def LotFrontage(row):
    """Return the row's LotFrontage, or its neighborhood mean when it is missing.

    Parameters
    ----------
    row : pandas.Series or sequence
        Two items in order: the neighborhood name, then the LotFrontage value.

    Returns
    -------
    The original LotFrontage when it is not null, otherwise the entry of the
    module-level ``means`` dict for that neighborhood.
    """
    # Integer keys on a label-indexed Series are deprecated positional access,
    # so use .iloc when it is available; plain sequences still use [0]/[1].
    if hasattr(row, 'iloc'):
        neigh, lot = row.iloc[0], row.iloc[1]
    else:
        neigh, lot = row[0], row[1]
    if pd.isnull(lot):
        return means[neigh]
    return lot

In [None]:
# Fill each missing LotFrontage with the mean of its neighborhood.
# transform('mean') broadcasts the group mean back onto every row, which avoids
# a slow row-by-row apply.
neigh_mean=all_df.groupby('Neighborhood')['LotFrontage'].transform('mean')
all_df['LotFrontage']=all_df['LotFrontage'].fillna(neigh_mean)

### Garage

We can safely fill Na values of GarageType,GarageFinish,GarageQual,GarageCond with 'None'

In [None]:
# A missing garage descriptor means there is no garage, so use the category 'None'.
# Assign the result back: an in-place fillna on all_df[i] acts on a temporary
# and does not update all_df under pandas Copy-on-Write.
for i in ('GarageType','GarageFinish','GarageQual','GarageCond'):
    all_df[i]=all_df[i].fillna('None')

If there is no garage there is no GarageYrBlt,GarageArea,GarageCars

In [None]:
# No garage means the year built, area and car capacity are all 0.
# Assign the result back rather than relying on an in-place fillna of a column selection.
for i in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_df[i]=all_df[i].fillna(0)

### Bsmt

We can fill the Na values of Bsmt Features with None and 0 since there is no basement

In [None]:
# Categorical basement descriptors: missing means no basement, so use 'None'.
for i in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_df[i]=all_df[i].fillna('None')
# Numeric basement measurements: missing means no basement, so use 0.
# Both loops assign back instead of using an in-place fillna on a column selection.
for i in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_df[i]=all_df[i].fillna(0)

### Masvnr

MasVnrType: Masonry veneer type.<br>
MasVnrArea: Masonry veneer area in square feet.<br>
We can fill 0 for the area and None for the type.

In [None]:
# Missing masonry veneer: type becomes 'None' and area becomes 0.
# Assign back instead of using an in-place fillna on a column selection.
all_df["MasVnrType"]=all_df["MasVnrType"].fillna("None")
all_df["MasVnrArea"]=all_df["MasVnrArea"].fillna(0)

### MSZoning,Electrical,KitchenQual,Exterior,SaleType

MSZoning: Identifies the general zoning classification of the sale.<br>
Electrical: Electrical system<br>
KitchenQual: Kitchen quality<br>
Exterior1st: Exterior covering on house<br>
Exterior2nd: Exterior covering on house (if more than one material)<br>
SaleType: Type of sale<br>
We can fill Na values of above features with their most repeating value.

In [None]:
# Fill the remaining categorical gaps with each column's most frequent value.
# Assign back instead of using an in-place fillna on a column selection.
for i in ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd','SaleType'):
    all_df[i]=all_df[i].fillna(all_df[i].mode()[0])

In [None]:
all_df.isnull().sum().sort_values(ascending=False)

### Utilities,Functional

Imputing Utilities and Functional with their mode.

In [None]:
# Fill Utilities and Functional with their most frequent value.
# Assign back instead of using an in-place fillna on a column selection.
for i in ('Utilities','Functional'):
    all_df[i]=all_df[i].fillna(all_df[i].mode()[0])

Checks for any missing data.

In [None]:
all_df.isnull().sum().sort_values(ascending=False)

There is no missing value except the SalePrice coming from test data.

# 5.Feature Engineering

## 5.1 Changing some numeric variables

There are 3 variables in the data that seem numeric but should actually be categorical:<br>
MSSubClass,YrSold and MoSold.<br>

According to description:<br>

MSSubClass: Identifies the type of dwelling involved in the sale. Keeping this feature numeric does not improve the model.<br>
So we convert it to a string.

### 5.1.1 MSSubClass

MSSubClass
*         20	1-STORY 1946 & NEWER ALL STYLES
*         30	1-STORY 1945 & OLDER
*         40	1-STORY W/FINISHED ATTIC ALL AGES
*         45	1-1/2 STORY - UNFINISHED ALL AGES
*         50	1-1/2 STORY FINISHED ALL AGES
*         60	2-STORY 1946 & NEWER
*         70	2-STORY 1945 & OLDER
*         75	2-1/2 STORY ALL AGES
*         80	SPLIT OR MULTI-LEVEL
*         85	SPLIT FOYER
*         90	DUPLEX - ALL STYLES AND AGES
*        120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
*        150	1-1/2 STORY PUD - ALL AGES
*        160	2-STORY PUD - 1946 & NEWER
*        180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
*        190	2 FAMILY CONVERSION - ALL STYLES AND AGES
       
MSSubClass: Identifies the type of dwelling involved in the sale.

In [None]:
fig,axes=plt.subplots(1,1,figsize=(12,6))
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
sns.barplot(data=all_df,x='MSSubClass',y='SalePrice',palette='magma',ci=0)
# Mean of SalePrice, annotated and drawn as a dotted red reference line.
avg_price=all_df['SalePrice'].mean()
axes.text(2,avg_price+4500, "Average={}".format(int(avg_price)),color='black',size=13)
axes.axhline(avg_price, ls=':',color='red',linewidth=2.5)
plt.xlabel('MSSubClass',size=15)
plt.ylabel('SalePrice',size=15)
plt.tight_layout();

In [None]:
     # Lookup from numeric MSSubClass code to a short text label for the dwelling type.
     # NOTE(review): this cell starts indented; IPython dedents cell source, but a plain
     # Python script export would reject the leading whitespace.
     x=  { 20:'1-STORY 1946 & NEWER ALL STYLES',
        30:'1-STORY 1945 & OLDER',
        40:'1-STORY W/FINISHED ATTIC ALL AGES',
        45:'1-1/2 STORY - UNFINISHED ALL AGES',
        50:'1-1/2 STORY FINISHED ALL AGES',
        60:'2-STORY 1946 & NEWER',
        70:'2-STORY 1945 & OLDER',
        75:'2-1/2 STORY',
        80:'SPLIT',
        85:'SPLIT FOYER',
        90:'DUPLEX',
       120:'1-STORY PUD',
       150:'1-1/2 STORY PUD',
       160:'2-STORY PUD',
       180:'PUD - MULTILEVEL',
       190:'2 FAMILY CONVERSION'}

In [None]:
# Replace each numeric MSSubClass code with its text label from `x`.
# NOTE(review): Series.map gives NaN for values that are not keys of `x`, so re-running
# this cell on the already-mapped column would blank it out — run once per fresh kernel.
all_df['MSSubClass']=all_df['MSSubClass'].map(x)

### 5.1.2 YrSold-MoSold

Month sold shouldn't be numeric, because January is not "better" than July. We can't say the same about year sold, but we convert it as well.

In [None]:
fig,axes=plt.subplots(2,1,figsize=(12,12))
sns.barplot(data=all_df,x='YrSold',y='SalePrice',ax=axes[0],palette='inferno',ci=0)
sns.barplot(data=all_df,x='MoSold',y='SalePrice',ax=axes[1],palette='plasma_r',ci=0)
# Draw and label the same mean-SalePrice reference line on both panels.
avg_price=all_df['SalePrice'].mean()
for panel in (axes[0],axes[1]):
    panel.axhline(avg_price, ls=':',color='red',linewidth=2.5)
    panel.text(3.2,avg_price+4500, "Average={}".format(int(avg_price)),color='black',size=13)
plt.tight_layout();

In [None]:
# Store the sale month and year as strings so they are treated as categories.
all_df['MoSold'] = all_df['MoSold'].astype(str)
all_df['YrSold'] = all_df['YrSold'].astype(str)

### 5.2 Adding new features

Creating new features that we think will improve our model.

In [None]:
all_df['TotalBathrooms'] = (all_df['FullBath'] + (0.5 * all_df['HalfBath']) + all_df['BsmtFullBath'] + (0.5 * all_df['BsmtHalfBath']))

In [None]:
all_df['TotalHomeQuality'] = all_df['OverallQual'] + all_df['OverallCond']

In [None]:
# Years between the sale and YearRemodAdd; YrSold was converted to str above, hence the int cast.
all_df['HouseAge']=all_df['YrSold'].apply(int)-all_df['YearRemodAdd']

In [None]:
all_df['TotalSF'] = all_df['TotalBsmtSF'] + all_df['GrLivArea']

In [None]:
all_df['TotalPorchSF'] = all_df['OpenPorchSF'] + all_df['EnclosedPorch'] + all_df['3SsnPorch'] + all_df['ScreenPorch']

In [None]:
# Binary flag: 1 when the house has any second-floor area, otherwise 0.
all_df['2ndfloor'] = (all_df['2ndFlrSF'] > 0).astype('int64')

In [None]:
data=['TotalBathrooms','TotalHomeQuality','HouseAge','TotalSF','TotalPorchSF','2ndfloor']
fig,axes=plt.subplots(ncols=2,nrows=3,figsize=(12,12))
# One regression panel per engineered feature, filling the 3x2 grid row by row.
# axes.flat replaces the manual int(i/2), i%2 index arithmetic.
for feature, ax in zip(data, axes.flat):
    sns.regplot(x=all_df[feature],y=all_df['SalePrice'], ax=ax,ci=0,line_kws={'color':'#000000','linewidth':2},marker='o')
plt.suptitle('New Features',y=1,size=20)
# The stray no-op expression `all_df[data].iloc[:, i]` that used to sit here was removed.
plt.tight_layout()

### 5.3 Skewed data

In skewed data, the tail region may act as an outlier for the statistical model, and <br>outliers adversely affect the model’s performance, especially for regression-based models.<br>
We transform the skewed data with Box-Cox to improve the model.<br>
It can be done with a log1p transform too. I prefer Box-Cox for this notebook.<br>
* <b><a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html">boxcox</a></b>
* <b><a href="https://numpy.org/doc/stable/reference/generated/numpy.log1p.html">log1p</a></b>

In [None]:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

In [None]:
# Remove the target from the feature frame before transforming the features.
# errors='ignore' keeps the cell re-runnable: the original in-place drop raised
# KeyError once the column was already gone.
all_df=all_df.drop(columns='SalePrice',errors='ignore')

In [None]:
# Skewness of every numeric feature, most positively skewed first.
# numeric_only=True is required because all_df still holds string columns,
# which newer pandas refuses to aggregate.
skewed=dict(all_df.skew(numeric_only=True).sort_values(ascending=False))

In [None]:
pd.DataFrame(data=skewed.values(),index=skewed.keys(),columns=['Skew Values']).head(10)

In [None]:
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Names of all columns whose dtype is one of the numeric dtypes listed above.
numeric = [col for col in all_df.columns if all_df[col].dtype in numeric_dtypes]

In [None]:
# Apply a Box-Cox (1+x) transform to every feature except 'MSSubClass' whose skew is
# not below 0.8; the lambda is estimated per feature from the data shifted by 1.
for col, skew_value in skewed.items():
    if col != 'MSSubClass' and not skew_value < .8:
        all_df[col] = boxcox1p(all_df[col], boxcox_normmax(all_df[col] + 1))

In [None]:
# Recompute skewness after the transform and show the ten most skewed features.
# numeric_only=True is required because all_df still holds string columns.
skewed=dict(all_df.skew(numeric_only=True).sort_values(ascending=False))
pd.DataFrame(data=skewed.values(),index=skewed.keys(),columns=['Skew Values']).head(10)

### 5.4 Transforming target value

Regression models work better with normally distributed data. We transform our target value with the log(1+x) transform.<br>
* <b><a href="https://numpy.org/doc/stable/reference/generated/numpy.log1p.html">log1p</a></b>

In [None]:
from scipy import stats

In [None]:
train['SalePrice'].skew()

In [None]:
# Probability plot of the target, followed by its distribution with a fitted normal curve.
plt.figure(figsize=(12,6))
stats.probplot(train['SalePrice'],plot=plt);
plt.figure(figsize=(12,6))
mu, sigma = norm.fit(train['SalePrice'])
sns.distplot(train['SalePrice'],fit=norm,color='b',rug=True,kde_kws={'shade':True,'color':'b','alpha':.2})
# Raw string: '\m' and '\s' are not valid escape sequences in a normal string literal.
plt.legend([r'$\mu=$ {:.3f} and $\sigma=$ {:.3f}'.format(mu, sigma)],fontsize=14)
plt.title('Sale Price',size=20)
plt.tight_layout();

In [None]:
# Log-transform the target in place with log(1+x).
# NOTE(review): this overwrites train['SalePrice'], so running the cell twice applies the log twice.
train['SalePrice']=np.log1p(train['SalePrice'])

In [None]:
y_train=train['SalePrice'].reset_index(drop=True)

In [None]:
# The same two diagnostics as before, now on the log-transformed target.
plt.figure(figsize=(12,6))
stats.probplot(train['SalePrice'],plot=plt);
plt.figure(figsize=(12,6))
mu, sigma = norm.fit(train['SalePrice'])
sns.distplot(train['SalePrice'],fit=norm,color='b',rug=True,kde_kws={'shade':True,'color':'b','alpha':.2})
# Raw string: '\m' and '\s' are not valid escape sequences in a normal string literal.
plt.legend([r'$\mu=$ {:.3f} and $\sigma=$ {:.3f}'.format(mu, sigma)],fontsize=14)
plt.title('Sale Price',size=20)
plt.tight_layout();

In [None]:
train['SalePrice'].skew()

# 6.Modelling

## 6.1 Preparing data

Getting dummies

In [None]:
all_df=pd.get_dummies(all_df,drop_first=True)

In [None]:
all_df.isnull().sum()

Recreating data

In [None]:
# The first ntrain rows of the combined frame came from train; the rest came from test.
X_train = all_df.iloc[:ntrain]
X_test = all_df.iloc[ntrain:]

In [None]:
y_train.isnull().sum()

In [None]:
y_train.shape,X_train.shape,X_test.shape

In [None]:
X_train

## 6.2 Predictions

Imports

In [None]:
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,StackingRegressor,RandomForestRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor 
from sklearn.model_selection import cross_val_score,train_test_split,KFold,GridSearchCV
from sklearn.pipeline import make_pipeline ,Pipeline
from sklearn.preprocessing import RobustScaler,StandardScaler

Cross validation function

In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) used for every cross-validation run.
kf = KFold(5, shuffle=True, random_state=42)
def rmsecv(model):
    """Cross-validate ``model`` on X_train/y_train and record its RMSE.

    Computes the per-fold RMSE over ``kf``, formats the mean and the standard
    deviation as strings, appends the pair to the module-level ``scores`` list
    (which must exist before the call), and also returns it so the function can
    be used without reading that global.

    Parameters
    ----------
    model : estimator accepted by ``cross_val_score``.

    Returns
    -------
    list of str
        ``[mean_rmse, std_rmse]``.
    """
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, cv = kf,scoring='neg_mean_squared_error',error_score='raise'))
    # '{:3f}' means minimum width 3 with the default six decimals (it is not '.3f').
    result=['{:3f}'.format(rmse.mean()),'{:3f}'.format(rmse.std())]
    scores.append(result)
    return result

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

### 6.2.1 Hyperparameter Tuning

I used hyperparameter tuning to find better parameters for some models.<br>
* <b><a href="https://www.kaggle.com/pavansanagapati/automated-hyperparameter-tuning/notebook#notebook-container">Automated Hyperparameter Tuning</a></b>

In [None]:

# space = {
# 'n_estimators': hp.randint('n_estimators',4000,5000),
# 'random_state':hp.choice('random_state',[42]),
#     'bagging_fraction':hp.uniform('bagging_fraction',.7,1),
#     'learning_rate':hp.uniform('learning_rate',0.002,.006),
#     'bagging_freq':hp.randint('bagging_freq',1,30),
#     'bagging_seed':hp.randint('bagging_seed ',1,30),

#     'boosting_type':hp.choice('boosting_type',['gbdt']),
#     'feature_fraction':hp.uniform('feature_fraction',.01,.5),
#     'feature_fraction_seed':hp.randint('feature_fraction_seed',1,50),
#         'min_sum_hessian_in_leaf':hp.randint('min_sum_hessian_in_leaf',1,50),
# 'num_leaves':hp.randint('num_leaves',1,50),
# 'objective':hp.choice('objective',['regression'])}

# def objective(space):
#     model = LGBMRegressor(
#         n_estimators = space['n_estimators'],
#         random_state=space['random_state'],
#         bagging_fraction=space['bagging_fraction'], 
#         learning_rate=space['learning_rate'],
#         boosting_type=space['boosting_type'],
#         bagging_seed=space['bagging_seed'],
#         feature_fraction=space['feature_fraction'],
#         feature_fraction_seed=space['feature_fraction_seed'],
#         min_sum_hessian_in_leaf =space['min_sum_hessian_in_leaf'],
#         num_leaves=space['num_leaves'],
#     objective=space['objective'])
    
#     rmse = (np.sqrt(-cross_val_score(model, X_train, y_train, cv = kf,scoring='neg_mean_squared_error')).mean())
#     return {'loss': rmse, 'status': STATUS_OK }
    
# trials = Trials()
# best = fmin(fn= objective,
#             space= space,
#             algo= tpe.suggest,
#             max_evals = 100,
#             trials= trials)
# best


### 6.2.2 Making Pipelines

Some of our models, such as Lasso, are sensitive to outliers. We try to mitigate this with Robust Scaler.<br>
It can also be done with Standard Scaler.<br>
Making pipeline for Lasso, ElasticNet and SVR.<br>
* <b><a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html">Robust Scaler</a></b>
* <b><a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html">Standard Scaler</a></b>
* <b><a href="https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html">Make Pipeline</a></b>

In [None]:
# Each linear/kernel model is wrapped in a pipeline that applies RobustScaler first.
# The `normalize` argument was dropped from Lasso and ElasticNet: it has been removed
# from scikit-learn, and the value passed before (False) matched the old default.
lasso=make_pipeline(RobustScaler(),Lasso(alpha= 0.0005395401757154939,
                                         max_iter= 2713,
                                         positive= False,
                                         random_state= 42,
                                         selection= 'random',
                                         tol= 0.006902489174276111,
                                         warm_start= False,copy_X=True,fit_intercept=True,precompute=False))
elasticnet=make_pipeline(RobustScaler(),ElasticNet(alpha= 0.006528654958339889,
                                                   l1_ratio= 0.027196526024128087,
                                                   max_iter= 2286,
                                                   positive=False,
                                                   random_state= 42,
                                                   selection= 'cyclic',
                                                   tol= 0.003954468580556897,
                                                  warm_start= False,copy_X=False,fit_intercept=True,precompute=False))



svr = make_pipeline(RobustScaler(), SVR(C= 26.25, epsilon= 0.0215, gamma=0.000235))


In [None]:
# Gradient boosting: many depth-3 trees, a small learning rate and Huber loss.
gbr=GradientBoostingRegressor(
    loss='huber',
    criterion='friedman_mse',
    n_estimators=6500,
    learning_rate=0.008,
    max_depth=3,
    max_features='sqrt',
    min_samples_split=11,
    min_samples_leaf=15,
    random_state=42,
)

In [None]:
# LightGBM regressor; the long decimal values are kept exactly as given.
lgbm=LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    n_estimators= 4014,
    learning_rate= 0.0040468864436411135,
    num_leaves= 13,
    min_sum_hessian_in_leaf=7,
    bagging_fraction = 0.9396546294370978,
    bagging_freq= 20,
    bagging_seed= 26,
    feature_fraction=0.18000984904260108,
    feature_fraction_seed= 24,
    random_state= 42,
    verbosity=-1,
)

In [None]:
# XGBoost regressor; the long decimal values are kept exactly as given.
xgb=XGBRegressor(
    n_estimators=8025,
    learning_rate=0.005959814917079281,
    max_depth=5,
    min_child_weight=0.7706298621431724,
    gamma=0.0051170320540329977,
    subsample=0.297642172460661,
    colsample_bytree=0.0780460264599038,
    reg_alpha=0.011645470627696723,
    reg_lambda=0.9138427702529622,
    random_state=42,
    verbosity=0,
)

### 6.2.3 Stacked Regressor

Stacked generalization consists of stacking the outputs of the individual estimators and using a regressor to compute the final prediction.<br>
Stacking makes it possible to use the strength of each individual estimator by using their outputs as the input of a final estimator.<br>
* <b><a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html">Stacking Regressor</a></b>

In [None]:
# Base learners for the stack as (name, estimator) pairs; xgb is also the final estimator.
est=list(zip(
    ('lasso','enet','svr','gbr','lgbm','xgb'),
    (lasso,elasticnet,svr,gbr,lgbm,xgb),
))
stacked=StackingRegressor(estimators=est,final_estimator=xgb,cv=kf,passthrough=False)

### 6.2.4 Cross Validation

In [None]:
# Models to cross-validate, in the same order as the display names used for the score table.
models=[lasso, elasticnet, svr, gbr, lgbm, xgb]

In [None]:
%%time
# Cross-validate every model; rmsecv appends one [mean, std] pair to `scores` per model.
scores=[]
for i in models:
    try:
        rmsecv(i)
    except Exception as e:
        print(i,'\n',e)
        # Record a placeholder so `scores` keeps one row per model. Without it a
        # failed model shortens the list and the score table built next no longer
        # lines up with the model names.
        scores.append(['nan','nan'])

In [None]:
scores

In [None]:
mods=['Lasso','ElasticNet','SVR','GBR','LightGBM','XGBoost']
# `scores` holds formatted strings; convert both columns to float so the table is
# numeric (previously only RMSE was converted and STD stayed text).
df=pd.DataFrame(scores,index=mods,columns=['RMSE','STD']).astype(float)

In [None]:
fig,axes=plt.subplots(1,1,figsize=(12,6))
sns.barplot(x=df['RMSE'],y=df.index,palette='gnuplot')
# Print each model's RMSE (5 decimals, white text) just inside the right end of its bar.
for bar in axes.patches:
    width = bar.get_width()
    label_x = bar.get_x() + width-.012
    label_y = bar.get_y() + bar.get_height()-.3
    axes.text(label_x, label_y, '{:.5f}'.format(width), ha="left",fontsize=14,color='#ffffff')
plt.title('RMSE');

## 6.3 Averaging Predictions

The target values are log-transformed. We reverse the transform with <b><a href="https://numpy.org/doc/stable/reference/generated/numpy.expm1.html">expm1</a></b>

In [None]:
# Fit Lasso on the full training set, then map its log-scale predictions back with expm1.
log_pred=lasso.fit(X_train,y_train).predict(X_test)
pred_lasso=np.expm1(log_pred)

In [None]:
# Fit ElasticNet on the full training set, then map its log-scale predictions back with expm1.
log_pred=elasticnet.fit(X_train,y_train).predict(X_test)
pred_enet=np.expm1(log_pred)

In [None]:
# Fit SVR on the full training set, then map its log-scale predictions back with expm1.
log_pred=svr.fit(X_train,y_train).predict(X_test)
pred_svr=np.expm1(log_pred)

In [None]:
# Fit gradient boosting on the full training set, then map its log-scale predictions back with expm1.
log_pred=gbr.fit(X_train,y_train).predict(X_test)
pred_gbr=np.expm1(log_pred)

In [None]:
# Fit LightGBM on the full training set, then map its log-scale predictions back with expm1.
log_pred=lgbm.fit(X_train,y_train).predict(X_test)
pred_lgbm=np.expm1(log_pred)

In [None]:
# Fit XGBoost on the full training set, then map its log-scale predictions back with expm1.
log_pred=xgb.fit(X_train,y_train).predict(X_test)
pred_xgb=np.expm1(log_pred)

In [None]:
# Fit the stacked ensemble on the full training set, then map its log-scale predictions back with expm1.
log_pred=stacked.fit(X_train,y_train).predict(X_test)
pred_stacked=np.expm1(log_pred)

In [None]:
plt.figure(figsize=(12,6))
# Compare SVR with XGBoost predictions. The y data is now pred_svr so it matches the
# title and axis label; previously pred_lasso was plotted under the 'SVR' label.
sns.regplot(y=pred_svr,x=pred_xgb,ci=0,line_kws={'color':'black','linewidth':2})
plt.title('SVR-XGB Predictions')
plt.xlabel('XGB')
plt.ylabel('SVR');

Each of the seven predictions gets a weight of 1/7 in the averaged prediction.

In [None]:
avg_pred=(pred_enet+pred_lasso+pred_svr+pred_gbr+pred_lgbm+pred_xgb+pred_stacked)/7

In [None]:
# Scatter of the averaged predicted price against the test Id.
plt.figure(figsize=(12,6))
plt.scatter(y=avg_pred,x=ids)
plt.title('Average Predictions');

In [None]:
submissions=pd.DataFrame({'Id': ids, 'SalePrice': avg_pred})

In [None]:
submissions.reset_index()

In [None]:
submissions.to_csv('submission.csv',index=False)