# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from matplotlib import pyplot as plt
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Turn off pandas' chained-assignment warning for the whole script (the option defaults to 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

def Get_StrColumn_List(X):
    """Return the names of the columns of ``X`` that hold string values.

    A column is classified by its first non-missing value. Columns that
    contain only missing values have nothing to inspect and are skipped.

    Args:
        X: pandas DataFrame to scan.

    Returns:
        List of column labels, in the order they appear in ``X``.
    """
    str_col_list = []
    for col in X.columns:
        # The ``None`` default keeps an all-missing column from raising StopIteration.
        col_value = next((i for i in X[col] if not pd.isna(i)), None)
        if isinstance(col_value, str):
            str_col_list.append(col)
    return str_col_list

def Get_NumberColumn_List(X):
    """Return the names of the columns of ``X`` that hold int or float values.

    A column is classified by its first non-missing value. Columns that
    contain only missing values have nothing to inspect and are skipped.

    Args:
        X: pandas DataFrame to scan.

    Returns:
        List of column labels, in the order they appear in ``X``.
    """
    number_col_list = []
    for col in X.columns:
        # The ``None`` default keeps an all-missing column from raising StopIteration.
        col_value = next((i for i in X[col] if not pd.isna(i)), None)
        if isinstance(col_value, (int, float)):
            number_col_list.append(col)
    return number_col_list

def encode(frame, feature):
    """Add an ordinal column that ranks the categories of ``feature`` by mean sale price.

    The category with the lowest mean ``SalePrice`` gets rank 1, the next one
    rank 2, and so on; categories without a mean sort last. The ranks are
    written in place into a new column named ``feature + '_E'``.

    Args:
        frame: DataFrame containing ``feature`` and ``SalePrice``; modified in place.
        feature: Name of the categorical column to encode.
    """
    categories = frame[feature].unique()
    # Mean sale price per category, laid out in the order the categories were observed.
    mean_price = frame[[feature, 'SalePrice']].groupby(feature)['SalePrice'].mean()
    ranked = mean_price.reindex(categories).sort_values()
    rank_by_category = dict(zip(ranked.index, range(1, len(ranked) + 1)))

    for category, rank in rank_by_category.items():
        frame.loc[frame[feature] == category, feature + '_E'] = rank

def FeatureCreation_X(X):
    """Engineer the derived features on ``X`` and return it.

    ``X`` is modified in place (columns are added and dropped) and the same
    object is returned. It must contain ``SalePrice`` (used by ``encode``)
    together with the year, area and bathroom columns referenced below.

    Steps:
        1. Add a ``<col>_E`` rank column for every string column.
        2. Turn the four year columns into ages relative to the latest year.
        3. Collapse the area columns into ``BsmtFinSF``, ``BadLivAreaPct``,
           ``GoodLivAreaPct`` and ``TotalBuildingArea``.
        4. Collapse the bathroom counts into ``Bath``.
        5. Add the ``Is_Remodelled`` and ``Is_ReMdlB4Sale`` flags.

    Args:
        X: pandas DataFrame holding the raw columns.

    Returns:
        The same DataFrame with the engineered columns.
    """
    # Create new columns based on the mean of sales price for each string category
    for col in Get_StrColumn_List(X):
        encode(X, col)

    # Convert yr to age
    yr_col = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
    # Garage years later than 2010 are treated as typos (the data contains 2207) and set to 2007
    X.loc[X['GarageYrBlt'] > 2010, 'GarageYrBlt'] = 2007
    latestyr = X[yr_col].max().max()
    # A missing year is filled with the latest year, so it ends up as age 0
    X[yr_col] = X[yr_col].fillna(latestyr)
    X[yr_col] = latestyr - X[yr_col]

    # A single missing component would turn the whole derived total into NaN,
    # so count missing components as 0 before adding them up.
    # GrLivArea is left out on purpose: it is used as a divisor below.
    component_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF',
                      'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                      'ScreenPorch', 'PoolArea', 'MasVnrArea', 'GarageArea',
                      'BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath']
    X[component_cols] = X[component_cols].fillna(0)

    # Make new feature columns on area
    # GrLivArea = 1stFlrSF + 2ndFlrSF + LowQualFinSF
    # TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF
    X['BsmtFinSF'] = X['BsmtFinSF1'] + X['BsmtFinSF2']
    X.drop(['BsmtFinSF1', 'BsmtFinSF2'], axis=1, inplace=True)

    # Share of low-quality / unfinished area relative to the above-ground living area
    X['BadLivAreaPct'] = (X['LowQualFinSF'] + X['BsmtUnfSF']) / X['GrLivArea']
    X['GoodLivAreaPct'] = 1 - X['BadLivAreaPct']
    X.drop(['1stFlrSF', '2ndFlrSF', 'LowQualFinSF'], axis=1, inplace=True)

    X['OutSideArea'] = X['WoodDeckSF'] + X['OpenPorchSF'] + X['EnclosedPorch'] + \
                       X['3SsnPorch'] + X['ScreenPorch'] + X['PoolArea'] + X['MasVnrArea']
    X.drop(['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
            '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MasVnrArea'], axis=1, inplace=True)

    X['TotalBuildingArea'] = X['GrLivArea'] + X['GarageArea'] + X['OutSideArea']
    X.drop(['GarageArea', 'OutSideArea'], axis=1, inplace=True)

    # Creating the total number of bath (half baths count as 0.5)
    X['Bath'] = X['BsmtFullBath'] + X['FullBath'] + (X['BsmtHalfBath'] + X['HalfBath']) * 0.5
    X.drop(['BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath'], axis=1, inplace=True)

    # Creating flag for remodelling (the year columns hold ages at this point)
    X['Is_Remodelled'] = np.where(X['YearRemodAdd'] != X['YearBuilt'], 1, 0)

    # Creating flag for remodelling just before sale:-
    X['Is_ReMdlB4Sale'] = np.where(abs(X['YrSold'] - X['YearRemodAdd']) <= 1, 1, 0)

    return X

# Import the data
test = pd.read_csv('test.csv', index_col=0)
train = pd.read_csv('train.csv', index_col=0)
submission = pd.read_csv('sample_submission.csv', index_col=0)

# Obtaining the y and X for the data
y = train['SalePrice']
# Stack train on top of test so both go through the same preprocessing;
# the first n_train rows of the combined frame are the training rows.
X = pd.concat([train, test], ignore_index=True)
n_train = len(train)
# Remove columns with more than 70% nulls
col_list_null = X.isna().sum() / len(X)
X = X.loc[:, col_list_null < 0.7]
# Box-Cox transform the numerical columns whose skewness exceeds 0.5
number_col_list = Get_NumberColumn_List(X)
skew_features = X[number_col_list].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
for i in skew_index:
    X[i] = boxcox1p(X[i], boxcox_normmax(X[i] + 1))
# Create the engineered feature columns
X = FeatureCreation_X(X)
# Fill na with the mode for str columns and 0 for number columns.
# Assign the filled column back: an in-place fillna on X[col] acts on a
# temporary object and does not update X when pandas Copy-on-Write is active.
for col in Get_StrColumn_List(X):
    X[col] = X[col].fillna(X[col].mode()[0])
X.fillna(0, inplace=True)
# One-hot encode the string columns and log-transform the target
X_mod = pd.get_dummies(X).reset_index(drop=True)
y_mod = np.log1p(y)
# Split by position rather than by a SalePrice sentinel, so a training row
# can never be mistaken for a test row.
is_train_row = X_mod.index < n_train
feature_cols = X_mod.columns != 'SalePrice'
Xtest_mod = X_mod.loc[~is_train_row, feature_cols]
Xtrain_mod = X_mod.loc[is_train_row, feature_cols]
# Feature reduction with recursive feature elimination ============================================================
# X_train, X_test, y_train, y_test = train_test_split(Xtrain_mod, y_mod, test_size=0.2, random_state=0)
# rfe_rf = RFE(estimator=RandomForestRegressor(), n_features_to_select=100, step=10, verbose=1)
# rfe_rf.fit(X_train, y_train)
# rf_mask = rfe_rf.support_
# rfe_gb = RFE(estimator=GradientBoostingRegressor(), n_features_to_select=100, step=10, verbose=1)
# rfe_gb.fit(X_train, y_train)
# gb_mask = rfe_gb.support_
# votes = np.sum([rf_mask, gb_mask], axis=0)
# mask = votes >= 2
# Xtrain_mod_mask = Xtrain_mod.loc[:, mask]
# Xtest_mod_mask = Xtest_mod.loc[:, mask]

# # specify your configurations as a dict for lightgbm ============================================================
# from shaphypetune import BoostSearch
#
# param_grid_1 = {'learning_rate': [0.01],
#                 'n_estimators': [500],
#                 'num_leaves': [10, 20, 30, 40],
#                 'max_depth': [5, 6, 7, 8, 9, 10],
#                 'boosting': ["gbdt", "dart", "goss"]}
#
# model = BoostSearch(LGBMRegressor(), param_grid=param_grid_1)
# model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)
#
# best_param = {'learning_rate': 0.01,
#               'n_estimators': 500,
#               'num_leaves': 20,
#               'max_depth': 10,
#               'boosting': "gbdt"}

# Build, fit and stack the regression models ======================================================================
# NOTE(review): the four names below are not referenced anywhere in the visible code.
mdl_pred_prf = pd.DataFrame(data=None, index=submission.index, columns=None)
RMSE = pd.DataFrame(columns=['RmseTest_STACK', 'RmseTrain_STACK'])
nb_mdl = 10
k = 0

from sklearn.model_selection import KFold, cross_val_score

# Shared 10-fold splitter used by the *CV linear models
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate regularisation strengths for RidgeCV, LassoCV and ElasticNetCV
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Every model is fitted on the training features against the log-transformed target
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
ridge_model_full_data = ridge.fit(Xtrain_mod, y_mod)

lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=100000, alphas=alphas2, random_state=42, cv=kfolds))
lasso_model_full_data = lasso.fit(Xtrain_mod, y_mod)

elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=100000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
elastic_model_full_data = elasticnet.fit(Xtrain_mod, y_mod)

lgb = LGBMRegressor(boosting_type='gbdt', objective='regression', num_leaves=20,
                    learning_rate=0.01, n_estimators=500, max_depth=10, metric='rmse')
lgb_model_full_data = lgb.fit(Xtrain_mod, y_mod)

svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003,))
svr_model_full_data = svr.fit(Xtrain_mod, y_mod)

xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460, max_depth=3, min_child_weight=0,
                       gamma=0, subsample=0.7, colsample_bytree=0.7, nthread=-1,
                       scale_pos_weight=1, seed=27, reg_alpha=0.00006)
xgb_model_full_data = xgboost.fit(Xtrain_mod, y_mod)

# subsample=0.5 draws a random half of the rows per tree; seed it so repeated
# runs give the same model, like the other seeded estimators above.
gbr = GradientBoostingRegressor(learning_rate=0.01, max_depth=6, n_estimators=500, subsample=0.5,
                                random_state=42)
gbr_model_full_data = gbr.fit(Xtrain_mod, y_mod)

# Base learners of the stacking ensemble
level0 = [
    ('ridge', ridge),
    ('lasso', lasso),
    ('elasticnet', elasticnet),
    ('lgb', lgb),
    ('svr', svr),
    ('xgb', xgboost),
]
# define meta learner model
level1 = gbr
# define the stacking ensemble
stack_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5, n_jobs=-1)
# fit the model on all available data
stack_model.fit(Xtrain_mod, y_mod)

def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions for ``X``.

    The weights add up to 1: elastic net 0.1, lasso 0.05, ridge 0.1, SVR 0.1,
    gradient boosting 0.1, XGBoost 0.15, LightGBM 0.1 and the stacked model 0.3.
    The stacked model receives ``X`` converted to a NumPy array.
    """
    blended = 0.1 * elastic_model_full_data.predict(X)
    blended = blended + 0.05 * lasso_model_full_data.predict(X)
    blended = blended + 0.1 * ridge_model_full_data.predict(X)
    blended = blended + 0.1 * svr_model_full_data.predict(X)
    blended = blended + 0.1 * gbr_model_full_data.predict(X)
    blended = blended + 0.15 * xgb_model_full_data.predict(X)
    blended = blended + 0.1 * lgb_model_full_data.predict(X)
    blended = blended + 0.3 * stack_model.predict(np.array(X))
    return blended

def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied here; the result is a log-scale error only when
    the caller passes log-transformed values.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=None):
    """Return the per-fold cross-validated RMSE of ``model`` on the log-scale target.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix whose rows line up with ``y_mod``. When omitted, the
            training features ``Xtrain_mod`` are used; the name is looked up at
            call time rather than frozen when the function is defined.

    Returns:
        Array with one RMSE value per fold of ``kfolds``.
    """
    if X is None:
        X = Xtrain_mod
    # Score against y_mod, the same log1p target the models are fitted on
    neg_mse = cross_val_score(model, X, y_mod, scoring="neg_mean_squared_error", cv=kfolds)
    return np.sqrt(-neg_mse)

# Blend the models' predictions for the test rows, undo the log1p transform of the target and save
blended_log_price = blend_models_predict(Xtest_mod)
submission['SalePrice'] = np.expm1(blended_log_price)
submission.to_csv('sub9.csv')
