In [25]:
import numpy as np 
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt  
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew 
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [26]:
train = pd.read_csv('../HW5/train.csv')
test = pd.read_csv('../HW5/test.csv')

test_ID = test['Id']
train = train.drop("Id", axis=1)
test = test.drop("Id", axis=1) 

In [27]:
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
train["SalePrice"] = np.log1p(train["SalePrice"])
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop("SalePrice", axis=1, inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [28]:
# features where NAN is missing (none)
f_names=['PoolQC','MiscFeature','Alley','Fence',
         'FireplaceQu','GarageType', 'GarageFinish', 
         'GarageQual', 'GarageCond','BsmtQual', 'BsmtCond', 
         'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
         'MasVnrType','MasVnrArea','MSSubClass']

for f in f_names:
    all_data[f].fillna("None", inplace=True) 
    

# numerical features to replace with zero    
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars','BsmtFinSF1', 
            'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 
            'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

# feaures to replace with mode    
mode_col = ['Electrical','KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType','MSZoning']
for col in mode_col:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
all_data = all_data.drop(['Utilities'], axis=1)
all_data["Functional"] = all_data["Functional"].fillna("Typ")

# numerical features better suited for label encoding
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [29]:
from sklearn.preprocessing import LabelEncoder

# features to be label encoded
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

In [30]:
# Adding total sqfootage feature 
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
#create total bath
total_b_train = all_data.BsmtFullBath + (all_data.BsmtHalfBath*0.5) + all_data.FullBath + (all_data.HalfBath*0.5)
all_data['TotalBath'] = total_b_train

In [31]:
numerical = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numerical].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
    
all_data = pd.get_dummies(all_data)
train = all_data[:ntrain]
test = all_data[ntrain:]

In [32]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Columns: 663 entries, 1stFlrSF to SaleType_WD
dtypes: float64(59), uint8(604)
memory usage: 1.5 MB


In [33]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
XGBoost = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
LightgBoost = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

from mlxtend.regressor import StackingCVRegressor
stack_gen = StackingCVRegressor(regressors=(KRR, lasso, ENet, GBoost,XGBoost,LightgBoost),
                                meta_regressor=XGBoost,
                                use_features_in_secondary=True)


In [34]:
stack_gen_t = stack_gen.fit(train.values,y_train)

KeyboardInterrupt: 

In [None]:
predicted_prices = np.expm1(stack_gen_t.predict(test.values))

In [15]:
#save results
output = pd.DataFrame({'Id': test_ID, 'SalePrice': predicted_prices})
output.to_csv('HW5_stack_lgb_xgb.csv', index=False)