# Importing Data and Packages

In [None]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import scipy
import xgboost
import seaborn as sns
from scipy import stats
from scipy.stats import skew, zscore, norm
from scipy.special import boxcox1p
from sklearn.linear_model import Ridge, ElasticNet, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,RobustScaler
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so consider re-enabling warnings when upgrading dependencies.
warnings.filterwarnings("ignore")
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the training and test CSV files from the working directory, using the
# "Id" column of each file as the row index.
train_d, test_d = 'train.csv', 'test.csv'
train_data, test_data = (
    pd.read_csv(csv_path, index_col="Id") for csv_path in (train_d, test_d)
)

# Data Exploration

In [None]:
# Summary statistics of the training set, shown as the cell output.
train_data.describe()

In [None]:
# Pairwise correlations between the numeric columns of the training set.
# The frame is restricted to numeric dtypes first because DataFrame.corr()
# raises on string columns in pandas >= 2.0 (older versions silently dropped
# them); selecting the columns explicitly behaves the same on every version.
corr = train_data.select_dtypes(include=np.number).corr()
# Rank every numeric column by its correlation with the target.
corr.SalePrice.sort_values(ascending = False)

In [None]:
# Boolean mask that hides the upper triangle (diagonal included) so every
# pair of columns appears only once in the heatmap.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(12,10))
# Draw on the axes created above; vmax=0.4 caps the colour scale at 0.4.
ax = sns.heatmap(corr, mask=mask, vmax=0.4, square=True, fmt=".1f", annot=True, ax=ax)
ax

In [None]:
def plot_(X, y, axes, data_style="b."):
    """Plot a degree-2 least-squares fit of ``y`` on ``X`` with pyplot.

    A quadratic polynomial is fitted to the points ``(X, y)`` and its value at
    every entry of ``X`` is drawn with ``plt.plot``. The raw ``y`` values are
    used only for the fit; they are not drawn.

    Parameters
    ----------
    X, y : one-dimensional numeric sequences of equal length.
    axes : accepted so existing calls keep working, but not used by the body.
    data_style : matplotlib format string used for the fitted values.
    """
    quadratic = np.poly1d(np.polyfit(X, y, deg=2))
    plt.plot(X, quadratic(X), data_style)

In [None]:
# Mean sale price for each overall-quality rating and each garage car capacity.
# The string alias "mean" replaces np.mean: recent pandas releases warn when a
# NumPy reduction callable is passed as aggfunc and recommend the alias.
OverallQual_pivot = train_data.pivot_table(index='OverallQual', values='SalePrice', aggfunc='mean')
GarageCars_pivot = train_data.pivot_table(index='GarageCars', values='SalePrice', aggfunc='mean')

In [None]:
# Bar charts of the mean sale price per category. Each entry pairs a pivot
# table with the x-axis label that describes its index (the GarageCars chart
# was previously mislabelled "Overall Quality").
for pivot, x_label in [(OverallQual_pivot, 'Overall Quality'),
                       (GarageCars_pivot, 'Garage Car Capacity')]:
    pivot.plot(kind='bar', color='blue')
    plt.xlabel(x_label)
    plt.ylabel('Mean Sale Price')
    plt.xticks(rotation=0)
    plt.show()

# Quadratic trend of the sale price against each continuous size feature.
for feature, x_label in [('GrLivArea', 'Above Ground Living Area'),
                         ('GarageArea', 'Total Garage Area'),
                         ('TotalBsmtSF', 'Total Basement Squarefootage')]:
    plot_(train_data[feature], train_data.SalePrice, axes = [0, 500000, 0, 4000], data_style="b.")
    plt.xlabel(x_label)
    plt.ylabel('Mean Sale Price')
    plt.show()

# Missing Values

In [None]:
# Stack the training features (target column removed) on top of the test
# features so that missing values are handled for both sets at once.
train_features = train_data.drop(columns="SalePrice")
all_data = pd.concat([train_features, test_data])

In [None]:
# Number of missing values per column, largest first; show the top 22 rows.
na_counts = all_data.isna().sum()
na_df = pd.DataFrame(na_counts.sort_values(ascending=False))
na_df.head(22)

From the data description text file that was shared, we learn what a missing value means in each of the following columns:

PoolQC: No pool

MiscFeature: No miscellaneous feature

Alley: No alley access

Fence: No Fence

FireplaceQu: No Fireplace

LotFrontage: No info on missing values. Description given: Linear feet of street connected to property

GarageYrBlt, GarageCond, GarageType, GarageFinish, GarageQual: No Garage

BsmtFinType2, BsmtExposure: No Basement

BsmtQual, BsmtCond, BsmtFinType1: No basement

MasVnrArea: No info on missing values. Description given: Masonry veneer area in square feet

MasVnrType: No info on missing values. Description given: Masonry veneer type.

Electrical: No info on missing values. Description given: Electrical system

In [None]:
# In the columns listed below a missing value is replaced with the explicit
# category "No Feature".
no_feat_rows = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
]

# A per-column fill mapping touches only the listed columns.
all_data = all_data.fillna({column: 'No Feature' for column in no_feat_rows})

In [None]:
# For the following columns missing values are replaced with the most frequent
# value (mode) among houses of the same Neighborhood:
# "MasVnrType", "MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "SaleType", "Electrical",
# "KitchenQual", "Functional"

mode_feat_rows = ["MasVnrType", "MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "SaleType", "Electrical",
                  "KitchenQual", "Functional"]

# Neighbourhood-level fill. A group with no observed value has an empty mode,
# so it is returned unchanged instead of raising on the missing first element.
all_data[mode_feat_rows] = all_data.groupby("Neighborhood")[mode_feat_rows].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if x.notna().any() else x
)

# Fallback for anything still missing: the mode of the whole column.
all_data[mode_feat_rows] = all_data[mode_feat_rows].fillna(all_data[mode_feat_rows].mode().iloc[0])

In [None]:
# For the following numeric columns missing values are replaced with 0:
# 'GarageYrBlt', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'GarageCars',
# 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1'

zero_feat_rows = ['GarageYrBlt', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'GarageCars',
                 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1' ]

all_data[zero_feat_rows] = all_data[zero_feat_rows].fillna(0)

# BsmtFinType2 is a categorical column (it is label-encoded further down), so
# its missing values get the same "No Feature" category used for the other
# categorical columns instead of a numeric 0 mixed into a column of strings.
all_data['BsmtFinType2'] = all_data['BsmtFinType2'].fillna('No Feature')

In [None]:
# For the following columns missing values are replaced with the median of the
# houses in the same Neighborhood:
# 'GarageArea', 'LotFrontage'

med_feat_rows = ['GarageArea', 'LotFrontage']

all_data[med_feat_rows] = all_data.groupby("Neighborhood")[med_feat_rows].transform(lambda x: x.fillna(x.median()))

# Fallback: a neighbourhood with no observed value has a NaN median, which
# would leave its rows unfilled; use the median of the whole column for those.
all_data[med_feat_rows] = all_data[med_feat_rows].fillna(all_data[med_feat_rows].median())

In [None]:
# Store these two code columns as strings; both are label-encoded as
# categories further down.
for code_col in ('MSSubClass', 'OverallCond'):
    all_data[code_col] = all_data[code_col].astype(str)

In [None]:
# TotalSF = basement area + first-floor area + second-floor area.
all_data['TotalSF'] = (
    all_data['TotalBsmtSF']
    .add(all_data['1stFlrSF'])
    .add(all_data['2ndFlrSF'])
)

In [None]:
# Replace each sale year with the number of months between January 2006 and
# January of that year (2006 -> 0, 2007 -> 12, ..., 2010 -> 48).
year_dict = {year: 12 * (year - 2006) for year in range(2006, 2011)}
all_data['YrSold'] = all_data['YrSold'].replace(year_dict)
all_data.YrSold.value_counts()

In [None]:
# Month of sale plus the YrSold value of the same row.
all_data['NewMonth'] = all_data['MoSold'].add(all_data['YrSold'])

In [None]:
# The two source columns are no longer needed once NewMonth exists.
all_data = all_data.drop(columns=['MoSold', 'YrSold'])

# Encoding Categorical Variables

In [None]:
# Label-encode the categorical columns listed below: a fresh LabelEncoder is
# fitted per column and the column is replaced by the integer codes it returns.

cata_cols = [
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
]
for col in cata_cols:
    lblenc = LabelEncoder()
    # The column is handed to the encoder as a plain Python list.
    column_values = list(all_data[col].values)
    all_data[col] = lblenc.fit_transform(column_values)

# Normally Distributing the Data

In [None]:
# Log-transform the target and compare its distribution with a fitted normal.
# NOTE: SalePrice is overwritten in place, so re-running this cell without
# reloading the data applies log1p a second time.
train_data["SalePrice"] = np.log1p(train_data["SalePrice"])
(mu, sigma) = norm.fit(train_data['SalePrice'])

plt.figure(figsize = (15, 8))
# Left panel: histogram of the transformed target with the fitted normal curve.
plt.subplot(1,2,1)
sns.distplot(train_data['SalePrice'] , fit = norm);
# The LaTeX part is a raw string so that "\m" and "\s" are not parsed as
# (invalid) escape sequences; the rendered legend text is unchanged.
plt.legend(['Normal dist. \n' + r'$\mu=$ {:.2f} and $\sigma=$ {:.2f}'.format(mu, sigma)],
            loc = 'upper left', ncol = 2)
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Right panel: probability plot against the normal distribution. It uses the
# same 1x2 grid as the left panel, and the distribution is passed through
# `dist` (probplot's `fit` argument is a boolean flag, not a distribution).
plt.subplot(1,2,2)
res = stats.probplot(train_data['SalePrice'], dist=norm, plot=plt)
plt.show()
y = train_data.SalePrice.values

print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))
print("Skewness: %f" % train_data['SalePrice'].skew())
print("Kurtosis: %f" % train_data['SalePrice'].kurt())

In [None]:
# Reduce the skew of strongly skewed numeric columns with a Box-Cox transform.
numeric_cols = all_data.dtypes[all_data.dtypes != "object"].index
skewed_cols = all_data[numeric_cols].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

# Keep only the columns whose absolute skew exceeds 0.75. The filter is applied
# to the Series: masking a DataFrame with a boolean DataFrame keeps every row
# (non-matching cells become NaN), which previously caused every numeric
# column to be transformed.
skewed = pd.DataFrame({'Skewed Columns': skewed_cols[skewed_cols.abs() > 0.75]})

skewed_feats = skewed.index
lam = 0.15
for feat in skewed_feats:
    # boxcox1p(x, lam) followed by the constant +1 shift the notebook applies.
    all_data[feat] = boxcox1p(all_data[feat], lam) + 1

# Scaling and Encoding the Data

In [None]:
# Scale every numeric column with RobustScaler, fitted on those same columns.
robust_scale_cols = all_data.select_dtypes(include=np.number).columns
numeric_block = all_data[robust_scale_cols]
all_data[robust_scale_cols] = RobustScaler().fit(numeric_block).transform(numeric_block)

In [None]:
# Expand the remaining string-valued columns into indicator (dummy) columns.
all_data = pd.get_dummies(data=all_data)

# Fine Tuning Hyperparameters And Training Models

In [None]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)


def rmse(y_train, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_train, y_pred))


# RMSE is an error, so it is registered with greater_is_better=False.
scorer = make_scorer(rmse, greater_is_better=False)

def ran_search(model, grid, n_iter=100, X=None, y=None):
    """Tune ``model`` with a randomized hyperparameter search and return the fitted search.

    Parameters
    ----------
    model : estimator to tune.
    grid : dict mapping parameter names to the candidate values to sample from.
    n_iter : number of parameter settings that are sampled.
    X, y : training features and target. When omitted, the notebook-level
        ``X_train`` and ``y_train`` are used, as before.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object.

    Notes
    -----
    * Candidates are ranked with the notebook's RMSE ``scorer``, which was
      defined next to this function but never passed to the search, so the
      estimator's default score was used instead.
    * The former XGBoost-specific branch is removed. It compared ``model`` to a
      newly constructed ``XGBRegressor`` with ``==``, which is not expected to
      be true for two distinct estimator objects, so every model already went
      through the generic path. That branch also requested early stopping
      without supplying an evaluation set.
    """
    # Resolve the defaults at call time so the function can be defined before
    # the training data exists.
    features = X_train if X is None else X
    target = y_train if y is None else y

    search = RandomizedSearchCV(estimator=model, param_distributions=grid, scoring=scorer,
                                cv=kfold, n_iter=n_iter, n_jobs=4, random_state=0, verbose=True)
    return search.fit(features, target)

In [None]:
# Candidate hyperparameter values sampled by the randomized searches.
ridge_grid = {'alpha' : np.logspace(-1, 2, 500)}
lasso_grid = {'alpha' : np.logspace(-5, -1, 500)}
# l1_ratio is a mixing proportion and must lie in [0, 1]; the previous upper
# exponent of 0.1 produced values up to 10**0.1 (about 1.26), which are invalid.
enet_grid = {'alpha' : np.logspace(-5, .1, 500), 'l1_ratio' : np.logspace(-5, 0, 500)}
kern_grid = {'kernel' : ['polynomial'], 'alpha' : np.logspace(-5, .1, 500),
             'gamma': np.logspace(-5, .1, 500), 'degree' : [1,3,5,7]}
lgbm_grid = {"colsample_bytree": np.linspace(0.2, 0.7, 6), "learning_rate": np.logspace(-3, -1, 100),
             'n_estimators' : range(200, 3000, 200), 'max_depth' : [1, 4, 7, 10]}
grboo_grid = {"max_features": np.linspace(0.2, 0.7, 6), "learning_rate": np.logspace(-5, -1, 100),
             'n_estimators' : range(200, 3000, 200), 'max_depth' : [1, 4, 7, 10]}
xgboo_grid = {'n_estimators': range(200, 3000, 200),'max_depth': [1, 4, 7, 10],'learning_rate': [0.05, 0.1, 0.20],
           'min_child_weight': [1, 10, 100]}

In [None]:
# Split the combined frame back into training and test features using the
# original row indexes, and take the target from the training frame.
train_ids, test_ids = train_data.index, test_data.index
X_train = all_data.loc[train_ids]
X_test = all_data.loc[test_ids]
y_train = train_data.SalePrice

In [None]:
# Remove outliers from the training data: fit a plain linear regression and
# drop every row whose residual has an absolute z-score above 3.
baseline = LinearRegression().fit(X_train, y_train)
residuals = y_train - baseline.predict(X_train)
is_outlier = np.abs(zscore(residuals)) > 3
outliers = residuals[is_outlier].index
X_train = X_train.drop(index=outliers)
y_train = y_train.drop(index=outliers)

In [None]:
# Tune the linear and kernel models; n_iter is left at ran_search's default (100).
ridge = ran_search(Ridge(), ridge_grid)
lasso = ran_search(Lasso(), lasso_grid)
enet = ran_search(ElasticNet(), enet_grid)
kern = ran_search(KernelRidge(), kern_grid)

In [None]:
# Tune LightGBM; n_iter is left at ran_search's default (100).
lgbm = ran_search(LGBMRegressor(), lgbm_grid)

In [None]:
# Tune gradient boosting; n_iter is left at ran_search's default (100).
grboo = ran_search(GradientBoostingRegressor(), grboo_grid)

In [None]:
# Tune XGBoost with the same number of sampled settings as the other models.
xgboo = ran_search(xgb.XGBRegressor(n_jobs=4), xgboo_grid, n_iter=100)

# Creating the Ensemble of Models

In [None]:
# Collect the best estimator found by each search.
models = []
for tuned in (ridge, lasso, enet, kern, lgbm, grboo, xgboo):
    models.append(tuned.best_estimator_)

# Stack them under a Ridge meta-regressor and tune only that regressor's alpha.
stack = StackingCVRegressor(models, Ridge(), cv=kfold)
meta_grid = {"meta_regressor__alpha": np.logspace(-3, -1, 500)}
ensemble_search = ran_search(stack, meta_grid, n_iter=15)

# The tuned stack joins the list of models used for the final prediction.
models.append(ensemble_search.best_estimator_)

# Creating and Exporting Final Prediction

In [None]:
# Predict the test set with every model in `models` and average the
# predictions element-wise.
final = [i.predict(X_test) for i in models]
final_prediction = np.average(final, axis=0)
# The target was transformed with np.log1p before training, so predictions are
# mapped back with its exact inverse, np.expm1 (np.exp overshoots by 1).
my_prediction = pd.DataFrame({"Id": test_data.index, "SalePrice": np.expm1(final_prediction)})

In [None]:
# Display the prediction table (Id and SalePrice columns) as the cell output.
my_prediction

In [None]:
#my_prediction.to_csv('JDC_Prediction2.csv', index = False)