In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
%matplotlib inline

In [None]:
# Load the competition's train and test CSV files and preview the training rows.
input_dir = '../input/house-prices-advanced-regression-techniques'
train = pd.read_csv(input_dir + '/train.csv')
test = pd.read_csv(input_dir + '/test.csv')
train.head()

In [None]:
# Sanity check: count how many rows share an Id with another row.
idsTotal = len(train)
idsUnique = len(set(train["Id"]))
idsDupli = idsTotal - idsUnique
print("There are {} duplicate IDs for {} total entries".format(idsDupli, idsTotal))

In [None]:
# Keep the test ids for the submission file, then remove the 'Id' column
# from both frames since it is unnecessary for the prediction process.
test_ID = test['Id']

train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [None]:
# Summarize the training frame (pandas prints column dtypes and non-null counts).
train.info()

In [None]:
#corr = train.corr()

# Feature correlation (disabled draft of the heatmap)
#f,ax = plt.subplots(figsize=(18, 18))
#sns.heatmap(corr, annot=True, linewidths=.5, fmt= '.1f',ax=ax)
#plt.show()

# Find the 10 most influential features
#corr.sort_values(["SalePrice"], ascending = False, inplace = True)
#print(corr.SalePrice)

In [None]:
# Lower-triangle heatmap of pairwise correlations between the numeric columns.
fig = plt.figure(figsize = (24, 12))

# Select numeric columns explicitly: pandas >= 2.0 raises on string columns
# in DataFrame.corr() instead of silently skipping them.
corr = train.select_dtypes(include=[np.number]).corr()

# Mask the diagonal and the upper triangle, which only mirror the lower one.
# The builtin `bool` replaces the `np.bool` alias that NumPy 1.24 removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask = mask, cmap = 'PiYG', annot = True, fmt=".2f")

plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.title('Correlation Matrix for Train Data', fontsize = 15)
plt.show()

In [None]:
# Feature correlation: pairwise scatter plots (disabled)

#sns.pairplot(train[['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 
#                    'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']])

In [None]:
# Remove outliers: rows with GrLivArea above 4000 but SalePrice below 300000.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
outlier_index = train.loc[is_outlier].index
train = train.drop(index=outlier_index)

In [None]:
# Log-transform the target with log(1 + x); predictions are mapped back
# with np.expm1 further down in the notebook.
train = train.assign(SalePrice=np.log1p(train["SalePrice"]))

In [None]:
# Combine train and test into one frame so the feature engineering below is
# applied to both at once; the target is set aside first.
ntrain, ntest = len(train), len(test)
y_train = train.pop('SalePrice').values
all_data = pd.concat([train, test], ignore_index=True)
print("all_data size is : {}".format(all_data.shape))

In [None]:
# Missing-data report: null count and null fraction per column, largest first.
null_mask = all_data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
# Relative frequency of each Exterior2nd category.
all_data['Exterior2nd'].value_counts(normalize=True)

In [None]:
# Imputing missing values

# Categorical columns where a missing value is labelled 'NA'.
na_label_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageCond', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
]
for col in na_label_columns:
    all_data[col] = all_data[col].fillna('NA')

# Categorical columns filled with a fixed, column-specific label.
fixed_label_fill = {
    'MasVnrType': 'None',
    'KitchenQual': 'TA',
    'Exterior2nd': 'Other',
}
for col, label in fixed_label_fill.items():
    all_data[col] = all_data[col].fillna(label)

# Categorical columns filled with their most frequent value.
most_frequent_columns = [
    'Electrical', 'MSZoning', 'Utilities', 'Functional', 'SaleType', 'Exterior1st',
]
for col in most_frequent_columns:
    all_data[col] = all_data[col].fillna(all_data[col].value_counts().idxmax())

# Numeric columns filled with 0.
# GarageYrBlt was previously filled with the string 'NA', which mixes strings
# into a year column (presumably numeric -- confirm against the CSV); 0 keeps
# a single dtype. The column is dropped later in the notebook, so downstream
# results are unaffected.
zero_fill_columns = [
    'GarageYrBlt', 'MasVnrArea', 'LotFrontage',
    'BsmtFullBath', 'BsmtHalfBath',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFinSF2', 'BsmtFinSF1',
    'GarageCars', 'GarageArea',
]
for col in zero_fill_columns:
    all_data[col] = all_data[col].fillna(0)

In [None]:
# Cast these columns to object dtype so they are handled as categorical
# features (one-hot encoded by get_dummies later) instead of numeric ones:
# building class, overall condition, build / remodel years, year and month sold.
categorical_code_columns = [
    'MSSubClass',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
    'MoSold',
]
for column in categorical_code_columns:
    all_data[column] = all_data[column].astype('object')


In [None]:
# Drop strongly correlated columns.
correlated_columns = ['1stFlrSF', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea']
all_data = all_data.drop(columns=correlated_columns)

In [None]:
# Add a column: tot_sf = TotalBsmtSF + GrLivArea.
all_data = all_data.assign(tot_sf=all_data['TotalBsmtSF'].add(all_data['GrLivArea']))

In [None]:
# Transform skewed features: apply log(1 + x) to every non-object column
# whose absolute skewness exceeds 0.75.
numerical_features = all_data.select_dtypes(exclude=["object"]).columns
skewness = all_data[numerical_features].skew()
is_skewed = skewness.abs() > 0.75
skewed_features = skewness.loc[is_skewed].index
all_data[skewed_features] = np.log1p(all_data[skewed_features])

In [None]:
# Convert categorical features into indicator (dummy) columns, one per category.

all_data = pd.get_dummies(all_data)
print(all_data.shape)

In [None]:
# Split the combined frame back into its train rows (first ntrain) and test rows.
X_train = all_data.iloc[:ntrain]
X_test = all_data.iloc[ntrain:]

# **Modeling**

In [None]:
# Feature normalization (same scale): the scaler is fitted on the train rows
# only and then applied to both train and test.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    """Return the per-fold RMSE of `model` from 5-fold cross-validation.

    The model is scored on the notebook-level `X_train` / `y_train` with the
    "neg_mean_squared_error" scorer; each fold score is negated and
    square-rooted.
    """
    fold_neg_mse = cross_val_score(model, X_train, y_train,
                                   scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-fold_neg_mse)

## Ridge

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

# Mean cross-validated RMSE of Ridge for each candidate alpha.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = []
for alpha in alphas:
    cv_ridge.append(rmse_cv(Ridge(alpha=alpha)).mean())

In [None]:
# Display the mean CV RMSE values, in the same order as `alphas`.
cv_ridge

In [None]:
# Plot mean CV RMSE against alpha.
cv_ridge = pd.Series(cv_ridge, index=alphas)
ax = cv_ridge.plot(title="Validation - Just Do It")
ax.set_xlabel("alpha")
ax.set_ylabel("rmse")

In [None]:
 # Lowest mean CV RMSE over the alpha grid.
 min(cv_ridge)

## Lasso

In [None]:
# Let LassoCV pick among four candidate alphas, then report its mean CV RMSE.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005], cv=5)
model_lasso.fit(X_train, y_train)
rmse_cv(model_lasso).mean()

In [None]:
# Refit Lasso with alpha fixed at 0.001 and report its mean CV RMSE.
model_lasso = LassoCV(alphas=[0.001], cv=5)
model_lasso.fit(X_train, y_train)
rmse_cv(model_lasso).mean()

In [None]:
# Predict on the test rows and undo the log1p transform of the target.
lasso_log_preds = model_lasso.predict(X_test)
lasso_preds = np.expm1(lasso_log_preds)
solution = pd.DataFrame({"id": test_ID, "SalePrice": lasso_preds})
# solution.to_csv("lasso.csv", index = False)

## Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting candidate; its CV evaluation is left disabled below.
clf = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    random_state=241,
)
# rmse_cv(clf).mean()

## Neural net

In [None]:
from sklearn.neural_network import MLPRegressor

# Neural-network candidate (three hidden layers); its CV evaluation is left disabled below.
clf = MLPRegressor(
    hidden_layer_sizes=(30, 20, 15),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
# rmse_cv(clf).mean()

## Ensemble

In [None]:
# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

In [None]:
# Setup cross validation folds: 12 shuffled splits with a fixed seed,
# shared by cv_rmse and the RidgeCV pipeline below.
kf = KFold(n_splits=12, random_state=42, shuffle=True)

In [None]:
# Define error metrics
def rmsle(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    No logarithm is taken here: the notebook log1p-transforms SalePrice up
    front, so on those values this RMSE is the RMSLE of the raw prices.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X_train=X_train):
    """Return the per-fold RMSE of `model` cross-validated on the `kf` splits.

    `X_train` defaults to the notebook-level feature matrix as it existed
    when this function was defined; the target is always the global `y_train`.
    """
    fold_neg_mse = cross_val_score(model, X_train, y_train,
                                   scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-fold_neg_mse)

In [None]:
# Setup models

# Light Gradient Boosting Regressor
lightgbm_params = dict(
    objective='regression',
    num_leaves=6,
    learning_rate=0.01,
    n_estimators=4000,
    max_bin=200,
    bagging_fraction=0.8,
    bagging_freq=4,
    bagging_seed=8,
    feature_fraction=0.2,
    feature_fraction_seed=8,
    min_sum_hessian_in_leaf=11,
    verbose=-1,
    random_state=42,
)
lightgbm = LGBMRegressor(**lightgbm_params)

# XGBoost Regressor
# 'reg:squarederror' is the current name of the squared-error objective; the
# former alias 'reg:linear' has been deprecated since XGBoost 0.90.
# NOTE(review): `seed` and `random_state` are both passed with different
# values; which one takes effect depends on the XGBoost version -- consider
# keeping only one of them.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Ridge Regressor: RidgeCV selects alpha from this grid using the shared kf splits.
ridge_alphas = [
    1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1,
    0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100,
]
ridge_cv = RidgeCV(alphas=ridge_alphas, cv=kf)
ridge = make_pipeline(RobustScaler(), ridge_cv)

# Support Vector Regressor
svr_estimator = SVR(C=20, epsilon=0.008, gamma=0.0003)
svr = make_pipeline(RobustScaler(), svr_estimator)

# Gradient Boosting Regressor
gbr_params = dict(
    n_estimators=6000,
    learning_rate=0.01,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params)

# Random Forest Regressor
rf_params = dict(
    n_estimators=1200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features=None,
    oob_score=True,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)

# Stack up all the models above; xgboost doubles as the meta-regressor and
# also receives the original features (use_features_in_secondary=True).
base_regressors = (xgboost, lightgbm, svr, ridge, gbr, rf)
stack_gen = StackingCVRegressor(
    regressors=base_regressors,
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

### Get cross validation scores for each model

In [None]:
# Collect (mean, std) of the cross-validated RMSE per model, starting with LightGBM.
scores = {}

score = cv_rmse(lightgbm)
score_mean, score_std = score.mean(), score.std()
print("lightgbm: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['lgb'] = (score_mean, score_std)

In [None]:
# Cross-validated RMSE of XGBoost: print mean (std) and record it.
score = cv_rmse(xgboost)
score_mean, score_std = score.mean(), score.std()
print("xgboost: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['xgb'] = (score_mean, score_std)

In [None]:
# Cross-validated RMSE of the SVR pipeline: print mean (std) and record it.
score = cv_rmse(svr)
score_mean, score_std = score.mean(), score.std()
print("SVR: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['svr'] = (score_mean, score_std)

In [None]:
# Cross-validated RMSE of the Ridge pipeline: print mean (std) and record it.
score = cv_rmse(ridge)
score_mean, score_std = score.mean(), score.std()
print("ridge: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['ridge'] = (score_mean, score_std)

In [None]:
# Cross-validated RMSE of the random forest: print mean (std) and record it.
score = cv_rmse(rf)
score_mean, score_std = score.mean(), score.std()
print("rf: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['rf'] = (score_mean, score_std)

In [None]:
# Cross-validated RMSE of gradient boosting: print mean (std) and record it.
score = cv_rmse(gbr)
score_mean, score_std = score.mean(), score.std()
print("gbr: {:.4f} ({:.4f})".format(score_mean, score_std))
scores['gbr'] = (score_mean, score_std)

### Fit the models

In [None]:
# Fit the stacked ensemble on the full training set, passed as NumPy arrays.
print("stack_gen")
stack_gen.fit(np.array(X_train), np.array(y_train))
stack_gen_model = stack_gen  # fit() returns the estimator itself

In [None]:
# Fit LightGBM on the full training set.
print("lightgbm")
lightgbm.fit(X_train, y_train)
lgb_model_full_data = lightgbm  # fit() returns the estimator itself

In [None]:
# Fit XGBoost on the full training set.
print("xgboost")
xgboost.fit(X_train, y_train)
xgb_model_full_data = xgboost  # fit() returns the estimator itself

In [None]:
# Fit the SVR pipeline on the full training set.
print("Svr")
svr.fit(X_train, y_train)
svr_model_full_data = svr  # fit() returns the pipeline itself

In [None]:
# Fit the Ridge pipeline on the full training set.
print("Ridge")
ridge.fit(X_train, y_train)
ridge_model_full_data = ridge  # fit() returns the pipeline itself

In [None]:
# Fit the random forest on the full training set.
print("RandomForest")
rf.fit(X_train, y_train)
rf_model_full_data = rf  # fit() returns the estimator itself

In [None]:
# Fit gradient boosting on the full training set.
print("GradientBoosting")
gbr.fit(X_train, y_train)
gbr_model_full_data = gbr  # fit() returns the estimator itself

### Blend models and get predictions

In [None]:
# Blend models in order to make the final predictions more robust to overfitting
def blended_predictions(X):
    """Return the weighted sum of every fitted model's predictions for `X`.

    Weights (they add up to 1.0): ridge 0.1, svr 0.2, gbr 0.1, xgboost 0.1,
    lightgbm 0.1, random forest 0.05, stacked ensemble 0.35. The stacked
    model receives `np.array(X)`; all other models receive `X` unchanged.
    """
    blend = 0.1 * ridge_model_full_data.predict(X)
    blend = blend + 0.2 * svr_model_full_data.predict(X)
    blend = blend + 0.1 * gbr_model_full_data.predict(X)
    blend = blend + 0.1 * xgb_model_full_data.predict(X)
    blend = blend + 0.1 * lgb_model_full_data.predict(X)
    blend = blend + 0.05 * rf_model_full_data.predict(X)
    blend = blend + 0.35 * stack_gen_model.predict(np.array(X))
    return blend

In [None]:
# Get final predictions from the blended model and score them on the
# training data (a training error, not a validation score).
train_blend = blended_predictions(X_train)
blended_score = rmsle(y_train, train_blend)
scores['blended'] = (blended_score, 0)
print('RMSLE score on train data:')
print(blended_score)

In [None]:
# Map the blended log-scale predictions back to prices, round them down,
# and write the submission file.
test_blend = blended_predictions(X_test)
ensemble_preds = np.floor(np.expm1(test_blend))
solution = pd.DataFrame({"id": test_ID, "SalePrice": ensemble_preds})
solution.to_csv("submission.csv", index=False)