## Supervised Regression- House Price Data

Resources:
- Visualize Parameter Search: https://blancas.io/sklearn-evaluation/user_guide/grid_search.html
- Scorer Sklearn and Mass-Histograms: https://www.kaggle.com/liyenhsu/feature-selection-and-ensemble-of-5-models/notebook
- https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

In [1]:
# General
import numpy as np
import pandas as pd
import os
import scipy.stats as st
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Evalaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Grid
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Models
# Linear Regression
from sklearn import linear_model
from sklearn.linear_model import Ridge

# XGBoost
import xgboost as xgb
from xgboost.sklearn import XGBRegressor  


In [2]:
# Load
# train_df = pd.read_csv("../input/feature-engineering-and-pre-processing-house-data/house_train.csv", index_col='Id')
# test_df = pd.read_csv("../input/feature-engineering-and-pre-processing-house-data/house_test.csv", index_col='Id')

# Read
train_df = pd.read_csv("house_train.csv", index_col='Id')
test_df = pd.read_csv("house_train.csv", index_col='Id')

# Log
train_df['SalePrice']= np.log(train_df['SalePrice'])

In [3]:
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,12.247694
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,12.109011
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,12.317167
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,11.849398
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,12.429216


In [4]:
# Hyper-Parameter
n_inter = 25
cv = 5
rstate = 23
score_name = "Root Mean Square Error"

In [5]:
# Define a function to calculate Root Mean Sqaure Error
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

# Define a function to calculate negative RMSE (as a score)
def nrmse(y_true, y_pred):
    return -1.0*rmse(y_true, y_pred)

#neg_rmse = make_scorer(nrmse)
scoring = make_scorer(rmse, greater_is_better=False)

In [6]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# Storage for Model and Results
results = pd.DataFrame(columns=['Model','Para','Test_Score','CV Mean','CV STDEV'])
def save(model, modelname):
    global results
    model.best_estimator_.fit(X, y)
    submission =  np.exp(model.predict(test_df))
    
    df = pd.DataFrame({'Id':test_df.index, 
                        'SalePrice':submission})
    df.to_csv("{}.csv".format(modelname),header=True,index=False)
    
    model.best_estimator_.fit(X_train, y_train)
    top = np.flatnonzero(grid.cv_results_['rank_test_score'] == 1)
    CV_scores = grid.cv_results_['mean_test_score'][top]*-1
    STDev = grid.cv_results_['std_test_score'][top]
    Test_scores = rmse(y_test, model.predict(X_test))
    
    # CV and Save Scores
    results = results.append({'Model': modelname,'Para': model.best_params_,'Test_Score': Test_scores,
                             'CV Mean':CV_scores, 'CV STDEV': STDev}, ignore_index=True)
    
    # Print Evaluation
    print("\nEvaluation Method: {}".format(score_name))
    print("Optimal Model Parameters: {}".format(grid.best_params_))
    print("Training RMSE: ", rmse(y_train, model.predict(X_train)))
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (CV_scores, STDev, modelname))
    print('Test_Score:', Test_scores)


In [7]:
# Indepedent and Dependent
X = train_df.drop(["SalePrice"] , axis=1)
y = train_df["SalePrice"]

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("X_train Shape: {}".format(X_train.shape), "\ny_train Shape: {}".format(y_train.shape),
      "\nX_test Shape: {}".format(X_test.shape), "\ny_test Shape: {}".format(y_test.shape))

print("\nDo Train and Submission Set Columns Match? {}".format(X.columns.equals(test_df.columns)))

X_train Shape: (1168, 74) 
y_train Shape: (1168,) 
X_test Shape: (292, 74) 
y_test Shape: (292,)

Do Train and Submission Set Columns Match? False


# Models
## Linear Regression

In [8]:
linear_model.LinearRegression().get_params().keys()

dict_keys(['fit_intercept', 'normalize', 'n_jobs', 'copy_X'])

In [9]:
model = linear_model.LinearRegression()
score = cross_val_score(model, X_train, y_train, cv=2, scoring=scoring)
print(score.mean())
model.fit(X_train,y_train)
print(rmse(y_test, model.predict(X_test)))

ValueError: could not convert string to float: 'Normal'

## Ridge Regression

In [10]:
Ridge().get_params().keys()

dict_keys(['fit_intercept', 'solver', 'max_iter', 'normalize', 'copy_X', 'tol', 'random_state', 'alpha'])

In [11]:
model = Ridge()

alpha= st.beta(10, 1)
alpha = [1000,100,10, 1, 0.1, 0.01, 0.001,0.0001]
alpha = np.logspace(4,-4,10)

param_grid = {'alpha': alpha}

grid = RandomizedSearchCV(model, param_grid,
                          cv=cv, verbose=1, scoring=scoring,
                         n_iter=len(alpha)-1, random_state=rstate)

grid.fit(X_train, y_train)
save(grid, "Ridge")

Fitting 5 folds for each of 9 candidates, totalling 45 fits


ValueError: could not convert string to float: 'Normal'

In [12]:
report(grid.cv_results_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'cv_results_'

## XGBoost

In [13]:
# Human Analog Model
# https://www.kaggle.com/humananalog/xgboost-lasso/code
regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

regr.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields MSZoning, Street, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, KitchenQual, Functional, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, SaleType, SaleCondition

In [None]:
xgb.plot_importance(regr)

In [None]:
# Run prediction on training set to get a rough idea of how well it does.
y_pred = regr.predict(X_train)
print("XGBoost score on training set: ", rmse(y_train, y_pred))

In [None]:
regr.evals_result

In [None]:
#save()

In [None]:
# y_pred = np.exp(pred_xgb)

# pred_df = pd.DataFrame(y_pred, index=test_df.index, columns=["SalePrice"])
# #pred_df.to_csv('output.csv', header=True, index_label='Id')
# pred_df.to_csv((os.path.join(path, r"submission/output.csv")),header=True, index_label='Id')

### Nick's Optimized XGBoost

- General Parameters: Guide the overall functioning
- Booster Parameters: Guide the individual booster (tree/regression) at each step
- Learning Task Parameters: Guide the optimization performed



## Sklearn XGBOOST

In [None]:
one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "learning_rate": st.uniform(0.001, 0.3),
    "colsample_bytree": one_to_left,
    "max_depth": st.randint(1, 40),
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "n_estimators": st.randint(3, 100),
    "min_child_weight": from_zero_positive,
    "subsample": one_to_left
}

model = XGBRegressor(nthread=-1)

grid = RandomizedSearchCV(model, params, cv=cv,
                          scoring=scoring,
                          n_jobs=1, verbose=1,
                         n_iter=5)  


grid.fit(X_train, y_train)
save(grid, "XGboostRand")

In [None]:
np.logspace(2,-4,6), np.arange(50, 301, 25)

In [None]:
results

### GridSearchCV

In [None]:
from xgboost.sklearn import XGBRegressor  
import scipy.stats as st

one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "learning_rate": st.uniform(0.001, 0.3),
    "colsample_bytree": one_to_left,
    "max_depth": st.randint(1, 40),
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "n_estimators": st.randint(3, 100),
    "min_child_weight": from_zero_positive,
    "subsample": one_to_left
}

model = XGBRegressor(nthread=-1)

grid = RandomizedSearchCV(model, params, cv=cv,
                          scoring=scoring,
                          n_jobs=1, verbose=1)  


grid.fit(X_train, y_train)
save(grid.best_estimator_, "XGboostGrid")

## Regularized Linear Models

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 6


alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

# #############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.plot(lasso.coef_, color='gold', linewidth=2,
         label='Lasso coefficients')
#plt.plot(coef, '--', color='navy', label='original coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))
plt.show()

In [None]:
results