In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
import time
from sklearn import preprocessing
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge, LassoCV,LassoLarsCV, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from scipy.stats import skew

def create_submission(prediction,score):
    """Write `prediction` to a timestamped submission CSV in the working directory.

    The file name has the form ``submission_<score>_<YYYY-MM-DD-HH-MM>.csv``.
    The CSV has two columns: ``Id``, taken from the module-level ``test``
    frame, and ``SalePrice``, taken from `prediction` as given.

    Parameters
    ----------
    prediction : array-like
        Values written to the ``SalePrice`` column; must line up with ``test['Id']``.
    score : any
        Embedded in the file name via ``str(score)``.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = '_'.join(['submission', str(score), timestamp]) + '.csv'
    print ('Creating submission: ', sub_file)
    # NOTE: `test` is a notebook-level global, not a parameter of this function.
    submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction})
    submission.to_csv(sub_file, index=False)

# train need to be test when do test prediction
def data_preprocess(train,test):
    """Build model-ready feature matrices and a log-transformed target.

    Steps, in order:
      1. Drop a fixed list of outlier rows (selected by position) from `train`.
      2. Stack the 'MSSubClass'..'SaleCondition' columns of train and test.
      3. Remove the columns listed in ``to_delete``.
      4. Apply ``log1p`` to every numeric column whose skewness, measured on
         the remaining training rows, exceeds 0.75.
      5. One-hot encode with ``pd.get_dummies`` and fill remaining NaNs with
         the column means of the stacked frame.
      6. Split the stacked frame back into its train and test parts.

    Neither `train` nor `test` is modified, so calling this function twice on
    the same frames yields the same result.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns 'MSSubClass' through 'SaleCondition' plus
        'SalePrice', and enough rows for every position in the outlier list
        (the largest position is 1447).
    test : pandas.DataFrame
        Needs the columns 'MSSubClass' through 'SaleCondition'.

    Returns
    -------
    X_train : pandas.DataFrame
        Encoded features for the retained training rows.
    X_test : pandas.DataFrame
        Encoded features for the test rows.
    y : pandas.Series
        ``log1p`` of 'SalePrice' for the retained training rows.
    """
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    # drop() without inplace returns a new frame, leaving the caller's `train`
    # untouched; an in-place drop would remove different rows on a second call.
    train_kept = train.drop(train.index[outlier_idx])
    all_data = pd.concat((train_kept.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    # Log-transform skewed numeric features. Selecting on np.number (rather
    # than "not object") keeps string-typed columns out of the skew computation.
    numeric_feats = all_data.select_dtypes(include=[np.number]).columns
    skewness = train_kept[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())

    # The stacked frame holds the training rows first, so split by position.
    n_train = train_kept.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    # Compute the target without writing the transform back into the input,
    # so the target is never log-transformed twice.
    y = np.log1p(train_kept["SalePrice"])

    return X_train,X_test,y





In [2]:
# Read the raw train/test CSVs from the working directory, then build the
# feature matrices and the target used by every model cell below.
# `test` is also read as a global by create_submission, so keep this name.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
Xtrain,Xtest,ytrain = data_preprocess(train,test)

In [3]:
# Peek at the first rows of the processed training feature matrix.
Xtrain.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.110874,4.189655,9.04204,7,5,2003,2003,5.283204,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,3.044522,4.394449,9.169623,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,4.110874,4.234107,9.328212,7,5,2001,2002,5.09375,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,4.26268,4.110874,9.164401,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
5,3.931826,4.454347,9.555064,5,5,1993,1995,0.0,732.0,0.0,...,0,0,0,1,0,0,0,0,1,0


## GridSearchCV
- Exhaustive search over specified parameter values for an estimator.
- Important members are fit, predict.
- GridSearchCV implements a “fit” and a “score” method. 
- It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.
- The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.

In [4]:
def mean_squared_error_(ground_truth, predictions):
    """Return the root of the mean squared error between the two inputs.

    Despite its name, this returns the square root of
    ``mean_squared_error(ground_truth, predictions)``, not the MSE itself.
    """
    mse = mean_squared_error(ground_truth, predictions)
    return mse ** 0.5


# Scorer built from the function above with greater_is_better=False; it is
# passed as the `scoring=` argument of the grid searches in this notebook.
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)

In [5]:
def model_random_forecast(Xtrain, Xtest, ytrain):
    """Grid-search a RandomForestRegressor and predict on `Xtest`.

    Runs a 10-fold GridSearchCV over ``max_features`` and ``max_depth``
    (with 500 trees) using the notebook-level ``RMSE`` scorer, prints the
    best parameters and the sign-flipped best CV score, and returns the
    predictions of the fitted search object.

    Returns
    -------
    (y_pred, cv_score)
        Predictions for `Xtest` and ``-best_score_`` of the search.
    """
    search_space = {
        'n_estimators':[500],
        'max_features':[10,15,20,25],
        'max_depth':[3,5,7,9,11]
    }
    forest = RandomForestRegressor(n_jobs=1, random_state=0)
    search = GridSearchCV(
        estimator=forest,
        param_grid=search_space,
        n_jobs=1,
        cv=10,
        scoring=RMSE,
    )
    search.fit(Xtrain, ytrain)

    # best_score_ is negated here before being printed and returned.
    best_cv_score = -search.best_score_
    report = (
        'Random forecast regression...',
        'Best Params:',
        search.best_params_,
        'Best CV Score:',
        best_cv_score,
    )
    for entry in report:
        print(entry)

    return search.predict(Xtest), best_cv_score

In [None]:
# Tune the random forest; keeps its predictions for Xtest and its CV score.
test_predict,score = model_random_forecast(Xtrain, Xtest, ytrain)

In [None]:
def model_gradient_boosting_tree(Xtrain,Xtest,ytrain):
    """Grid-search a GradientBoostingRegressor and predict on `Xtest`.

    Runs a 10-fold GridSearchCV over ``max_features``, ``max_depth`` and
    ``learning_rate`` (500 estimators, subsample 0.8) using the
    notebook-level ``RMSE`` scorer. Prints the best parameters and the
    sign-flipped best CV score.

    Returns
    -------
    (y_pred, cv_score)
        Predictions for `Xtest` and ``-best_score_`` of the search.
    """
    search_space = {
        'n_estimators': [500],
        'max_features': [10,15],
        'max_depth': [6,8,10],
        'learning_rate': [0.05,0.1,0.15],
        'subsample': [0.8]
    }
    search = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=0),
        param_grid=search_space,
        n_jobs=1,
        cv=10,
        scoring=RMSE,
    )
    search.fit(Xtrain, ytrain)

    # best_score_ is negated here before being printed and returned.
    best_cv_score = -search.best_score_
    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(search.best_params_)
    print('Best CV Score:')
    print(best_cv_score)

    predictions = search.predict(Xtest)
    return predictions, best_cv_score

In [None]:
# Tune gradient boosting; overwrites test_predict/score from any earlier model cell.
test_predict,score = model_gradient_boosting_tree(Xtrain, Xtest, ytrain)

In [None]:
def model_xgb_regression(Xtrain,Xtest,ytrain):
    """Grid-search an XGBRegressor and predict on `Xtest`.

    Runs a 10-fold GridSearchCV over ``max_depth`` and ``colsample_bytree``
    (500 estimators, learning rate 0.05, subsample 0.8) using the
    notebook-level ``RMSE`` scorer. Prints the best parameters and the
    sign-flipped best CV score.

    Returns
    -------
    (y_pred, cv_score)
        Predictions for `Xtest` and ``-best_score_`` of the search.
    """
    booster = xgb.XGBRegressor(seed=0)
    search_space = {
        'n_estimators': [500],
        'learning_rate': [ 0.05],
        'max_depth': [ 7, 9, 11],
        'subsample': [ 0.8],
        'colsample_bytree': [0.75,0.8,0.85],
    }
    search = GridSearchCV(
        estimator=booster,
        param_grid=search_space,
        n_jobs=1,
        cv=10,
        scoring=RMSE,
    )
    search.fit(Xtrain, ytrain)

    # best_score_ is negated here before being printed and returned.
    best_cv_score = -search.best_score_
    for entry in ('eXtreme Gradient Boosting regression...',
                  'Best Params:',
                  search.best_params_,
                  'Best CV Score:',
                  best_cv_score):
        print(entry)

    return search.predict(Xtest), best_cv_score


In [None]:
# Tune XGBoost; overwrites test_predict/score from any earlier model cell.
test_predict,score = model_xgb_regression(Xtrain, Xtest, ytrain)

In [18]:
def model_extra_trees_regression(Xtrain, Xtest, ytrain):
    """Grid-search an ExtraTreesRegressor and predict on `Xtest`.

    Runs a 10-fold GridSearchCV over ``max_features`` (with 500 trees) using
    the notebook-level ``RMSE`` scorer. Prints the best parameters and the
    sign-flipped best CV score.

    Returns
    -------
    (y_pred, cv_score)
        Predictions for `Xtest` and ``-best_score_`` of the search.
    """
    search_space = {
        'n_estimators': [500],
        'max_features': [10,15,20]
    }
    search = GridSearchCV(
        estimator=ExtraTreesRegressor(n_jobs=1, random_state=0),
        param_grid=search_space,
        n_jobs=1,
        cv=10,
        scoring=RMSE,
    )
    search.fit(Xtrain, ytrain)

    # best_score_ is negated here before being printed and returned.
    best_cv_score = -search.best_score_
    print('Extra trees regression...')
    print('Best Params:')
    print(search.best_params_)
    print('Best CV Score:')
    print(best_cv_score)

    predictions = search.predict(Xtest)
    return predictions, best_cv_score
    

In [19]:
# Tune extra trees; overwrites test_predict/score from any earlier model cell.
test_predict,score = model_extra_trees_regression(Xtrain, Xtest, ytrain)

Extra trees regression...
Best Params:
{'max_features': 20, 'n_estimators': 500}
Best CV Score:
0.13919812106


0    12.247699
1    12.109016
2    12.317171
3    11.849405
5    11.870607
Name: SalePrice, dtype: float64