In [114]:
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')

In [115]:
data = pd.read_csv('house_prices_clean.csv')
test = pd.read_csv("house_prices_clean_test.csv")

In [116]:
X_test_1 = test.drop('Id', axis = 1)

In [117]:
X = data.drop(['SalePrice', 'Id'],axis =1)

y = data['SalePrice']

In [118]:
set(X_test_1) - set(X)

{'BsmtFinSF1_is_missing',
 'BsmtFinSF2_is_missing',
 'BsmtFullBath_is_missing',
 'BsmtHalfBath_is_missing',
 'BsmtUnfSF_is_missing',
 'GarageArea_is_missing',
 'GarageCars_is_missing',
 'TotalBsmtSF_is_missing'}

In [None]:
X['GarageCars_is_missing'] = False #treba da se pominat site

In [90]:
X = X.reindex(sorted(X.columns), axis=1)

In [None]:
X_test_1 = X_test_1.reindex(sorted(X_test_1.columns), axis=1)

In [92]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=123)

In [9]:
dtrain = xgb.DMatrix(X, y, feature_names=X.columns.values)

In [5]:
def xgb_r2(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(preds, labels)

In [10]:
# Create evaluation function (the competition uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """
    Calculates Root Mean Squared Log Error between predictions and true labels
    """
    return np.sqrt(mean_squared_log_error(y_test,y_preds))

def rmse(predictions, targets): 

    return np.sqrt(((predictions - targets) ** 2).mean())

def rmsle2(y, y0):
    return np.sqrt(np.mean(np.square(np.log1p(y) - np.log1p(y0))))

# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_test)
    scores = {'Training MAE': mean_absolute_error(y_train,train_preds),
             'Valid MAE': mean_absolute_error(y_test,val_preds),
             'Training R^2': r2_score(y_train,train_preds),
             'Valid R^2': r2_score(y_test,val_preds),
             'Training RMSE': rmse(y_train,train_preds),
             'Valid RMSE': rmse(y_test,val_preds),
             'Training RMSLE2': rmsle2(y_train,train_preds),
             'Valid RMSLE2': rmsle2(y_test,val_preds)
             }
    
    return scores
    

In [11]:
def hyp_xgb(max_depth, subsample, colsample_bytree,min_child_weight, gamma ):
    params = {
    'n_estimators': 300,
    'eta': 0.05,
    'objective': 'reg:squarederror',
    'eval_metric':'mae',
    'silent': 1
     }
    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    scores = xgb.cv(params, dtrain, num_boost_round=1000,verbose_eval=False, early_stopping_rounds=5, feval=xgb_r2, maximize=True, nfold=5)
    return  scores['test-r2-mean'].iloc[-1]

In [21]:
pds ={
  'min_child_weight':(1, 20),
  'gamma':(0, 10),
  'subsample':(0.5, 2),
  'colsample_bytree':(0.1, 2),
  'max_depth': (40, 80)
}


In [22]:
optimizer = BayesianOptimization(hyp_xgb, pds, random_state=123)

In [112]:
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [15]:
print(optimizer.max)

{'target': 0.8703213999999999, 'params': {'colsample_bytree': 0.9245254134344366, 'gamma': 0.21477628589493092, 'max_depth': 49.99866565572221, 'min_child_weight': 2.4469275064453493, 'subsample': 0.6895211071879879}}


In [93]:
clf_xgb=xgb.XGBRegressor(colsample_bytree=0.9245254134344366,
                        gamma=0.21477628589493092,
                        max_depth=int(49.99866565572221),
                        min_child_weight=2.4469275064453493,
                        subsample=0.6895211071879879)

In [60]:
X_train.shape

(1168, 134)

In [61]:
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,SaleType_is_missing,SaleCondition_is_missing,BsmtFinSF1_is_missing,BsmtFinSF2_is_missing,BsmtFullBath_is_missing,BsmtHalfBath_is_missing,BsmtUnfSF_is_missing,GarageArea_is_missing,GarageCars_is_missing,TotalBsmtSF_is_missing
318,319,60,4,90.0,9900,2,0,4,3,1,...,False,False,False,False,False,False,False,False,False,False
580,581,20,4,69.0,14585,2,0,1,4,1,...,False,False,False,False,False,False,False,False,False,False
961,962,60,4,69.0,12227,2,0,1,4,1,...,False,False,False,False,False,False,False,False,False,False
78,79,90,4,72.0,10778,2,0,4,4,1,...,False,False,False,False,False,False,False,False,False,False
5,6,50,4,85.0,14115,2,0,1,4,1,...,False,False,False,False,False,False,False,False,False,False


In [94]:
clf_xgb.fit(X_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9245254134344366,
             gamma=0.21477628589493092, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=49,
             min_child_weight=2.4469275064453493, missing=None,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.6895211071879879, verbosity=1)

In [95]:
show_scores(clf_xgb)

{'Training MAE': 615.488552145762,
 'Valid MAE': 15510.473512414384,
 'Training R^2': 0.9990515291348206,
 'Valid R^2': 0.896207143242311,
 'Training RMSE': 2451.8896257727183,
 'Valid RMSE': 25325.873982331857,
 'Training RMSLE2': 0.010382265055060775,
 'Valid RMSLE2': 0.11937917541563561}

In [109]:
y_preds = clf_xgb.predict(X_test_1)

In [111]:

predicted = pd.DataFrame()
predicted['Id'] = test['Id']
predicted['SalePrice'] = y_preds
predicted.to_csv('predicted.csv',index=False)