In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [102]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, low_cardinality_cols)
    ])

# Define model
model = XGBRegressor(silent=True,
                            scale_pos_weight=1,
                            learning_rate=0.01,
                            colsample_bytree = 0.44,
                            subsample = 0.86,
                            n_estimators=5000,
                            reg_alpha = 0.01,
                            max_depth=5,
                            gamma=4,
                            n_jobs=4,
                            reg_lambda=1.01)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

preds_test = clf.predict(X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('../output/submission.csv', index=False)

  if getattr(data, 'base', None) is not None and \


MAE: 15550.038554152397


In [458]:
from xgboost import XGBRegressor

# Define the model
my_model_1 = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)

# Fit the model
my_model_1.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [92]:
#### Tunned XGBoostRegressor

from xgboost import XGBRegressor

# my_model_1 = XGBRegressor(silent=True, 
#                       scale_pos_weight=1,
#                       learning_rate=0.03,  
#                       colsample_bytree = 0.33,
#                       subsample = 1,
#                       n_estimators=300, 
#                       reg_alpha = 0.2,
#                       max_depth=4, 
#                       gamma=10,
#                       n_jobs=4,
#                       reg_lambda=0.9,
#                       min_child_weight=1)

my_model_1 = XGBRegressor(silent=True,
                            scale_pos_weight=1,
                            learning_rate=0.01,
                            colsample_bytree = 0.44,
                            subsample = 0.86,
                            n_estimators=5000,
                            reg_alpha = 0.01,
                            max_depth=5,
                            gamma=4,
                            n_jobs=4,
                            reg_lambda=1.01)

# Fit the model
my_model_1.fit(X_train, y_train, 
             early_stopping_rounds=100, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.44, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=5000,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.01, reg_lambda=1.01, scale_pos_weight=1, seed=None,
             silent=True, subsample=0.86, verbosity=1)

In [93]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5, random_state=42, shuffle=True)
scores = -1 * cross_val_score(my_model_1, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
print(scores)
print("Average MAE score:", scores.mean())
# print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Get predictions
predictions_1 = my_model_1.predict(X_valid)
# Calculate MAE
mae_1 = mean_absolute_error(y_valid, predictions_1)
# Print MAE
print("Mean Absolute Error:" , mae_1)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


[17664.16870326 13749.81418603 16570.60692107 14022.1046305
 15409.59527562]
Average MAE score: 15483.257943295826
Mean Absolute Error: 15552.810453232021


In [469]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def get_scores(model, X_train, X_valid, y_train, y_valid):
    # Fit the model
    model.fit(X_train, y_train, 
                 early_stopping_rounds=100, 
                 eval_set=[(X_valid, y_valid)], 
                 verbose=False)
    # Get predictions
    preds_val = model.predict(X_valid)
    # Calculate MAE
    mae = mean_absolute_error(y_valid, preds_val)
    
    return mae

In [470]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def get_cross_val_scores(model, X_train, X_valid, y_train, y_valid):
    X = X_train + X_valid
    y = y_train + y_valid
    # Fit the model
#     model.fit(X_train, y_train, 
#                  early_stopping_rounds=5, 
#                  eval_set=[(X_valid, y_valid)], 
#                  verbose=False)
    
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    scores = -1 * cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    return scores.mean()

In [480]:
# XGBoost Tunning
score_statistic = {}

for parameter in range(1, 10):
    parameter *= 0.1
    print(parameter)
    model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, 
             importance_type='gain', learning_rate=0.01, max_delta_step=0, max_depth=12, min_child_weight=1, missing=None, 
             n_estimators=1000, n_jobs=4, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0.3, reg_lambda=1.01, 
             scale_pos_weight=1, seed=None, silent=None, subsample=parameter, verbosity=1, eval_metric='mae')

    scores = get_cross_val_scores(model, X_train, X_valid, y_train, y_valid)
    score_statistic[parameter] = scores
    print('Parameter is {}. Mae is {}'.format(str(parameter), str(scores)))

0.1


  if getattr(data, 'base', None) is not None and \




KeyboardInterrupt: 

In [474]:
min_mae_key = -1
min_mae_value = 0
for key, value in score_statistic.items():
    if min_mae_key == -1:
        min_mae_key = key
        min_mae_value = value
    elif min_mae_value > value:
            min_mae_key = key
            min_mae_value = value
    print(key, value)
print()
print('Best score is {}: {}'.format(str(min_mae_key), str(min_mae_value)))

0.01 20429.66263821829
0.02 16856.92099813206
0.03 16376.894323241937
0.04 17465.965854963793
0.05 18123.076700947553
0.06 18632.32221545173
0.07 18899.727346418564
0.08 19942.174797600877
0.09 20022.76209687288

Best score is 0.03: 16376.894323241937


In [62]:
# Use GridSearchCV

from sklearn.model_selection import GridSearchCV

xgb_model = XGBRegressor(silent=True,
                            scale_pos_weight=1,
                            learning_rate=0.01,
                            colsample_bytree = 0.44,
                            subsample = 0.86,
                            n_estimators=5000,
                            reg_alpha = 0.01,
                            max_depth=5,
                            gamma=4,
                            n_jobs=4,
                            reg_lambda=1.01)

test_params = {
    'reg_lambda':[1.15, 1.2, 1.3]
}

model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
model.fit(X_train, y_train)

print(model.cv_results_)
print(model.best_params_)
print(model.best_score_)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


{'mean_fit_time': array([11.29557657, 11.19309735, 11.3102173 ]), 'std_fit_time': array([0.11107089, 0.04195917, 0.09724857]), 'mean_score_time': array([0.06929763, 0.0666941 , 0.06571754]), 'std_score_time': array([0.00079668, 0.00121722, 0.00046019]), 'param_reg_lambda': masked_array(data=[1.15, 1.2, 1.3],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'reg_lambda': 1.15}, {'reg_lambda': 1.2}, {'reg_lambda': 1.3}], 'split0_test_score': array([0.83255821, 0.83414123, 0.83405723]), 'split1_test_score': array([0.88999092, 0.88993124, 0.88954497]), 'split2_test_score': array([0.90309396, 0.90251676, 0.90269865]), 'mean_test_score': array([0.87517784, 0.87549431, 0.87539819]), 'std_test_score': array([0.03064542, 0.02972563, 0.02975813]), 'rank_test_score': array([3, 1, 2])}
{'reg_lambda': 1.2}
0.8754943080181737


In [65]:
# Tunned XGBoostRegressor

from xgboost import XGBRegressor

# Model 1 score is 13922.76305
# model_1 = XGBRegressor(
#                  learning_rate =0.01,
#                  n_estimators=5000,
#                  max_depth=6,
#                  min_child_weight=0,
#                  gamma=0,
#                  subsample=0.7,
#                  colsample_bytree=0.7,
#                  objective= 'reg:linear',
#                  nthread=4,
#                  scale_pos_weight=1,
#                  seed=27,
#                  reg_alpha=0.01)

# Model 1 score is 13875.94723
# model_2 = XGBRegressor(silent=True,
#                             scale_pos_weight=1,
#                             learning_rate=0.045,
#                             colsample_bytree = 0.44,
#                             subsample = 0.78,
#                             n_estimators=1000,
#                             reg_alpha = 0.1,
#                             max_depth=5,
#                             gamma=10,
#                             n_jobs=4,
#                             reg_lambda=1.01)

# Model_3 score is 13757.60170
# model_3 = XGBRegressor(silent=True,
#                             scale_pos_weight=1,
#                             learning_rate=0.01,
#                             colsample_bytree = 0.44,
#                             subsample = 0.86,
#                             n_estimators=5000,
#                             reg_alpha = 0.1,
#                             max_depth=5,
#                             gamma=10,
#                             n_jobs=4,
#                             reg_lambda=1.01)

# Model 6 score is 13754.19725
# model_6 = XGBRegressor(silent=True,
#                             scale_pos_weight=1,
#                             learning_rate=0.01,
#                             colsample_bytree = 0.44,
#                             subsample = 0.86,
#                             n_estimators=5000,
#                             reg_alpha = 0.01,
#                             max_depth=5,
#                             gamma=4,
#                             n_jobs=4,
#                             reg_lambda=1.01)

my_model = XGBRegressor(silent=True,
                            scale_pos_weight=1,
                            learning_rate=0.01,
                            colsample_bytree = 0.44,
                            subsample = 0.86,
                            n_estimators=5000,
                            reg_alpha = 0.01,
                            max_depth=5,
                            gamma=4,
                            n_jobs=4,
                            reg_lambda=1.01)

# Fit the model
my_model.fit(X_train, y_train, 
             early_stopping_rounds=500, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.44, gamma=4,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=5000,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.01, reg_lambda=1.2, scale_pos_weight=1, seed=None,
             silent=True, subsample=0.86, verbosity=1)

In [66]:
preds_test = my_model.predict(X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('../output/submission.csv', index=False)