In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

from xgboost import XGBRegressor

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Single seed reused by the stochastic steps of this notebook
# (IterativeImputer, QuantileTransformer and the train/CV split).
RANDOM_STATE = 2137

# Load data

In [2]:
# Labelled training data; 'Id' is used as the index so it is not a feature column.
training_set = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')

In [3]:
# Competition test features, indexed by 'Id' in the same way as the training set.
x_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv", index_col='Id')
# Display (rows, columns) as a quick sanity check of the load.
x_test.shape

(1459, 79)

# Examine data

In [4]:
# Per-column dtype and non-null count: shows which columns are categorical
# (object dtype) and which contain missing values.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
# Separate the regression target from the predictors.
y = training_set['SalePrice']
x = training_set.drop(columns='SalePrice')
print(x.shape, y.shape)

(1460, 79) (1460,)


# Create pipeline for data processing

In [6]:
# One-hot encode every object-dtype column; all remaining columns pass through unchanged.
# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows
# instead of raising.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases — switch the keyword when the environment is upgraded.
categorical_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
categorical_columns = make_column_selector(dtype_include='object')

my_transformer = make_column_transformer(
    (categorical_encoder, categorical_columns),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Preprocessing chain: encode -> impute missing values -> standardise -> map to a
# normal distribution.
preprocessing_steps = [
    my_transformer,
    IterativeImputer(random_state=RANDOM_STATE),
    StandardScaler(),
    QuantileTransformer(output_distribution='normal', random_state=RANDOM_STATE),
]
pipe = make_pipeline(*preprocessing_steps)

## Transform training set

In [7]:
# Fit the preprocessing pipeline and replace `x` with the transformed feature matrix.
# The original 'Id' index is passed through so `x` stays label-aligned with `y`;
# without it the new frame would get a fresh 0..n-1 index while `y` keeps the Ids.
# NOTE(review): the pipeline is fitted on every training row before the train/CV
# split, so imputer/scaler statistics also see the CV rows — fit on the train split
# only if a leak-free validation estimate is needed.
# NOTE: this cell overwrites `x`; re-run the cell that defines `x` before re-running it.
x = pd.DataFrame(
    pipe.fit_transform(x),
    columns=my_transformer.get_feature_names_out(),
    index=x.index,
)

# Split into train and cross-validation

In [8]:
# Hold out 20% of the rows as a validation ("cv") set; the seed makes the split repeatable.
x_train, x_cv, y_train, y_cv = train_test_split(
    x, y, train_size=0.8, random_state=RANDOM_STATE
)

# Report the size of each partition (features, then target).
for features, target in ((x_train, y_train), (x_cv, y_cv)):
    print(features.shape, target.shape)

(1168, 304) (1168,)
(292, 304) (292,)


# Tune hyperparameters using Hyperopt

In [9]:
# Hyperopt search space for the XGBRegressor built in `objective`.
# Plain values (n_estimators, seed) are fixed; `hp.*` entries are sampled per trial.
space = {
    'gamma': hp.uniform('gamma', 0, 5),
    'eta': hp.uniform('eta', 0, .3),
    # XGBoost requires subsample in (0, 1]; the lower bound is kept away from 0
    # so a trial can never ask for an (almost) empty row sample.
    'subsample': hp.uniform('subsample', 0.1, 1),
    'n_estimators': 180,
    # quniform yields floats such as 5.0; `objective` casts them to int.
    'max_depth': hp.quniform('max_depth', 1, 20, 1),
    # `objective` reads this key, so it must be part of the space.
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),
    'seed': 0,
}

In [10]:
def objective(space):
    """Train one XGBRegressor for a sampled hyperparameter set and score it.

    Reads the notebook-level `x_train`, `y_train`, `x_cv` and `y_cv` frames.

    Parameters
    ----------
    space : dict
        One sample drawn from the hyperopt search space. Must contain 'eta',
        'n_estimators', 'max_depth', 'reg_lambda', 'subsample' and
        'colsample_bytree'. 'gamma', 'min_child_weight' and 'seed' are
        optional and fall back to 0, 1 and 0 respectively when absent.

    Returns
    -------
    dict
        'loss' (mean squared error on the CV split, minimised by hyperopt),
        'status' (always STATUS_OK) and 'model' (the fitted regressor).
    """
    model = XGBRegressor(
        eta=space['eta'],
        # gamma and seed were sampled by the search space but never reached the
        # model; pass them so every sampled dimension actually affects the loss.
        gamma=space.get('gamma', 0),
        n_estimators=int(space['n_estimators']),
        # quniform samples are floats; tree parameters must be integers.
        max_depth=int(space['max_depth']),
        reg_lambda=space['reg_lambda'],
        subsample=space['subsample'],
        # .get() avoids a KeyError when the space does not define this key.
        min_child_weight=int(space.get('min_child_weight', 1)),
        colsample_bytree=space['colsample_bytree'],
        random_state=int(space.get('seed', 0)),
        eval_metric="rmse",
        early_stopping_rounds=10
    )

    # The CV split is listed last, so it is the set early stopping monitors.
    evaluation = [(x_train, y_train), (x_cv, y_cv)]

    model.fit(
        x_train, y_train,
        eval_set=evaluation,
        verbose=False
    )

    pred = model.predict(x_cv)
    # Plain MSE is the default; the explicit `squared=True` keyword is dropped
    # because newer scikit-learn releases no longer accept it.
    mse = mean_squared_error(y_cv, pred)
    # NOTE(review): keeping 'model' in the result makes Trials retain every
    # fitted model — consider dropping it for very long searches.
    return {'loss': mse, 'status': STATUS_OK, 'model': model}

In [11]:
# Trials collects the result of every evaluation made by fmin so the search
# history can be inspected afterwards.
trials = Trials()

# The search itself is left disabled (5000 evaluations); un-comment to re-run it.
# NOTE(review): presumably the values hard-coded further down came from an
# earlier run of this line — confirm.
#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals=5000, trials = trials)

## Extract best parameters

In [12]:
#best_hyperparams

In [13]:
# Hyperparameters used to build the final model.
# NOTE(review): presumably copied from a previous hyperopt run (the fmin call
# above is commented out) — confirm before reusing them on different data.
n_estimators = 180
colsample_bytree = 0.4396092226617401
eta = 0.06258588022261978
max_depth = 5
min_child_weight = 4
reg_lambda = 6.675546531790164
subsample = 0.3974055570153105

# Create and train XGBRegressor model

In [14]:
# Final regressor, configured from the hyperparameter variables defined above.
final_params = dict(
    n_estimators=n_estimators,
    eta=eta,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    reg_lambda=reg_lambda,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    early_stopping_rounds=10,
)
model = XGBRegressor(**final_params)

# Both splits are monitored; the CV split is listed last, which is the entry
# XGBoost uses for early stopping.
eval_sets = [(x_train, y_train), (x_cv, y_cv)]
model.fit(x_train, y_train, eval_set=eval_sets, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.4396092226617401, early_stopping_rounds=10,
             enable_categorical=False, eta=0.06258588022261978,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.0625858828, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=4,
             missing=nan, monotone_constraints='()', n_estimators=180, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, ...)

## Measure model's performance

In [15]:
# Predict on both splits; a large gap between the two errors would point to over-fitting.
train_pred = model.predict(x_train)
cv_pred = model.predict(x_cv)

print(f'Train MSE: {mean_squared_error(y_train, train_pred)}')
print(f'CV MSE: {mean_squared_error(y_cv, cv_pred)}')

# `score` is the R^2 coefficient of determination on the CV split.
# (Label corrected from the misspelled "CGBoost".)
print(f'XGBoost score: {model.score(x_cv, y_cv)}')

Train MSE: 337133301.7677728
CV MSE: 399679983.9041567
CGBoost score: 0.9316756144019055


## Submission

In [16]:
# Apply the already-fitted pipeline to the test features (transform only, no refit).
# The 'Id' index is carried over so each transformed row can still be traced back
# to its house; without it the frame would get a fresh 0..n-1 index.
# NOTE: this cell overwrites `x_test`; reload the test CSV before re-running it.
x_test = pd.DataFrame(
    pipe.transform(x_test),
    columns=my_transformer.get_feature_names_out(),
    index=x_test.index,
)

In [17]:
# Start from the competition's sample file so the output has the expected
# columns and row order. The absolute path matches the other reads in this
# notebook and does not depend on the current working directory.
sample_submission_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
# Predictions are assigned by position.
# NOTE(review): assumes the sample file lists Ids in the same order as test.csv — confirm.
sample_submission_df['SalePrice'] = model.predict(x_test)
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df

Unnamed: 0,Id,SalePrice
0,1461,121837.703125
1,1462,153240.250000
2,1463,182725.703125
3,1464,191240.500000
4,1465,180076.921875
...,...,...
1454,2915,87550.421875
1455,2916,79391.390625
1456,2917,173675.906250
1457,2918,118293.210938
