In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import xgboost
from xgboost import XGBRegressor
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
X = pd.read_csv('./dataset-0510/train.csv', index_col='building_id')
X_test = pd.read_csv('./dataset-0510/test.csv', index_col='building_id')

y = X.total_price
X.drop(columns=['total_price'], inplace=True)

In [3]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

In [8]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

finally_step = ('model',
                XGBRegressor(
                    subsample=0.6,
                    reg_lambda=0.45,
                    reg_alpha=0,
                    n_estimators=3000,
                    min_child_weight=10,
                    max_depth=9,
                    learning_rate=0.01,
                    gamma=0.03,
                    colsample_bytree=0.8,
                    tree_method='gpu_hist',
                    predictor='gpu_predictor',
                )
               )
parameters_for_testing = {
    'model__colsample_bytree':[0.4,0.6,0.8, 1],
    'model__gamma':[0, 0.1, 0.3, 0.5, 1],
    'model__min_child_weight':[1.5,6,10],
    'model__learning_rate':[0.0001, 0.001, 0.01, 0.1],
    'model__max_depth':[3,6,9, 12],
    'model__n_estimators':[1000, 3000, 5000, 7000, 10000],
    'model__reg_alpha':[0, 1e-5, 1e-2,  0.75, 1],
    'model__reg_lambda':[0, 1e-5, 1e-2, 0.45, 1],
    'model__subsample':[0.6,0.95]
}

pipeline = Pipeline(steps=[step1, step2, finally_step])

random_search = RandomizedSearchCV(pipeline,
                                  parameters_for_testing,
                                  cv=2)





In [9]:
random_search.fit(X_train, y_train)
























































































RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('Imputer',
                                              Imputer(axis=0, copy=True,
                                                      missing_values='NaN',
                                                      strategy='mean',
                                                      verbose=0)),
                                             ('MinMaxScaler',
                                              MinMaxScaler(copy=True,
                                                           feature_range=(0,
                                                                          1))),
                                             ('model',
                                              XGBRegressor(base_score=0.5,
                                                           booster='gbtree',
                                                   

In [10]:
print('best params')
print (random_search.best_params_)

print('best score')
print (random_search.best_score_)

best params
{'model__subsample': 0.6, 'model__reg_lambda': 0.45, 'model__reg_alpha': 0, 'model__n_estimators': 3000, 'model__min_child_weight': 10, 'model__max_depth': 9, 'model__learning_rate': 0.01, 'model__gamma': 0.3, 'model__colsample_bytree': 0.8}
best score
0.7313087747543957


In [11]:
pipeline.fit(X, y)



Pipeline(memory=None,
         steps=[('Imputer',
                 Imputer(axis=0, copy=True, missing_values='NaN',
                         strategy='mean', verbose=0)),
                ('MinMaxScaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('model',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=0.6, gamma=0.03,
                              importance_type='gain', learning_rate=0.03,
                              max_delta_step=0, max_depth=9, min_child_weight=6,
                              missing=None, n_estimators=10000, n_jobs=1,
                              nthread=None, objective='reg:linear',
                              predictor='gpu_predictor', random_state=0,
                              reg_alpha=0.01, reg_lambda=1e-05,
                              scale_pos_weight=1, seed=None, silent=None,
               

## Train score

In [12]:
print(f"Train score: {pipeline.score(X_train, y_train)}")

Train score: 0.9999591014754754


In [13]:
y_pred = pipeline.predict(X_train)
y_pred

array([2.1412750e+06, 1.7718562e+07, 2.3313475e+06, ..., 3.1442176e+08,
       2.1774552e+06, 5.7069630e+06], dtype=float32)

In [14]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_train, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
r2 = metrics.r2_score(y_train, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)
'''
Stand score

Train score: 0.9573181748061892
MSLE =  11238714.006184377
VARIANCE_SCORE =  0.9573181796966825
R2 0.9573181748061892
'''

MSLE =  347895.6100608243
VARIANCE_SCORE =  0.9999591014925479
R2 0.9999591014754754


'\nStand score\n\nTrain score: 0.9573181748061892\nMSLE =  11238714.006184377\nVARIANCE_SCORE =  0.9573181796966825\nR2 0.9573181748061892\n'

## Test score

In [15]:
print(f"Test score: {pipeline.score(X_eval, y_eval)}")

Test score: 0.9999660397815804


In [16]:
y_pred = pipeline.predict(X_eval)
y_pred

array([ 6840799.5,  4062207.5,  8703710. , ...,  4900668. , 51919900. ,
       19185432. ], dtype=float32)

In [17]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_eval, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_eval, y_pred))
r2 = metrics.r2_score(y_eval, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)

'''
Stand score

Test score: 0.6710864790042561
MSLE =  32747921.6326721
VARIANCE_SCORE =  0.6711062838835371
R2 0.6710864790042561
'''

MSLE =  332757.7019821857
VARIANCE_SCORE =  0.9999660399345396
R2 0.9999660397815804


'\nStand score\n\nTest score: 0.6710864790042561\nMSLE =  32747921.6326721\nVARIANCE_SCORE =  0.6711062838835371\nR2 0.6710864790042561\n'

In [18]:
# save test predictions to file
predictions = pipeline.predict(X_test)
output = pd.DataFrame({'building_id': X_test.index, 'total_price': predictions})
output.to_csv('submission/XGB_predict.csv', index=False)

In [19]:
output

Unnamed: 0,building_id,total_price
0,X5gsdTWGS3W7JJQB,1.604546e+07
1,BTshNOJyKHnT2YIT,4.058191e+06
2,dhdymr0lV8N5kZOT,8.852710e+06
3,VEwyGGMcD56w5BOc,4.831482e+06
4,wmUeMoJZfsqaSX9b,1.322928e+06
5,EtBjGAHmHCe9t7TZ,2.974070e+06
6,hPNH34vmaZtvBtqc,1.223555e+07
7,wXjeI38bYDMJJwZC,5.905666e+06
8,fxZSGX6aPAFKU8W4,1.240651e+06
9,ewr0Fx6ign87OwaV,4.419357e+06
