In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
from xgboost import XGBRegressor
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [31]:
X = pd.read_csv('./dataset-0510/train.csv', index_col='building_id')
X_test = pd.read_csv('./dataset-0510/test.csv', index_col='building_id')

y = X.total_price
X.drop(columns=['total_price'], inplace=True)

In [32]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

In [33]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', SimpleImputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

finally_step = ('model',
                XGBRegressor(
                    subsample=0.6,
                    reg_lambda=1e-5,
                    reg_alpha=0.1,
                    n_estimators=100000,
                    min_child_weight=6,
                    max_depth=9,
                    learning_rate=0.07,
                    gamma=0.0,
                    colsample_bytree=0.7,
                    tree_method='gpu_hist',
                    predictor='gpu_predictor',
                )
               )
parameters_for_testing = {
    #'model__colsample_bytree':[0.6,0.7,0.8, 0.9],
    #'model__gamma':[0, 0.1, 0.2, 0.3, 0.4],
    #'model__min_child_weight':[1.5,6,10],
    #'model__learning_rate':[0.0001, 0.001, 0.01, 0.1],
    #'model__max_depth':[3, 6, 9],
    #'model__n_estimators':[10000, 13000, 15000],
    'model__reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
    #'model__reg_lambda':[0, 1e-5, 1e-2, 0.45, 1],
    #'model__subsample':[0.6, 0.7, 0.8, 0.9]
}

pipeline = Pipeline(steps=[step1, step2, step3,finally_step])

grid_search = GridSearchCV(pipeline,
                                  parameters_for_testing)




## Tuning

### gamma
[0, 0.1, 0.2, 0.3, 0.4] -> 0

### subsample & colsample bytree
{'model__colsample_bytree': 0.7, 'model__subsample': 0.6}

### reg_alpha
best params
{'model__reg_alpha': 0.1}
best score
0.7540034509599658

In [27]:
grid_search.fit(X_train, y_train)






GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('Imputer',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('MinMaxScaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('FeatureSelection',
                                        VarianceThreshold(threshold=0.0)),
                                       ('model',
                                        XGB

In [29]:
print('result')
print(grid_search.cv_results_)

print('best params')
print (grid_search.best_params_)

print('best score')
print (grid_search.best_score_)



result
{'mean_fit_time': array([6.07896821, 6.06961028, 6.05445385, 6.05004676, 6.05188918]), 'std_fit_time': array([0.30693766, 0.29511653, 0.29249228, 0.29546132, 0.29478944]), 'mean_score_time': array([0.13918686, 0.13923542, 0.13780228, 0.13799063, 0.14084395]), 'std_score_time': array([0.00029867, 0.0006985 , 0.00100802, 0.00103989, 0.00142107]), 'param_model__reg_alpha': masked_array(data=[0, 0.001, 0.005, 0.01, 0.05],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'model__reg_alpha': 0}, {'model__reg_alpha': 0.001}, {'model__reg_alpha': 0.005}, {'model__reg_alpha': 0.01}, {'model__reg_alpha': 0.05}], 'split0_test_score': array([0.76197589, 0.76197589, 0.76197589, 0.76197589, 0.76197589]), 'split1_test_score': array([0.77929524, 0.77929524, 0.77929524, 0.77929524, 0.77929524]), 'split2_test_score': array([0.7207392, 0.7207392, 0.7207392, 0.7207392, 0.7207392]), 'mean_test_score': array([0.75400344, 0.75400344, 

In [None]:
pipeline.fit(X_train, y_train)



## Train score

In [35]:
print(f"Train score: {pipeline.score(X_train, y_train)}")

Train score: 0.9999995374980827


In [36]:
y_pred = pipeline.predict(X_train)
y_pred

array([1.7951796e+06, 1.7418298e+07, 2.2213380e+06, ..., 3.1425210e+08,
       2.5333602e+06, 5.7807535e+06], dtype=float32)

In [37]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_train, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
r2 = metrics.r2_score(y_train, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)
'''
0.03 lr
5w n_estimators
5080.7856385352
Train score: 0.9999994416836466
MSLE =  46193.834159808786
VARIANCE_SCORE =  0.9999992789294592
R2 0.9999992789294392
'''
'''
10w estimator, lr 0.07
MSLE =  36947.36986818785
VARIANCE_SCORE =  0.999999538707256
R2 0.9999995387070688
'''
'''
    subsample=0.6,
    reg_lambda=1e-5,
    reg_alpha=0.1,
    n_estimators=100000,
    min_child_weight=6,
    max_depth=9,
    learning_rate=0.07,
    gamma=0.0,
    colsample_bytree=0.7,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    
    MSLE =  36995.755203011766
    VARIANCE_SCORE =  0.9999995374982386
    R2 0.9999995374980827
'''

MSLE =  36995.755203011766
VARIANCE_SCORE =  0.9999995374982386
R2 0.9999995374980827


'\n10w estimator, lr 0.1\nMSLE =  36947.36986818785\nVARIANCE_SCORE =  0.999999538707256\nR2 0.9999995387070688\n\n'

## Test score

In [38]:
print(f"Test score: {pipeline.score(X_eval, y_eval)}")

Test score: 0.9999999864925494


In [39]:
y_pred = pipeline.predict(X_eval)
y_pred

array([ 7003370.5,  3902604.2,  9034932. , ...,  4308731.5, 51746890. ,
       19445522. ], dtype=float32)

In [40]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_eval, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_eval, y_pred))
r2 = metrics.r2_score(y_eval, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)

'''
0.03 lr
5080.7856385352
Test score: 0.9999999077079832
MSLE =  27300.38882949357
VARIANCE_SCORE =  0.9999997714168822
R2 0.9999997714128004
'''
'''
10w estimator, lr 0.07
MSLE =  6426.133544374753
VARIANCE_SCORE =  0.9999999873348979
R2 0.9999999873347509
'''
'''
    subsample=0.6,
    reg_lambda=1e-5,
    reg_alpha=0.1,
    n_estimators=100000,
    min_child_weight=6,
    max_depth=9,
    learning_rate=0.07,
    gamma=0.0,
    colsample_bytree=0.7,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    
    MSLE =  6636.354426968303
    VARIANCE_SCORE =  0.9999999864928084
    R2 0.9999999864925494
'''

MSLE =  6636.354426968303
VARIANCE_SCORE =  0.9999999864928084
R2 0.9999999864925494


'\n10w estimator, lr 0.07\nMSLE =  6426.133544374753\nVARIANCE_SCORE =  0.9999999873348979\nR2 0.9999999873347509\n'

In [43]:
# save test predictions to file
predictions = pipeline.predict(X_test)
output = pd.DataFrame({'building_id': X_test.index, 'total_price': predictions})
output.to_csv('submission/XGB_predict.csv', index=False)

In [44]:
output

Unnamed: 0,building_id,total_price
0,X5gsdTWGS3W7JJQB,1.825720e+07
1,BTshNOJyKHnT2YIT,3.900890e+06
2,dhdymr0lV8N5kZOT,8.137778e+06
3,VEwyGGMcD56w5BOc,6.560984e+06
4,wmUeMoJZfsqaSX9b,8.243812e+05
5,EtBjGAHmHCe9t7TZ,2.755445e+06
6,hPNH34vmaZtvBtqc,1.205502e+07
7,wXjeI38bYDMJJwZC,5.858460e+06
8,fxZSGX6aPAFKU8W4,1.677364e+06
9,ewr0Fx6ign87OwaV,4.263161e+06
