In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import xgboost
from xgboost import XGBRegressor
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
X = pd.read_csv('./dataset-0510/train.csv', index_col='building_id')
X_test = pd.read_csv('./dataset-0510/test.csv', index_col='building_id')

y = X.total_price
X.drop(columns=['total_price'], inplace=True)

In [3]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

In [None]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

finally_step = ('model',
                XGBRegressor(
                    booster="dart",
                    colsample_bytree=0.4,
                    gamma=0,
                    learning_rate=0.07,
                    max_depth=3,
                    min_child_weight=1.5,
                    n_estimators=500,                                                                    
                    reg_alpha=0.75,
                    reg_lambda=0.45,
                    subsample=0.6,
                    seed=42,
                    objective="reg:squarederror",
                    tree_method='gpu_hist',
                    predictor='gpu_predictor',
                )
               )

pipeline = Pipeline(steps=[step1, step2, finally_step])

pipeline.fit(X_train, y_train)



## Train score

In [5]:
print(f"Train score: {pipeline.score(X_train, y_train)}")

Train score: 0.948978609518378


In [6]:
y_pred = pipeline.predict(X_train)
y_pred

array([1.7855214e+06, 2.5297476e+07, 4.8054390e+06, ..., 2.5968616e+08,
       3.2785322e+06, 6.2602040e+06], dtype=float32)

In [7]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_train, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
r2 = metrics.r2_score(y_train, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)
'''
Stand score

Train score: 0.9573181748061892
MSLE =  11238714.006184377
VARIANCE_SCORE =  0.9573181796966825
R2 0.9573181748061892
'''

MSLE =  12287719.064952325
VARIANCE_SCORE =  0.9489792839649881
R2 0.9489786095183781


'\nStand score\n\nTrain score: 0.9573181748061892\nMSLE =  11238714.006184377\nVARIANCE_SCORE =  0.9573181796966825\nR2 0.9573181748061892\n'

## Test score

In [8]:
print(f"Test score: {pipeline.score(X_eval, y_eval)}")

Test score: 0.9351397733741645


In [9]:
y_pred = pipeline.predict(X_eval)
y_pred

array([ 5809957.5,  5579895. , 14410850. , ...,  3826446.8, 45519070. ,
       17299888. ], dtype=float32)

In [10]:
#RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_eval, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_eval, y_pred))
r2 = metrics.r2_score(y_eval, y_pred)

print("MSLE = ", MSLE)
#print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)

'''
Stand score

Test score: 0.6710864790042561
MSLE =  32747921.6326721
VARIANCE_SCORE =  0.6711062838835371
R2 0.6710864790042561
'''

MSLE =  14542266.857868683
VARIANCE_SCORE =  0.9351437781455703
R2 0.9351397733741645


'\nStand score\n\nTest score: 0.6710864790042561\nMSLE =  32747921.6326721\nVARIANCE_SCORE =  0.6711062838835371\nR2 0.6710864790042561\n'

In [98]:
# save test predictions to file
predictions = pipeline.predict(X_test)
output = pd.DataFrame({'building_id': X_test.index, 'total_price': predictions})
output.to_csv('submission/XGB_predict.csv', index=False)

In [None]:
output