In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import seaborn as sns
import math
from warnings import simplefilter

In [21]:
X = pd.read_csv('./dataset-0510/train.csv', index_col='building_id')
X_test = pd.read_csv('./dataset-0510/test.csv', index_col='building_id')

y = X.total_price
X.drop(columns=['total_price'], inplace=True)

In [22]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

## Pipeline RF

In [24]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

finally_step = ('model', RandomForestRegressor(n_estimators=200))

pipeline = Pipeline(steps=[step1, step2, step3, finally_step])

grid = GridSearchCV(pipeline,  
                    param_grid={'model__n_estimators': [200, 300, 400, 500],
                               },
                    cv = 3,
                    scoring = 'neg_mean_absolute_error')
pipeline.fit(X, y)



Pipeline(memory=None,
         steps=[('Imputer',
                 Imputer(axis=0, copy=True, missing_values='NaN',
                         strategy='mean', verbose=0)),
                ('MinMaxScaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('FeatureSelection', VarianceThreshold(threshold=0.0)),
                ('model',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=200, n_jobs=None,
                                       oob_score=False, random_state=None,
                  

In [33]:
#print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {pipeline.score(X_eval, y_eval)}")

# 4500 score is scoring -0.9675898812079498
# 4.1k clip train and test got -0.8714843518790356

Best score: 0.6607899535775293


In [32]:
y_pred = pipeline.predict(X_eval)
RMSLE = metrics.mean_squared_log_error(y_eval ,y_pred)
VARIANCE_SCORE = metrics.explained_variance_score(y_eval, y_pred)
MSLE = np.sqrt(metrics.mean_squared_error(y_eval, y_pred))
r2 = metrics.r2_score(y_eval, y_pred)

print("MSLE = ", MSLE)
print("RMSLE = ", RMSLE)
print("VARIANCE_SCORE = ", VARIANCE_SCORE)
print("R2", r2)

'''
4.5k score scoring 
MSLE =  9941902.731254091
RMSLE =  0.013472120053191117
VARIANCE_SCORE =  0.9675908731095261
R2 0.9675898812079498
'''
'''
4.1k clip train and test got 
MSLE =  19797356.503786836
RMSLE =  0.03340000158427895
VARIANCE_SCORE =  0.8714853550598386
R2 0.8714843518790355
'''

MSLE =  33256552.925698794
RMSLE =  0.07592768359880137
VARIANCE_SCORE =  0.6607901577617321
R2 0.6607899535775293


'\nclip train and test got \nMSLE =  19797356.503786836\nRMSLE =  0.03340000158427895\nVARIANCE_SCORE =  0.8714853550598386\nR2 0.8714843518790355\n'

In [30]:
# save test predictions to file
predictions = pipeline.predict(X_test)
output = pd.DataFrame({'building_id': X_test.index, 'total_price': predictions})
output.to_csv('submission/RF_predict.csv', index=False)

## Result

model score : RF > XGBoost
feature selection : VarianceThreshold > featureSelection