In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

#--- Generative Ensemble Learning Algorithms for House Price Prediction
# Bootstrap aggregation (bagging)
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

# cross-validation
from sklearn.model_selection import GridSearchCV

# Metric
from sklearn.metrics import r2_score                         #內部測試使用
from sklearn.metrics import mean_absolute_percentage_error   #比賽使用

In [None]:
# ------------------------------------------------------------------------
# Read the pre-cleaned dataset CSV into the working frame `df`.
# ------------------------------------------------------------------------
DATASET_CSV = '/content/clean_dataset_student_convenient.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:
# Keep the column labels of the loaded frame and print them as a quick schema check.
column_names = df.columns
print(column_names)

Index(['county', 'lon', 'lat', 'house_age', 'residence_housing',
       'congregate_housing', 'commercial_use', 'industrial_use', 'apartment',
       'building_low', 'building_high', 'RB', 'RC', 'SC', 'land_area',
       'building_area', 'main_building_area', 'balcony_area', 'auxiliary_area',
       'floor', 'total_floor', 'parking_area', 'parking_number', 'unit_price',
       'elementary', 'total_elem', 'd_elementary', 'junior_high',
       'total_junior', 'd_junior', 'high_scl', 'total_hight', 'd_high',
       'univr_scl', 'total_univr', 'd_univr', 'd_bike',
       'bikedistance_less_than_500m', 'd_bus', 'busdistance_less_than_500m',
       'd_tmrt', 'mrtdistance_less_than_500m', 'd_train',
       'traindistance_less_than_500m', 'convenient'],
      dtype='object')


In [None]:
# Remove the columns that are not fed to the models.
# NOTE: this rebinds `df`, so re-running the cell raises KeyError because
# the columns are already gone.
unused_columns = ['county', 'elementary', 'junior_high', 'high_scl', 'univr_scl']
df = df.drop(columns=unused_columns)

In [None]:
# Show the working frame (notebook rich repr) to inspect the remaining columns.
df

Unnamed: 0,lon,lat,house_age,residence_housing,congregate_housing,commercial_use,industrial_use,apartment,building_low,building_high,...,d_univr,d_bike,bikedistance_less_than_500m,d_bus,busdistance_less_than_500m,d_tmrt,mrtdistance_less_than_500m,d_train,traindistance_less_than_500m,convenient
0,121.547608,25.022469,32.583333,1,0,0,0,0,0,1,...,0.2498,0.086448,True,0.187799,True,0.565353,False,4.107494,False,False
1,121.502124,25.019127,24.166667,1,0,0,0,0,0,1,...,2.1874,0.143728,True,0.215006,True,1.466092,False,1.612514,False,False
2,120.365799,22.640966,6.166667,0,1,0,0,0,0,1,...,2.0588,0.068840,True,0.253504,True,1.771358,False,1.373044,False,False
3,121.462402,25.058663,8.833333,0,1,0,0,0,0,1,...,4.0438,0.164167,True,0.168638,True,0.406298,True,4.697392,False,True
4,121.469444,25.023585,11.000000,1,0,0,0,0,0,1,...,0.5154,0.089814,True,0.086150,True,0.144303,True,1.155269,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9729,120.310847,22.684429,1.333333,0,1,0,0,0,0,1,...,1.7820,0.044899,True,0.073679,True,0.482721,True,0.541974,False,True
9730,121.446439,25.054289,17.416667,1,0,0,0,0,1,0,...,2.6599,0.232913,True,0.073057,True,0.556869,False,4.727508,False,False
9731,121.464418,25.085886,18.083333,1,0,0,0,0,1,0,...,4.8854,0.238156,True,0.095856,True,0.627168,False,6.832858,False,False
9732,121.375563,25.083086,12.833333,0,1,0,0,0,0,1,...,2.2494,0.181346,True,0.091590,True,2.397203,False,11.205829,False,False


In [None]:
#------------------------------------------------------------------------
# Data Split: training & test data
#------------------------------------------------------------------------
# Feature columns containing missing values are dropped BEFORE the split.
# Filtering `X` after train_test_split would leave X_train / X_test
# untouched, because they are separate frames created by the split.
X = df.drop(['unit_price'], axis=1).dropna(axis=1)  # features
y = df['unit_price']                                 # labels (target variable)

# Hold out 5% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=2023
)


In [None]:
# Drop feature columns that contain missing values.
X = X.dropna(axis=1)

# X_train / X_test were created from the unfiltered X by the earlier split,
# so rebinding X alone does not affect them. Apply the same column selection
# to both frames (a no-op when no column was dropped).
X_train = X_train[X.columns]
X_test = X_test[X.columns]

In [None]:
#=============================================================================
# Random Forest Regressor
#=============================================================================
# Grid Search - Exhaustive search over specified parameter values for an estimator.
# Candidates are compared with 10-fold cross-validation on the training split,
# scored by negated MAPE (the official metric).

print('Cross-valation and Grid search for best parameters combination...')
print('--- Random Forest Regressor ---')
gs_params = {
    'n_estimators': [100, 300, 500],      # default=100
    'max_depth': [None, 10, 20, 30],      # default=None
    'min_samples_split': [2, 5, 10],      # default=2
    'min_samples_leaf': [1, 2, 4],        # default=1
    'max_features': [1.0, 0.8, 'sqrt']    # default=1.0
}

gscv = GridSearchCV(
    RandomForestRegressor(random_state=2023),
    param_grid=gs_params,
    cv=10,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    verbose=2,
)
gscv.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on the whole training
# set by default (refit=True), so the fitted winner is reused instead of
# training an identical forest a second time.
best_rf_params = gscv.best_params_
model = gscv.best_estimator_

# Prediction and evaluation, by MAPE (official metric) and R2 (internal metric)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('--- Random Forest ---')
print('最佳參數組合:{0}'.format(gscv.best_params_))
# gscv.best_score_ holds the mean cross-validated score of the best candidate
# (negated MAPE here); it is intentionally not printed.
print('1. R2 Score(testing)：{0:.4f}'.format(r2))
print('2. MAPE(testing)：{0:.4f}'.format(mape))



Cross-valation and Grid search for best parameters combination...
--- Random Forest Regressor ---
Fitting 10 folds for each of 324 candidates, totalling 3240 fits


In [None]:
# Display the best Random Forest parameter combination found by the grid search.
best_rf_params

In [None]:
#=============================================================================
# XGBoost
#=============================================================================
print('Cross-valation and Grid search for best parameters combination...')
print('--- XGB Regressor---')

# Search grid for XGBoost. An earlier, smaller grid over eta / max_depth /
# subsample was replaced by this one.
# 'lambda' and 'alpha' are the native booster names for L2 / L1
# regularisation (the scikit-learn wrapper also exposes them as
# reg_lambda / reg_alpha).
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5],
    'alpha': [0, 0.1]
}

# Settings shared by every candidate and by the final model. The search
# model and the final model previously used different eval_metric values
# ('mae' vs 'mape'); both now use 'mape', matching the official metric.
xgb_base_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mape',
    'random_state': 2023,
}

# Pass the grid defined in THIS cell. The previous code passed `gs_params`,
# which here only exists inside a string literal, so on a fresh run the
# search silently used the Random Forest grid left over from the cell above.
gscv = GridSearchCV(
    xgb.XGBRegressor(**xgb_base_params),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    verbose=2,
)
gscv.fit(X_train, y_train)

# GridSearchCV refits the best combination on the whole training set by
# default (refit=True), so the fitted winner is reused directly.
best_xgb_params = gscv.best_params_
best_rgb_params = best_xgb_params  # original (misspelled) name kept for compatibility
model = gscv.best_estimator_

# Prediction and evaluation, by MAPE (official metric) and R2 (internal metric)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print('--- XGB Boost ---')
print('最佳參數組合:{0}'.format(gscv.best_params_))
# gscv.best_score_ holds the mean cross-validated score of the best candidate
# (negated MAPE here); it is intentionally not printed.
print('1. R2 Score(testing)：{0:.4f}'.format(r2))
print('2. MAPE(testing)：{0:.4f}'.format(mape))


Fitting 10 folds for each of 27 candidates, totalling 270 fits
最佳參數組合:{'eta': 0.25, 'max_depth': 8, 'subsample': 1}
--- XGB Boost ---
1. R2 Score(testing)：0.9131
2. MAPE(testing)：0.1035


In [None]:
#=============================================================================
# ExtraTrees Regressor
#=============================================================================
print('Cross-valation and Grid search for best parameters combination...')
print('--- ExtraTrees Regressor---')
# Grid Search - Exhaustive search over specified parameter values for an estimator.
# bootstrap is fixed to True because max_samples only applies to bootstrapped trees.
gs_params = {'n_estimators': [100, 200, 300],  # default=100
             'max_samples': [0.5, 0.8, 0.99],  # default=None (0.99 used as a stand-in)
             'max_features': [0.5, 0.8, 1.0],  # default=1.0
             'bootstrap': [True]
             }

# Candidates are selected by negated MAPE, the same criterion as the other
# model searches in this notebook and the metric reported below. The search
# previously optimised MAE, so the chosen parameters were tuned for a
# different objective than the one being evaluated.
gscv = GridSearchCV(
    ExtraTreesRegressor(random_state=2023),
    param_grid=gs_params,
    cv=10,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    verbose=2,
)
gscv.fit(X_train, y_train)

# GridSearchCV refits the best combination on the whole training set by
# default (refit=True), so the fitted winner is reused directly.
best_et_params = gscv.best_params_
model = gscv.best_estimator_

# Prediction and evaluation, by MAPE (official metric) and R2 (internal metric)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print('--- ExtraTrees---')
print('最佳參數組合:{0}'.format(gscv.best_params_))
# gscv.best_score_ holds the mean cross-validated score of the best candidate
# (negated MAPE here); it is intentionally not printed.
print('1. R2 Score(testing)：{0:.4f}'.format(r2))
print('2. MAPE(testing)：{0:.4f}'.format(mape))


Cross-valation and Grid search for best parameters combination...
--- ExtraTrees Regressor---
Fitting 10 folds for each of 27 candidates, totalling 270 fits
--- ExtraTrees---
最佳參數組合:{'bootstrap': True, 'max_features': 1.0, 'max_samples': 0.99, 'n_estimators': 300}
1. R2 Score(testing)：0.9149
2. MAPE(testing)：0.0984
