In [66]:
# 載入需要的套件
import os
import numpy as np 
import pandas as pd
import copy
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, Imputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
%matplotlib inline

In [78]:
# 設定 data_path
dir_data = './data/'
Train = os.path.join(dir_data, 'train.csv')
Test = os.path.join(dir_data, 'test.csv')

# 讀取檔案
Train_data = pd.read_csv(Train)
Test_data = pd.read_csv(Test)

In [79]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
    
def area_type(row):
    if row >= 0:
        return 1
    else:
        return 0

In [80]:
train_Y = np.log1p(Train_data['total_price'])
ids = Test_data['building_id']

tp = copy.deepcopy(np.log1p(Train_data['total_price']))

Train_data = Train_data.drop(['building_id', 'total_price'] , axis=1)
Test_data = Test_data.drop(['building_id'] , axis=1)

#df = pd.concat([Train_data,Test_data])

#df.head()

In [81]:
na_check(Train_data)

Unnamed: 0,Missing Ratio
parking_area,94.828333
parking_price,76.775
txn_floor,26.503333
village_income_median,1.903333


In [82]:
na_check(Test_data)

Unnamed: 0,Missing Ratio
parking_area,95.0
parking_price,77.1
txn_floor,26.39
village_income_median,1.84


In [83]:
Train_data.loc[:, "parking_price"] = Train_data["parking_price"].clip(0.1, 300000)
Test_data.loc[:, "parking_price"] = Test_data["parking_price"].clip(0.1, 300000)

#train_num = train_Y.shape[0]
#train_X = df[:train_num]

#plt.scatter(train_X["parking_price"], train_Y)

In [84]:
parking_price_train = Train_data['parking_price']
parking_area_train = Train_data['parking_area']

Train_data = Train_data.drop(['parking_price'] , axis=1)
#df = df.drop(['parking_area'] , axis=1)

parking_price_test = Test_data['parking_price']
parking_area_test = Test_data['parking_area']

Test_data = Test_data.drop(['parking_price'] , axis=1)
#df = df.drop(['parking_area'] , axis=1)

Train_data = Train_data.fillna(Train_data.mean())
Train_data['parking_price'] = parking_price_train

Test_data = Test_data.fillna(Test_data.mean())
Test_data['parking_price'] = parking_price_test

In [85]:
park_not_null = Train_data[Train_data['parking_price'].notnull()]
park_is_null = Train_data[Train_data['parking_price'].isnull()].drop(['parking_price'] , axis=1)

park_X = park_not_null.drop(['parking_price'] , axis=1)
park_Y = np.log1p(park_not_null['parking_price'])

In [86]:
XGB = xgb.XGBRegressor()
XGB.fit(park_X, park_Y)

park_pred = XGB.predict(park_is_null)
Train_data.loc[Train_data['parking_price'].notnull(), 'parking_price'] = np.log1p(park_not_null['parking_price'])
Train_data.loc[Train_data['parking_price'].isnull(), 'parking_price'] = park_pred

Train_data.head()



Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,txn_floor,...,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,parking_price
0,8,21,18674,4,3,2,6271,2,7.211837,3.0,...,58,1,157,1,2483,1,6011,1,34.469803,6.029283
1,8,7,18800,5,1,2,7885,2,7.211837,5.0,...,28,1,115,1,15872,1,32221,1,40.073573,10.448876
2,8,7,19289,4,1,2,6028,2,7.211837,1.0,...,78,1,212,1,15760,1,32228,1,54.462081,10.424281
3,8,21,20385,24,0,2,18325,0,7.211837,13.0,...,20,1,125,1,2568,1,7271,1,99.628966,11.30393
4,1,21,20657,2,4,2,6880,2,7.211837,5.389269,...,18,1,47,1,2587,1,7442,1,124.131236,8.312435


In [87]:
park_not_null = Test_data[Test_data['parking_price'].notnull()]
park_is_null = Test_data[Test_data['parking_price'].isnull()].drop(['parking_price'] , axis=1)

park_X = park_not_null.drop(['parking_price'] , axis=1)
park_Y = np.log1p(park_not_null['parking_price'])

XGB = xgb.XGBRegressor()
XGB.fit(park_X, park_Y)

park_pred = XGB.predict(park_is_null)
Test_data.loc[Test_data['parking_price'].notnull(), 'parking_price'] = np.log1p(park_not_null['parking_price'])
Test_data.loc[Test_data['parking_price'].isnull(), 'parking_price'] = park_pred

Test_data.head()



Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,txn_floor,...,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,parking_price
0,1,13,18634,2,4,2,1857,2,6.645918,5.421682,...,21,1,129,1,3991,1,8409,1,105.462231,10.819075
1,8,7,20199,10,3,2,16011,2,6.645918,2.0,...,44,1,67,1,1950,1,10725,1,100.819809,9.821386
2,8,12,19769,4,4,10,18294,2,6.645918,5.421682,...,3,1,35,1,3707,1,19459,1,259.800543,9.554942
3,10,7,20479,19,2,0,17837,2,6.645918,16.0,...,60,1,144,1,6487,1,29400,1,22.941906,10.759605
4,8,3,18164,13,0,2,13272,2,6.645918,3.0,...,18,1,76,1,1346,1,3280,1,181.213095,9.132716


In [88]:
Train_data['parking_area'] = parking_area_train
Train_data.loc[:, "parking_area"] = Train_data["parking_area"].clip(0.1, 50)

Test_data['parking_area'] = parking_area_test
Test_data.loc[:, "parking_area"] = Test_data["parking_area"].clip(0.1, 50)

train_num = train_Y.shape[0]
#train_X = df[:train_num]

#plt.scatter(train_X["parking_area"], train_Y)

In [89]:
park_area_not_null = Train_data[Train_data['parking_area'].notnull()]
park_area_is_null = Train_data[Train_data['parking_area'].isnull()].drop(['parking_area'] , axis=1)

park_area_X = park_area_not_null.drop(['parking_area'] , axis=1)
park_area_Y = park_area_not_null['parking_area']

In [90]:
XGBR = xgb.XGBRegressor()
XGBR.fit(park_area_X, park_area_Y)

park_area_pred = XGBR.predict(park_area_is_null)

Train_data.loc[Train_data['parking_area'].isnull(), 'parking_area'] = park_area_pred

Train_data.head()



Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,txn_floor,...,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,parking_price
0,8,21,18674,4,3,2,6271,2,1.869074,3.0,...,58,1,157,1,2483,1,6011,1,34.469803,6.029283
1,8,7,18800,5,1,2,7885,2,3.679982,5.0,...,28,1,115,1,15872,1,32221,1,40.073573,10.448876
2,8,7,19289,4,1,2,6028,2,3.090124,1.0,...,78,1,212,1,15760,1,32228,1,54.462081,10.424281
3,8,21,20385,24,0,2,18325,0,6.156056,13.0,...,20,1,125,1,2568,1,7271,1,99.628966,11.30393
4,1,21,20657,2,4,2,6880,2,1.458056,5.389269,...,18,1,47,1,2587,1,7442,1,124.131236,8.312435


In [91]:
park_area_not_null = Test_data[Test_data['parking_area'].notnull()]
park_area_is_null = Test_data[Test_data['parking_area'].isnull()].drop(['parking_area'] , axis=1)

park_area_X = park_area_not_null.drop(['parking_area'] , axis=1)
park_area_Y = park_area_not_null['parking_area']

XGBR = xgb.XGBRegressor()
XGBR.fit(park_area_X, park_area_Y)

park_area_pred = XGBR.predict(park_area_is_null)

Test_data.loc[Test_data['parking_area'].isnull(), 'parking_area'] = park_area_pred

Test_data.head()



Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,txn_floor,...,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,parking_price
0,1,13,18634,2,4,2,1857,2,2.360311,5.421682,...,21,1,129,1,3991,1,8409,1,105.462231,10.819075
1,8,7,20199,10,3,2,16011,2,3.324365,2.0,...,44,1,67,1,1950,1,10725,1,100.819809,9.821386
2,8,12,19769,4,4,10,18294,2,1.661235,5.421682,...,3,1,35,1,3707,1,19459,1,259.800543,9.554942
3,10,7,20479,19,2,0,17837,2,2.694642,16.0,...,60,1,144,1,6487,1,29400,1,22.941906,10.759605
4,8,3,18164,13,0,2,13272,2,3.964656,3.0,...,18,1,76,1,1346,1,3280,1,181.213095,9.132716


In [13]:
temp_df = copy.deepcopy(df)

In [92]:
Train_data = Train_data.fillna(Train_data.mean())

Train_data["txn_dt"] = Train_data["txn_dt"]/100
Train_data["building_complete_dt"] = Train_data["building_complete_dt"]/100

Train_data["village_income_median"] = np.log1p(Train_data["village_income_median"])
Train_data["building_area"] = Train_data["building_area"].clip(1, 50)
Train_data["land_area"] = Train_data["land_area"].clip(1, 50)
Train_data["town_area"] = Train_data["town_area"].clip(1, 150)

Train_data["per_building_area"] = Train_data["building_area"] / Train_data["total_floor"]
Train_data["total_income"] = np.log1p(Train_data["village_income_median"] * Train_data["town_population"])
Train_data["area_percent"] = Train_data["land_area"] / Train_data["town_area"]
Train_data["per_parking_price"] = np.log1p(np.expm1(Train_data["parking_price"]) / Train_data["parking_area"])
Train_data["education_rate"] = Train_data["jobschool_rate"] + Train_data["highschool_rate"]

Train_data["area_diff"] = Train_data["land_area"] - Train_data["building_area"]
Train_data['area_type'] = Train_data['area_diff'].apply(area_type)
#Train_data = Train_data.drop(['area_diff'] , axis=1)

#train_num = train_Y.shape[0]
#train_X = Train_data[:train_num]
#test_X = Train_data[train_num:]

  


In [93]:
Test_data = Test_data.fillna(Test_data.mean())

Test_data["txn_dt"] = Test_data["txn_dt"]/100
Test_data["building_complete_dt"] = Test_data["building_complete_dt"]/100

Test_data["village_income_median"] = np.log1p(Test_data["village_income_median"])
Test_data["building_area"] = Test_data["building_area"].clip(1, 50)
Test_data["land_area"] = Test_data["land_area"].clip(1, 50)
Test_data["town_area"] = Test_data["town_area"].clip(1, 150)

Test_data["per_building_area"] = Test_data["building_area"] / Test_data["total_floor"]
Test_data["total_income"] = np.log1p(Test_data["village_income_median"] * Test_data["town_population"])
Test_data["area_percent"] = Test_data["land_area"] / Test_data["town_area"]
Test_data["per_parking_price"] = np.log1p(np.expm1(Test_data["parking_price"]) / Test_data["parking_area"])
Test_data["education_rate"] = Test_data["jobschool_rate"] + Test_data["highschool_rate"]

Test_data["area_diff"] = Test_data["land_area"] - Test_data["building_area"]
Test_data['area_type'] = Test_data['area_diff'].apply(area_type)
#Test_data = Test_data.drop(['area_diff'] , axis=1)

#train_num = train_Y.shape[0]
#train_X = Test_data[:train_num]
#test_X = Test_data[train_num:]

  


In [94]:
data = pd.concat([Train_data[:train_num], train_Y], axis=1)
test_data = Train_data[train_num:]

Train_data["txn_floor"] = Train_data["txn_floor"].astype('int')

cols = ["city", "building_type", "building_use", "total_floor", "txn_floor"]

for c in cols:
    
    mean_df = data.groupby([c])['total_price'].mean().reset_index()
    mean_df.columns = [c, f'{c}_mean']
    
    data = pd.merge(data, mean_df, on= c, how='left')
    data = data.drop([c] , axis=1)
    
    test_data = pd.merge(test_data, mean_df, on= c, how='left')   
    test_data = test_data.drop([c] , axis=1)
    
data = data.drop(['total_price'] , axis=1)

In [95]:
Train_data = pd.concat([data,test_data])
Train_data.head()

Unnamed: 0,building_material,txn_dt,building_complete_dt,parking_way,parking_area,land_area,building_area,town,lat,lon,...,area_percent,per_parking_price,education_rate,area_diff,area_type,city_mean,building_type_mean,building_use_mean,total_floor_mean,txn_floor_mean
0,8,186.74,62.71,2,1.869074,18.14446,3.418175,334,-39.14,117.08,...,12.81298,5.405929,0.429944,14.726285,1,14.763179,15.617864,15.495543,15.60855,15.427335
1,8,188.0,78.85,2,3.679982,11.387227,4.041309,180,-37.66,119.28,...,0.492159,9.146046,0.417768,7.345919,1,15.734002,15.368964,15.495543,15.399266,15.359153
2,8,192.89,60.28,2,3.090124,21.426802,5.584279,180,-37.67,119.29,...,0.926072,9.296132,0.417768,15.842523,1,15.734002,15.368964,15.495543,15.60855,15.786106
3,8,203.85,183.25,0,6.156056,11.387227,13.563031,343,-39.13,117.09,...,0.771895,9.486557,0.429944,-2.175803,0,14.763179,15.837634,15.495543,16.127585,15.839771
4,1,206.57,68.8,2,1.458056,50.0,4.688108,102,-39.24,117.19,...,0.703829,7.935444,0.429944,45.311892,1,14.763179,15.429428,15.495543,14.954292,15.429428


In [96]:
data = pd.concat([Test_data[:train_num], train_Y], axis=1)
test_data = Test_data[train_num:]

Test_data["txn_floor"] = Test_data["txn_floor"].astype('int')

cols = ["city", "building_type", "building_use", "total_floor", "txn_floor"]

for c in cols:
    
    mean_df = data.groupby([c])['total_price'].mean().reset_index()
    mean_df.columns = [c, f'{c}_mean']
    
    data = pd.merge(data, mean_df, on= c, how='left')
    data = data.drop([c] , axis=1)
    
    test_data = pd.merge(test_data, mean_df, on= c, how='left')   
    test_data = test_data.drop([c] , axis=1)
    
data = data.drop(['total_price'] , axis=1)

Test_data = pd.concat([data,test_data])
Test_data.head()

Unnamed: 0,building_material,txn_dt,building_complete_dt,parking_way,parking_area,land_area,building_area,town,lat,lon,...,area_percent,per_parking_price,education_rate,area_diff,area_type,city_mean,building_type_mean,building_use_mean,total_floor_mean,txn_floor_mean
0,1.0,186.34,18.57,2.0,2.360311,40.317789,3.418175,66.0,-37.72,119.45,...,1.84585,9.960309,0.380338,36.899614,1.0,15.489319,15.514043,15.494969,15.509137,15.514043
1,8.0,201.99,160.11,2.0,3.324365,18.14446,7.726227,4.0,-37.7,119.17,...,0.120963,8.620234,0.417768,10.418233,1.0,15.499352,15.486939,15.494969,15.607262,15.526547
2,8.0,197.69,182.94,2.0,1.661235,47.912742,12.170581,52.0,-38.03,118.24,...,0.764117,9.047427,0.430616,35.742162,1.0,15.528175,15.514043,15.475721,15.504763,15.514043
3,10.0,204.79,178.37,2.0,2.694642,2.222,2.252256,204.0,-37.7,119.34,...,0.388883,9.768376,0.417768,-0.030256,0.0,15.499352,15.512453,15.413125,15.452623,15.641323
4,8.0,181.64,132.72,2.0,3.964656,11.387227,5.813985,169.0,-37.74,119.59,...,0.498821,7.755617,0.445873,5.573243,1.0,15.486193,15.509054,15.494969,15.472321,15.46537


In [17]:
train_X = temp_df[:train_num]

estimator = xgb.XGBRegressor()
estimator.fit(train_X, train_Y)
feats = pd.Series(data=estimator.feature_importances_, index=temp_df.columns)
feats = feats.sort_values(ascending=False)



In [18]:
temp_df = temp_df.fillna(temp_df.mean())
train_X = temp_df[:train_num]

est = RandomForestRegressor()
est.fit(train_X, train_Y)
feats_est = pd.Series(data=est.feature_importances_, index=temp_df.columns)
feats_est = feats_est.sort_values(ascending=False)

In [55]:
high_feature = list(set(feats[feats!=0].index)&set(feats_est[feats_est!=0].index))
train_X = temp_df[:train_num][high_feature]
test_X = temp_df[train_num:][high_feature]

In [121]:
Test_data = Test_data.head(10000)

train_X = Train_data[high_feature]
test_X = Test_data[high_feature]

In [122]:
train_X = train_X.fillna(train_X.mean())
test_X = test_X.fillna(test_X.mean())

In [110]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.25, random_state=4)

estimator_ = RandomForestRegressor()
estimator_.fit(x_train, y_train)

print(cross_val_score(estimator_, x_train, y_train, cv=5).mean())

y_pred = estimator_.predict(x_test)
print("Mean squared error: %.4f"
      % mean_squared_error(y_test, y_pred))

0.9354817806763188
Mean squared error: 0.0750


In [124]:
estimator_ = RandomForestRegressor()
estimator_.fit(train_X, train_Y)

predict = estimator_.predict(test_X)

test_price = np.expm1(predict)

submit = pd.DataFrame(ids)
submit['total_price'] = test_price
submit.to_csv("2019-06-25.csv",index=False)

In [25]:
RFR = RandomForestRegressor()
pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('clf', RFR)])

In [29]:
# 先執行 GridSearchCV 跑出最佳參數
param_grid = {
    'pca__n_components': [4, 10, 20, 30, 40, 50, 64],
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth':[3, 5, 7]
}
search = GridSearchCV(pipe, param_grid, n_jobs=-1, iid=False, cv=5, return_train_score=False)
search.fit(train_X, train_Y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

KeyboardInterrupt: 

In [None]:
# 繪製不同 components 的 PCA 解釋度
pca.fit(X_digits)

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(pca.explained_variance_ratio_, linewidth=2)
ax0.set_ylabel('PCA explained variance')

ax0.axvline(search.best_estimator_.named_steps['pca'].n_components, linestyle=':', label='n_components chosen')
ax0.legend(prop=dict(size=12))

# 繪製不同採樣點的分類正確率
results = pd.DataFrame(search.cv_results_)
components_col = 'param_pca__n_components'
best_clfs = results.groupby(components_col).apply(lambda g: g.nlargest(1, 'mean_test_score'))

best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score', legend=False, ax=ax1)
ax1.set_ylabel('Classification accuracy (val)')
ax1.set_xlabel('n_components')
plt.tight_layout()
plt.show()