In [1]:
# 載入需要的套件
import os
import numpy as np 
import pandas as pd
import copy
import seaborn as sns
import xgboost as xgb
from scipy.stats import skew
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, Imputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
# Data directory and the paths of the train / test CSV files.
dir_data = './data/'
Train, Test = (os.path.join(dir_data, csv_name) for csv_name in ('train.csv', 'test.csv'))

# Read both files into DataFrames.
Train_data, Test_data = pd.read_csv(Train), pd.read_csv(Test)

In [33]:
# Keep the test-set building ids (they become the id column of the submission
# file) and drop the id column from both frames.
ids = Test_data['building_id']

Train_data, Test_data = (
    frame.drop(columns=['building_id']) for frame in (Train_data, Test_data)
)

In [34]:
# Inspect the missing-value status of a DataFrame.
def na_check(df_data):
    """Display the ten columns with the highest percentage of missing values.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to inspect.

    Columns whose missing ratio is exactly 0 are left out. The table (a single
    'Missing Ratio' column, sorted in descending order) is shown through the
    notebook's ``display``; nothing is returned.
    """
    null_counts = df_data.isnull().sum()
    ratio = (null_counts / len(df_data)) * 100
    fully_present = ratio.index[ratio == 0]
    ratio = ratio.drop(fully_present).sort_values(ascending=False)
    display(pd.DataFrame({'Missing Ratio': ratio}).head(10))
    
def area_type(row):
    """Return 1 when ``row`` is greater than or equal to 0, otherwise 0.

    NaN compares false against 0 and therefore yields 0.
    """
    return 1 if row >= 0 else 0
    
def house_type(row):
    """Return 0 when ``row`` equals 0 and 1 for every other value (NaN included)."""
    if row != 0:
        return 1
    return 0
    
def lat_diff(row):
    """Return 1 when ``row`` is strictly greater than -38.6, otherwise 0.

    NaN yields 0.
    NOTE(review): -38.6 is presumably a boundary in the dataset's latitude
    encoding -- confirm against the data description.
    """
    return int(row > -38.6)
    
def Parking_area_Fill(data):
    """Choose the parking area for one record, using the fill value as fallback.

    ``data`` is indexed with the keys ``'parking_area'`` and
    ``'Parking_area_Filling'``. When ``'parking_area'`` is NaN and
    ``'Parking_area_Filling'`` is not, the fill value is returned; in every
    other case the original ``'parking_area'`` value (possibly NaN) is
    returned. The fill key is only read when ``'parking_area'`` is NaN.
    """
    area = data['parking_area']
    if np.isnan(area) and not np.isnan(data['Parking_area_Filling']):
        return data['Parking_area_Filling']
    return area

In [35]:
# Remove hand-picked outlier rows from the training set.
#
# The values are row labels of ``Train_data`` as loaded from the CSV. They are
# grouped by the inspection that preceded each group of drops in the original
# cell. All rows are removed with one ``drop`` call instead of one full-frame
# copy per row. ``drop`` raises KeyError if any label is not present, exactly
# as the individual drops did.
outlier_rows = [
    # inspected via total_price > 3,000,000,000
    2138, 2317, 32180, 58858,
    # inspected via total_price > 1,500,000,000 (txn_floor)
    37480,
    # inspected via total_price > 2,000,000,000 (building_complete_dt)
    9491, 46934, 50076,
    # largest parking_price
    55472, 8795, 51731, 1351,
    # largest parking_area
    38617, 41914, 4339, 47253, 38992, 35611, 30232, 55200,
    # largest land_area
    10670, 41331, 39191, 53966, 7814, 13332, 9785, 14007, 2086,
    36482, 11170, 16838, 44123, 59506, 37526, 29570, 23399,
    # largest building_area
    16846, 16817, 26767, 33373, 18748, 43513, 13096, 20546, 23605,
    29104, 21239, 53345, 14944, 44963, 26014, 21231, 36955,
    # largest I_MIN
    9119, 3327,
    # largest III_MIN
    52324, 16859, 53310,
    # largest V_MIN
    9124, 58248,
    # largest VII_MIN
    25569, 58812, 1011, 34303, 46392,
    # largest VIII_MIN
    8686, 25855, 2652,
    # largest XI_MIN
    53654, 15654, 50764,
    # largest XII_MIN
    32153, 57831,
]

# Missing txn_floor is encoded as 0 in the training frame.
Train_data['txn_floor'] = Train_data['txn_floor'].fillna(0)

Train_data = Train_data.drop(outlier_rows)

In [36]:
# Target: log1p of the price per unit of building area.
train_Y = np.log1p(Train_data["total_price"].div(Train_data["building_area"]))
Train_data = Train_data.drop(columns=['total_price'])

# Stack train on top of test so both receive the same preprocessing; the
# first len(train_Y) rows of ``df`` are the training rows.
df = pd.concat([Train_data, Test_data]).reset_index(drop=True)
df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,parking_price,...,XIV_250,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN
0,8,21,18674,4,3,2,6271,2,,,...,21,58,1,157,1,2483,1,6011,1,34.469803
1,8,7,18800,5,1,2,7885,2,,,...,7,28,1,115,1,15872,1,32221,1,40.073573
2,8,7,19289,4,1,2,6028,2,,,...,27,78,1,212,1,15760,1,32228,1,54.462081
3,8,21,20385,24,0,2,18325,0,,81138.889762,...,2,20,1,125,1,2568,1,7271,1,99.628966
4,1,21,20657,2,4,2,6880,2,,,...,2,18,1,47,1,2587,1,7442,1,124.131236


In [37]:
# Show the share of missing values per column in the combined train+test frame.
na_check(df)

Unnamed: 0,Missing Ratio
parking_area,94.861711
parking_price,76.826931
txn_floor,3.77399
village_income_median,1.88485


In [38]:
# Cast the code-valued columns to strings so that they are picked up as
# categorical (object dtype) features.
df = df.astype({
    column: str
    for column in ('building_material', 'city', 'town', 'village',
                   'building_type', 'building_use', 'parking_way')
})

In [39]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
_object_dtype = ["object"]
categorical_features = df.select_dtypes(include=_object_dtype).columns
numerical_features = df.select_dtypes(exclude=_object_dtype).columns

In [40]:
# Report how many columns fall into each group.
print("Numerical features :", len(numerical_features))
print("Categorical features :", len(categorical_features))

Numerical features : 226
Categorical features : 7


In [41]:
# Keep building_area out of the skew/log transform: it is used on its raw
# scale as the divisor of the target and as the multiplier when total prices
# are rebuilt for the submission.
numerical_features = numerical_features.drop('building_area')

In [42]:
# Apply log1p to every numerical feature whose skewness exceeds 1.5 in
# absolute value.
# NOTE(review): columns that contain NaN appear to be skipped here
# (parking_price is still on its raw scale in the output below). scipy's
# ``skew`` propagates NaN by default and NaN never passes the threshold.
# Confirm this is intended.
skewness = df[numerical_features].apply(skew)
skewness = skewness[skewness.abs() > 1.5]
skewed_features = skewness.index
print(len(skewed_features), "skewed numerical features to log transform")
df[skewed_features] = np.log1p(df[skewed_features])

df.head()

127 skewed numerical features to log transform


Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,parking_price,...,XIV_250,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN
0,8,21,18674,4,3,2,6271,2,,,...,3.091042,4.077537,0.693147,5.062595,0.693147,2483,1,6011,1,3.568682
1,8,7,18800,5,1,2,7885,2,,,...,2.079442,3.367296,0.693147,4.75359,0.693147,15872,1,32221,1,3.715365
2,8,7,19289,4,1,2,6028,2,,,...,3.332205,4.369448,0.693147,5.361292,0.693147,15760,1,32228,1,4.0157
3,8,21,20385,24,0,2,18325,0,,81138.889762,...,1.098612,3.044522,0.693147,4.836282,0.693147,2568,1,7271,1,4.61144
4,1,21,20657,2,4,2,6880,2,,,...,1.098612,2.944439,0.693147,3.871201,0.693147,2587,1,7442,1,4.829363


In [43]:
# Silence pandas' chained-assignment (SettingWithCopy) warning. The
# element-wise ``df[col][i] = value`` writes used for the
# village_income_median imputation further down would otherwise trigger it.
pd.options.mode.chained_assignment = None

In [44]:
# A missing transaction floor is encoded as 0.
df['txn_floor'] = df['txn_floor'].fillna(0)

# Binary flags derived element-wise from txn_floor and lat:
#   house_type -> 0 when txn_floor is 0, otherwise 1
#   lat_diff   -> 1 when lat is greater than -38.6, otherwise 0
df["house_type"] = df['txn_floor'].map(house_type)
df["lat_diff"] = df['lat'].map(lat_diff)

In [45]:
# Fill missing village_income_median with the mean income of the same village.
#
# The lookup is vectorised and written back through a single ``.loc``
# assignment. The previous per-row ``df[col][i] = value`` form is chained
# assignment, which does not reach ``df`` when pandas Copy-on-Write is
# active. Villages whose values are all missing have a NaN mean and
# therefore stay NaN after this step.
vimm = df.groupby('village').agg({'village_income_median': 'mean'})

vim_isnan = df['village_income_median'].isnull()
df.loc[vim_isnan, 'village_income_median'] = (
    df.loc[vim_isnan, 'village'].map(vimm['village_income_median'])
)

In [46]:
# Fill the village_income_median values that are still missing with the
# median income of the same town.
#
# The lookup is vectorised and written back through a single ``.loc``
# assignment. The previous per-row ``df[col][i] = value`` form is chained
# assignment, which does not reach ``df`` when pandas Copy-on-Write is active.
tvimm = df.groupby('town').agg({'village_income_median': 'median'})

vim_isnan = df['village_income_median'].isnull()
df.loc[vim_isnan, 'village_income_median'] = (
    df.loc[vim_isnan, 'town'].map(tvimm['village_income_median'])
)

In [47]:
# Re-check the missing ratios after the village_income_median imputation.
na_check(df)

Unnamed: 0,Missing Ratio
parking_area,94.861711
parking_price,76.826931


In [48]:
# Set the two parking columns aside so that they are NOT mean-filled (they are
# imputed with a model afterwards), fill every other missing value with its
# column mean, then re-attach the parking columns at the end of the frame.
parking_price = df['parking_price']
parking_area = df['parking_area']

df = df.drop(['parking_price', 'parking_area'], axis=1)

# numeric_only=True: the frame still holds the string-typed categorical
# columns at this point, and DataFrame.mean() raises TypeError on them in
# pandas >= 2.0. Those columns contain no NaN, so skipping them changes nothing.
df = df.fillna(df.mean(numeric_only=True))
df['parking_price'] = parking_price
df['parking_area'] = parking_area

In [50]:
# Cast the code-valued columns back from strings to integers so that every
# column is numeric for the models below.
df = df.astype({
    column: int
    for column in ('building_material', 'city', 'town', 'village',
                   'building_type', 'building_use', 'parking_way')
})

In [51]:
# Impute missing parking_price with an XGBoost regressor trained on the rows
# where the price is known. The model is fit on log1p(parking_price); after
# this cell the whole column is on the log1p scale (known values are
# transformed, missing ones receive the model's prediction).
_price_known = df['parking_price'].notnull()

park_not_null = df[_price_known]
park_is_null = df[~_price_known].drop(columns=['parking_price'])

park_X = park_not_null.drop(columns=['parking_price'])
park_Y = np.log1p(park_not_null['parking_price'])

XGB = xgb.XGBRegressor()
XGB.fit(park_X, park_Y)
park_pred = XGB.predict(park_is_null)

df.loc[_price_known, 'parking_price'] = park_Y
df.loc[df['parking_price'].isnull(), 'parking_price'] = park_pred

df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,txn_floor,land_area,...,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,house_type,lat_diff,parking_price,parking_area
0,8,21,18674,4,3,2,6271,2,3.0,2.952013,...,0.693147,2483,1,6011,1,3.568682,1,0,4.42988,
1,8,7,18800,5,1,2,7885,2,5.0,2.516666,...,0.693147,15872,1,32221,1,3.715365,1,1,10.524208,
2,8,7,19289,4,1,2,6028,2,1.0,3.110257,...,0.693147,15760,1,32228,1,4.0157,1,1,10.45083,
3,8,21,20385,24,0,2,18325,0,13.0,2.516666,...,0.693147,2568,1,7271,1,4.61144,1,0,11.30393,
4,1,21,20657,2,4,2,6880,2,0.0,4.132066,...,0.693147,2587,1,7442,1,4.829363,0,0,8.312578,


In [52]:
# Impute missing parking_area with an XGBoost regressor trained on the rows
# where the area is known. Unlike parking_price, the target is used on its
# raw scale and the known values are left unchanged.
_area_known = df['parking_area'].notnull()

park_area_not_null = df[_area_known]
park_area_is_null = df[~_area_known].drop(columns=['parking_area'])

park_area_X = park_area_not_null.drop(columns=['parking_area'])
park_area_Y = park_area_not_null['parking_area']

XGBR = xgb.XGBRegressor()
XGBR.fit(park_area_X, park_area_Y)
park_area_pred = XGBR.predict(park_area_is_null)

df.loc[~_area_known, 'parking_area'] = park_area_pred

df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,txn_floor,land_area,...,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,house_type,lat_diff,parking_price,parking_area
0,8,21,18674,4,3,2,6271,2,3.0,2.952013,...,0.693147,2483,1,6011,1,3.568682,1,0,4.42988,4.278985
1,8,7,18800,5,1,2,7885,2,5.0,2.516666,...,0.693147,15872,1,32221,1,3.715365,1,1,10.524208,2.864383
2,8,7,19289,4,1,2,6028,2,1.0,3.110257,...,0.693147,15760,1,32228,1,4.0157,1,1,10.45083,2.450958
3,8,21,20385,24,0,2,18325,0,13.0,2.516666,...,0.693147,2568,1,7271,1,4.61144,1,0,11.30393,5.571222
4,1,21,20657,2,4,2,6880,2,0.0,4.132066,...,0.693147,2587,1,7442,1,4.829363,0,0,8.312578,1.145004


In [53]:
# Confirm that no column has missing values after the parking imputations.
na_check(df)

Unnamed: 0,Missing Ratio


In [54]:
# One-hot encode whatever object columns remain, keeping town and village out
# of the encoding and re-appending them as the last two columns.
# NOTE(review): the seven categorical columns were cast back to int earlier,
# so get_dummies presumably has nothing left to encode here -- confirm.
df_town, df_village = df["town"], df["village"]
df = pd.get_dummies(df.drop(['town', 'village'], axis=1))
df["town"], df["village"] = df_town, df_village
df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,txn_floor,land_area,...,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,house_type,lat_diff,parking_price,parking_area,town,village
0,8,21,18674,4,3,2,6271,2,3.0,2.952013,...,1,6011,1,3.568682,1,0,4.42988,4.278985,334,3132
1,8,7,18800,5,1,2,7885,2,5.0,2.516666,...,1,32221,1,3.715365,1,1,10.524208,2.864383,180,921
2,8,7,19289,4,1,2,6028,2,1.0,3.110257,...,1,32228,1,4.0157,1,1,10.45083,2.450958,180,1544
3,8,21,20385,24,0,2,18325,0,13.0,2.516666,...,1,7271,1,4.61144,1,0,11.30393,5.571222,343,3350
4,1,21,20657,2,4,2,6880,2,0.0,4.132066,...,1,7442,1,4.829363,0,0,8.312578,1.145004,102,63


In [55]:
# Work on an independent deep copy so that ``df`` keeps the preprocessed state.
temp_df = df.copy(deep=True)

In [56]:
# Feature engineering on the combined train+test frame.

# Fill any remaining NaN with the column mean.
temp_df = temp_df.fillna(temp_df.mean())

# Floor the parking columns at 0.1 (the upper bound is the column's own
# maximum, so only the lower bound has an effect). parking_area is used as a
# divisor below, and the missing values of both columns were filled with
# model predictions.
temp_df["parking_price"] = temp_df["parking_price"].clip(0.1, temp_df["parking_price"].max())
temp_df["parking_area"] = temp_df["parking_area"].clip(0.1, temp_df["parking_area"].max())

# Rescale the two date columns by a factor of 100.
temp_df["txn_dt"] = temp_df["txn_dt"]/100
temp_df["building_complete_dt"] = temp_df["building_complete_dt"]/100

# Put village_income_median on the log1p scale.
temp_df["village_income_median"] = np.log1p(temp_df["village_income_median"])

# Limit town_area to the range [1, 150].
temp_df["town_area"] = temp_df["town_area"].clip(1, 150)

# Derived features.
temp_df["total_building_area"] = temp_df["building_area"] * temp_df["total_floor"]
# NOTE(review): village_income_median is already log1p-transformed at this
# point, so total_income is log1p(log1p(income) * town_population) -- confirm
# this is intended.
temp_df["total_income"] = np.log1p(temp_df["village_income_median"] * temp_df["town_population"])
temp_df["area_percent"] = temp_df["land_area"] / temp_df["town_area"]
# parking_price is on the log1p scale, so expm1 restores the raw price before
# it is divided by the parking area.
temp_df["per_parking_price"] = np.log1p(np.expm1(temp_df["parking_price"]) / temp_df["parking_area"])

# NOTE(review): land_area may have been log-transformed by the skew step while
# building_area was explicitly excluded from it -- confirm that the two are on
# comparable scales before relying on their difference.
temp_df["area_diff"] = temp_df["land_area"] - temp_df["building_area"]
# area_type is 1 when area_diff >= 0, otherwise 0.
temp_df['area_type'] = temp_df['area_diff'].apply(area_type)
#temp_df = temp_df.drop(['area_diff'] , axis=1)

# Fill NaN introduced by the derived features with the column mean.
temp_df = temp_df.fillna(temp_df.mean())

# The first len(train_Y) rows are the training rows (train was concatenated
# before test). train_X / test_X are assigned again in a later cell.
train_num = train_Y.shape[0]
train_X = temp_df[:train_num]
test_X = temp_df[train_num:]

In [21]:
# Drop the education-rate columns.
# NOTE(review): this cell carries execution count 21 while its neighbours are
# at 56/57, and the feature listings printed further down still contain
# doc_rate, master_rate, bachelor_rate and jobschool_rate. It looks like this
# cell was not executed in the session that produced the saved outputs.
# Decide whether to keep it before relying on Restart & Run All.
temp_df = temp_df.drop(
    columns=['doc_rate', 'master_rate', "bachelor_rate", 'jobschool_rate', 'highschool_rate', "elementary_rate"]
)

In [57]:
# Give both objects a fresh 0..n-1 index. train_Y still carries the gapped row
# labels left by the outlier removal, and it is later aligned by index with
# the training rows of the feature frame.
temp_df = temp_df.reset_index(drop=True)
train_Y = train_Y.reset_index(drop=True)

In [58]:
# Independent deep copy of the engineered feature frame.
temp_df2 = temp_df.copy(deep=True)

In [59]:
# Fill any remaining NaN with the column mean, then split the combined frame
# back into its training rows (the first len(train_Y)) and its test rows.
temp_df2 = temp_df2.fillna(temp_df2.mean())

train_num = len(train_Y)
train_X, test_X = temp_df2.iloc[:train_num], temp_df2.iloc[train_num:]

In [60]:
# Correlation of every training feature with the target. The helper column is
# called "total_price" but holds train_Y, i.e. log1p(price / building_area).
temp_train = train_X.assign(total_price=train_Y)

corr = temp_train.corr()["total_price"]
# Keep the features whose absolute correlation is at least 0.001.
high_feature = corr[corr.abs() >= 0.001]

In [61]:
# Turn the correlation Series into a plain list of feature names and take out
# every name that contains "index" or "MIN".
#
# ``remove`` lists each excluded name exactly once. Appending a name once per
# matching substring would make the second ``list.remove`` raise ValueError
# for a name that contains both substrings.
_candidate_names = list(high_feature.index)

remove = [f for f in _candidate_names if ("index" in f) or ("MIN" in f)]

_excluded = set(remove)
high_feature = [f for f in _candidate_names if f not in _excluded]

In [62]:
# The target itself is part of the correlation index; take it out of the
# feature list. Guarded so that re-running this cell does not raise ValueError.
if "total_price" in high_feature:
    high_feature.remove("total_price")
# The selection is currently not applied to the model inputs:
#train_X = train_X[high_feature]
#test_X = test_X[high_feature]

In [63]:
# Correlation with the target for the retained features, strongest first.
corr.loc[high_feature].sort_values(ascending=False)

XIII_10000              0.788092
VII_10000               0.784207
parking_price           0.779524
V_10000                 0.778383
XIII_5000               0.775534
IX_10000                0.771217
VIII_10000              0.767341
III_10000               0.758846
X_10000                 0.755207
XII_10000               0.747922
II_10000                0.746080
VI_10000                0.746023
XI_10000                0.742514
jobschool_rate          0.741049
I_10000                 0.711451
IV_10000                0.710555
V_5000                  0.701114
bachelor_rate           0.688833
VII_5000                0.685748
VIII_5000               0.667650
XII_5000                0.657773
master_rate             0.654054
X_5000                  0.653218
III_5000                0.649411
IX_5000                 0.647394
II_5000                 0.645723
XI_5000                 0.643991
XIII_1000               0.632166
lon                     0.631868
VII_1000                0.619968
          

In [64]:
# Rank the features by random-forest importance.
# random_state is fixed (same seed as the train/validation split) so that the
# ranking is reproducible from one run to the next.
est = RandomForestRegressor(random_state=4)
est.fit(train_X, train_Y)
feats_est = pd.Series(data=est.feature_importances_, index=train_X.columns)
feats_est = feats_est.sort_values(ascending=False)
feats_est



XIII_10000               5.095287e-01
parking_price            9.192639e-02
VI_10000                 5.574545e-02
death_rate               2.515035e-02
building_complete_dt     2.389784e-02
jobschool_rate           2.121535e-02
master_rate              2.115588e-02
txn_floor                1.603033e-02
land_area                1.377616e-02
txn_dt                   1.120102e-02
village_income_median    1.113001e-02
bachelor_rate            1.034551e-02
doc_rate                 9.137057e-03
total_floor              9.111854e-03
divorce_rate             7.771217e-03
junior_rate              7.762351e-03
area_diff                7.510570e-03
IX_10000                 6.745674e-03
born_rate                6.333112e-03
building_type            6.249401e-03
total_income             5.241392e-03
VII_1000                 4.428388e-03
per_parking_price        4.409047e-03
XII_1000                 4.383390e-03
area_percent             4.350151e-03
parking_area             3.732046e-03
building_are

In [66]:
# Split the training data into a training part and a 25% hold-out part.
x_train, x_test, y_train, y_test = train_test_split(
    train_X, train_Y, test_size=0.25, random_state=4
)

estimator_ = LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
estimator_.fit(x_train, y_train)

# Mean 5-fold cross-validation score on the training part (the estimator's
# default scorer).
cv_scores = cross_val_score(estimator_, x_train, y_train, cv=5)
print(cv_scores.mean())

# Hold-out error, measured on the log1p(price per building area) scale.
y_pred = estimator_.predict(x_test)
print("Mean squared error: {:.4f}".format(mean_squared_error(y_test, y_pred)))

0.9484813996896907
Mean squared error: 0.0449


In [67]:
# Refit on the full training set and predict the test rows.
estimator_ = LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
estimator_.fit(train_X, train_Y)

predict = estimator_.predict(test_X)

# Undo the log1p on the predicted price per unit area, round it down, and
# multiply by the building area to get the total price. The index is reset so
# that the values line up with ``ids`` when assigned below.
_unit_price = np.floor(np.expm1(predict))
test_price = (_unit_price * test_X['building_area']).reset_index(drop=True)

# Submission file: building_id plus the predicted total_price.
submit = ids.to_frame()
submit['total_price'] = test_price
submit.to_csv("2019-07-10.csv", index=False)