In [2]:
# 載入需要的套件
import os
import numpy as np 
import pandas as pd
import copy
import seaborn as sns
import xgboost as xgb
from scipy.stats import skew
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, Imputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Location of the input CSV files.
dir_data = './data/'
Train, Test = (os.path.join(dir_data, file_name)
               for file_name in ('train.csv', 'test.csv'))

# Read both files into DataFrames.
Train_data, Test_data = pd.read_csv(Train), pd.read_csv(Test)

In [4]:
# Keep the test-set identifiers; they become the first column of the submission.
ids = Test_data['building_id']

# The identifier is not used as a feature, so remove it from both frames.
Train_data = Train_data.drop(columns=['building_id'])
Test_data = Test_data.drop(columns=['building_id'])

In [5]:
# Inspect the missing-value status of a DataFrame.
def na_check(df_data):
    """Display the columns of ``df_data`` that contain missing values.

    Shows (via ``display``) a one-column table named 'Missing Ratio' holding
    the percentage of null entries per column, sorted from highest to lowest
    and limited to the first 10 rows. Columns with a ratio of exactly 0 are
    left out. Returns None.
    """
    null_pct = (df_data.isnull().sum() / len(df_data)) * 100
    null_pct = null_pct[null_pct != 0].sort_values(ascending=False)
    display(pd.DataFrame({'Missing Ratio': null_pct}).head(10))
    
def area_type(row):
    """Return 1 when ``row`` is >= 0, otherwise 0 (NaN therefore maps to 0)."""
    return 1 if row >= 0 else 0
    
def house_type(row):
    """Return 0 when ``row`` equals 0, otherwise 1 (NaN therefore maps to 1)."""
    return 0 if row == 0 else 1
    
def lat_diff(row):
    """Return 1 when ``row`` is strictly greater than -38.6, otherwise 0."""
    # NOTE(review): -38.6 is an undocumented cut-off on the `lat` column — confirm its origin.
    if row > -38.6:
        return 1
    return 0
    
def Parking_area_Fill(data):
    """Pick the parking area for one record, falling back to the filler value.

    ``data`` is indexed by 'parking_area' and, only when that value is NaN,
    by 'Parking_area_Filling'. The filler is returned when the original is
    NaN and the filler is not; in every other case the original value
    (possibly NaN) is returned.
    """
    area = data['parking_area']
    if not np.isnan(area):
        return area
    filler = data['Parking_area_Filling']
    return area if np.isnan(filler) else filler

In [6]:
# Row labels of training outliers, grouped by the inspection that surfaced them.
# The inspections were one-off look-ups such as
#   Train_data[Train_data["total_price"] > 3000000000]["total_floor"]
#   Train_data.sort_values(by="parking_price", ascending=False)[:4]
# They are kept here only as dictionary keys: mid-cell expressions are never
# displayed, so evaluating them on every run was pure overhead.
outlier_rows = {
    "total_price > 3000000000": [2138, 2317, 32180, 58858],
    "total_price > 1500000000 (checked txn_floor)": [37480],
    "total_price > 2000000000 (checked building_complete_dt)": [9491, 46934, 50076],
    "largest parking_price": [55472, 8795, 51731, 1351],
    "largest parking_area": [38617, 41914, 4339, 47253, 38992, 35611, 30232, 55200],
    "largest land_area": [
        10670, 41331, 39191, 53966, 7814, 13332, 9785, 14007, 2086,
        36482, 11170, 16838, 44123, 59506, 37526, 29570, 23399,
    ],
    "largest building_area": [
        16846, 16817, 26767, 33373, 18748, 43513, 13096, 20546, 23605,
        29104, 21239, 53345, 14944, 44963, 26014, 21231, 36955,
    ],
    "largest I_MIN": [9119, 3327],
    "largest III_MIN": [52324, 16859, 53310],
    "largest V_MIN": [9124, 58248],
    "largest VII_MIN": [25569, 58812, 1011, 34303, 46392],
    "largest VIII_MIN": [8686, 25855, 2652],
    "largest XI_MIN": [53654, 15654, 50764],
    "largest XII_MIN": [32153, 57831],
}
rows_to_drop = [label for labels in outlier_rows.values() for label in labels]

# One drop call instead of one full-frame copy per row. As with the original
# per-row drops, a KeyError is raised if any label is missing (for example when
# this cell is re-run on an already filtered frame).
Train_data = Train_data.drop(index=rows_to_drop)

# Missing transaction floors are encoded as 0.
Train_data['txn_floor'] = Train_data['txn_floor'].fillna(0)

In [7]:
# Regression target: log1p of the price per unit of building area.
train_Y = np.log1p(Train_data["total_price"].div(Train_data["building_area"]))
Train_data = Train_data.drop(columns=['total_price'])

# Stack train on top of test with a fresh 0..n-1 index so both share the same preprocessing.
df = pd.concat([Train_data, Test_data], ignore_index=True)
df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,parking_price,...,XIV_250,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN
0,8,21,18674,4,3,2,6271,2,,,...,21,58,1,157,1,2483,1,6011,1,34.469803
1,8,7,18800,5,1,2,7885,2,,,...,7,28,1,115,1,15872,1,32221,1,40.073573
2,8,7,19289,4,1,2,6028,2,,,...,27,78,1,212,1,15760,1,32228,1,54.462081
3,8,21,20385,24,0,2,18325,0,,81138.889762,...,2,20,1,125,1,2568,1,7271,1,99.628966
4,1,21,20657,2,4,2,6880,2,,,...,2,18,1,47,1,2587,1,7442,1,124.131236


In [8]:
# Show the highest missing-value ratios of the combined train + test frame.
na_check(df)

Unnamed: 0,Missing Ratio
parking_area,94.861711
parking_price,76.826931
txn_floor,3.77399
village_income_median,1.88485


In [9]:
# Treat `city` as a categorical (string) feature.
# The other candidates that were tried (building_material, town, village,
# building_type, building_use, parking_way) are deliberately left as they are.
df = df.astype({'city': str})

In [10]:
# Turn off pandas' chained-assignment check (no SettingWithCopyWarning is emitted).
pd.set_option('mode.chained_assignment', None)

In [11]:
# Both parking columns are discarded rather than imputed.
df = df.drop(columns=['parking_price', 'parking_area'])

# Missing transaction floors become 0; house_type then flags rows whose floor is non-zero.
df['txn_floor'] = df['txn_floor'].fillna(0)
df["house_type"] = df['txn_floor'].map(house_type)

# Binary flag derived from the `lat` column (see lat_diff for the threshold).
df["lat_diff"] = df['lat'].map(lat_diff)

In [12]:
# Mean of village_income_median per village, computed before any filling.
vimm = df.groupby('village').agg({'village_income_median':'mean'})

# Fill each missing value with its village's mean in one vectorised step.
# The previous per-row loop used chained assignment (df[col][i] = ...), which
# is slow and does not write back to df when pandas Copy-on-Write is active.
# Villages whose values are all missing stay NaN at this stage.
vim_isnan = df['village_income_median'].isna()
df.loc[vim_isnan, 'village_income_median'] = (
    df.loc[vim_isnan, 'village'].map(vimm['village_income_median'])
)

In [13]:
# Median of village_income_median per town, computed before any filling.
tvimm = df.groupby('town').agg({'village_income_median':'median'})

# Fill the values that are still missing with their town's median in one
# vectorised step. The previous per-row loop used chained assignment
# (df[col][i] = ...), which is slow and does not write back to df when pandas
# Copy-on-Write is active.
vim_isnan = df['village_income_median'].isna()
df.loc[vim_isnan, 'village_income_median'] = (
    df.loc[vim_isnan, 'town'].map(tvimm['village_income_median'])
)

In [14]:
# Re-check the missing-value ratios after the village_income_median imputation.
na_check(df)

Unnamed: 0,Missing Ratio


In [29]:
# Work on an independent copy so that df itself stays untouched by the feature engineering.
temp_df = df.copy(deep=True)

In [30]:
# Fill any remaining gaps with the column means. numeric_only=True is needed
# because `city` is a string column: without it, DataFrame.mean() raises
# TypeError on pandas >= 2.0.
temp_df = temp_df.fillna(temp_df.mean(numeric_only=True))

# Difference between transaction date and completion date; the two raw
# date columns are kept as features as well.
temp_df['building_age'] = temp_df['txn_dt'] - temp_df['building_complete_dt']

temp_df["village_income_median"] = np.log1p(temp_df["village_income_median"])

# Limit town_area to the range [1, 150].
temp_df["town_area"] = temp_df["town_area"].clip(1, 150)

temp_df["total_building_area"] = temp_df["building_area"] * temp_df["total_floor"]
# NOTE(review): village_income_median has already been log1p-transformed above,
# so this is log1p(log1p(income) * population) — confirm that this is intended.
temp_df["total_income"] = np.log1p(temp_df["village_income_median"] * temp_df["town_population"])
temp_df["area_percent"] = temp_df["land_area"] / temp_df["town_area"]

# area_diff is kept as a feature; area_type is 1 when land_area >= building_area.
temp_df["area_diff"] = temp_df["land_area"] - temp_df["building_area"]
temp_df['area_type'] = temp_df['area_diff'].apply(area_type)

# Second pass, covering NaNs in the newly derived columns.
temp_df = temp_df.fillna(temp_df.mean(numeric_only=True))

# The first len(train_Y) rows are the training rows; the rest are test rows.
train_num = train_Y.shape[0]
train_X = temp_df[:train_num]
test_X = temp_df[train_num:]

In [31]:
# Give both objects a fresh 0..n-1 index (train_Y still carries gaps left by the dropped rows).
temp_df = temp_df.reset_index(drop=True)
train_Y = train_Y.reset_index(drop=True)

In [32]:
# Cast before slicing. Previously the cast was applied to temp_df after Data
# and test_Data had been taken from it, and temp_df was then overwritten by
# the concat below, so the integer dtype never reached the final frame.
temp_df["txn_floor"] = temp_df["txn_floor"].astype('int')

# Explicit copies: columns are added below, which must not touch temp_df.
Data = temp_df[:train_num].copy()
# "total_price" here holds the regression target train_Y, attached by position
# so the result does not depend on the two indexes matching.
Data["total_price"] = train_Y.to_numpy()

test_Data = temp_df[train_num:].copy()

# Columns to replace with a smoothed per-category mean of the target.
cols = ["city"]

# Pseudo-count that pulls the mean of small categories towards the global mean.
smoothing = 300

for c in cols:

    mean = Data['total_price'].mean()
    agg = Data.groupby(c)['total_price'].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']
    # Weighted blend of the category mean and the global mean.
    mean_df = ((counts * means + smoothing * mean) / (counts + smoothing)).reset_index()
    mean_df.columns = [c, f'{c}_mean']

    # NOTE(review): the encoding is fitted on every training row, including the
    # rows later used as a hold-out, so hold-out scores may be optimistic.
    Data = pd.merge(Data, mean_df, on=c, how='left')
    Data = Data.drop([c], axis=1)

    # Categories absent from the training rows get NaN from the left merge.
    test_Data = pd.merge(test_Data, mean_df, on=c, how='left')
    test_Data = test_Data.drop([c], axis=1)

Data = Data.drop(['total_price'], axis=1)

# ignore_index avoids duplicate labels (both merged frames restart at 0).
temp_df = pd.concat([Data, test_Data], ignore_index=True)
temp_df.head()

Unnamed: 0,building_material,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,txn_floor,land_area,building_area,...,XIV_MIN,house_type,lat_diff,building_age,total_building_area,total_income,area_percent,area_diff,area_type,city_mean
0,8,18674,4,3,2,6271,2,3.0,18.14446,3.418175,...,34.469803,1,0,12403,13.672701,11.943494,12.81298,14.726285,1,12.760014
1,8,18800,5,1,2,7885,2,5.0,11.387227,4.041309,...,40.073573,1,1,10915,20.206544,15.062772,0.492159,7.345919,1,13.876577
2,8,19289,4,1,2,6028,2,1.0,21.426802,5.584279,...,54.462081,1,1,13261,22.337115,15.082789,0.926072,15.842523,1,13.876577
3,8,20385,24,0,2,18325,0,13.0,11.387227,13.563031,...,99.628966,1,0,2060,325.512737,13.769437,0.771895,-2.175803,0,12.760014
4,1,20657,2,4,2,6880,2,0.0,61.306524,4.688108,...,124.131236,0,0,13777,9.376217,13.468119,0.862986,56.618416,1,12.760014


In [33]:
# Independent copy of the encoded frame for the final fill and split.
temp_df2 = temp_df.copy(deep=True)

In [34]:
# Replace any remaining NaNs with the corresponding column mean.
temp_df2 = temp_df2.fillna(value=temp_df2.mean())

# Positional split: training rows come first, test rows follow.
train_num = len(train_Y)
train_X = temp_df2.iloc[:train_num]
test_X = temp_df2.iloc[train_num:]

In [35]:
# Training features plus the target (stored under the name "total_price").
temp_train = train_X.copy(deep=True)
temp_train["total_price"] = train_Y

# Correlation of every column with the target.
corr = temp_train.corr()["total_price"]

# Names of the features whose absolute correlation is at least 0.1.
high_feature = corr.index[corr.abs() >= 0.1].tolist()
high_feature.remove("total_price")
corr.loc[high_feature].sort_values(ascending=False)

city_mean              0.819205
XIII_10000             0.788092
VII_10000              0.784207
V_10000                0.778383
XIII_5000              0.775534
IX_10000               0.771217
VIII_10000             0.767341
III_10000              0.758846
X_10000                0.755207
XII_10000              0.747922
II_10000               0.746080
VI_10000               0.746023
XI_10000               0.742514
jobschool_rate         0.741049
I_10000                0.711451
IV_10000               0.710555
V_5000                 0.701114
bachelor_rate          0.688833
VII_5000               0.685748
VIII_5000              0.667650
XII_5000               0.657773
master_rate            0.654054
X_5000                 0.653218
III_5000               0.649411
IX_5000                0.647394
II_5000                0.645723
XI_5000                0.643991
lon                    0.631868
V_1000                 0.619983
doc_rate               0.606340
                         ...   
XIV_1000

In [36]:
# Split the training rows into a fitting part (75%) and a hold-out part (25%).
x_train, x_test, y_train, y_test = train_test_split(
    train_X, train_Y, test_size=0.25, random_state=4
)

estimator_ = LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
estimator_.fit(x_train, y_train)

# Mean 5-fold cross-validation score on the fitting part (estimator's default scorer).
cv_scores = cross_val_score(estimator_, x_train, y_train, cv=5)
print(cv_scores.mean())

# Hold-out error, measured on the scale of train_Y (log1p of the unit price).
y_pred = estimator_.predict(x_test)
holdout_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.4f" % holdout_mse)

0.9495237912211538
Mean squared error: 0.0441


In [37]:
# Refit on all training rows with the same hyper-parameters as the hold-out run.
estimator_ = LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
estimator_.fit(train_X, train_Y)

predict = estimator_.predict(test_X)

# Undo the log1p, round the unit price down, then scale by the building area.
test_price = np.floor(np.expm1(predict)) * test_X['building_area']
test_price = test_price.reset_index(drop=True)

# Submission: building_id alongside the predicted total_price.
submit = pd.DataFrame(ids)
submit['total_price'] = test_price
submit.to_csv("2019-07-18.csv", index=False)