In [1]:
# 載入需要的套件
import os
import numpy as np 
import pandas as pd
import copy
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# 設定 data_path
dir_data = './data/'
Train = os.path.join(dir_data, 'train.csv')
Test = os.path.join(dir_data, 'test.csv')

# 讀取檔案
Train_data = pd.read_csv(Train)
Test_data = pd.read_csv(Test)

In [3]:
train_Y = np.log1p(Train_data['total_price'])
ids = Test_data['building_id']

tp = copy.deepcopy(np.log1p(Train_data['total_price']))

Train_data = Train_data.drop(['building_id', 'total_price'] , axis=1)
Test_data = Test_data.drop(['building_id'] , axis=1)

df = pd.concat([Train_data,Test_data])
df.head()

Unnamed: 0,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,parking_price,...,XIV_250,XIV_500,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN
0,8,21,18674,4,3,2,6271,2,,,...,21,58,1,157,1,2483,1,6011,1,34.469803
1,8,7,18800,5,1,2,7885,2,,,...,7,28,1,115,1,15872,1,32221,1,40.073573
2,8,7,19289,4,1,2,6028,2,,,...,27,78,1,212,1,15760,1,32228,1,54.462081
3,8,21,20385,24,0,2,18325,0,,81138.889762,...,2,20,1,125,1,2568,1,7271,1,99.628966
4,1,21,20657,2,4,2,6880,2,,,...,2,18,1,47,1,2587,1,7442,1,124.131236


In [4]:
df = df.fillna(df.mean())
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

In [5]:
estimator = RandomForestRegressor()
estimator.fit(train_X, train_Y)
feats = pd.Series(data=estimator.feature_importances_, index=df.columns)
feats = feats.sort_values(ascending=False)
feats

building_area            4.917167e-01
XIII_10000               2.306228e-01
building_complete_dt     2.730216e-02
land_area                2.107373e-02
jobschool_rate           2.029163e-02
elementary_rate          1.480585e-02
junior_rate              1.397445e-02
txn_dt                   1.151142e-02
XIII_5000                1.071521e-02
V_10000                  1.019834e-02
village_income_median    8.576955e-03
VII_1000                 5.880789e-03
marriage_rate            5.328903e-03
txn_floor                4.675455e-03
highschool_rate          4.605363e-03
V_5000                   3.464196e-03
XIV_5000                 3.383985e-03
II_5000                  3.103298e-03
lat                      3.060009e-03
II_1000                  2.908440e-03
total_floor              2.811217e-03
XII_1000                 2.762140e-03
parking_price            2.523695e-03
divorce_rate             2.414652e-03
X_5000                   2.395842e-03
VII_5000                 2.351098e-03
VII_500     

In [7]:
feats.head(37)

building_area            0.491717
XIII_10000               0.230623
building_complete_dt     0.027302
land_area                0.021074
jobschool_rate           0.020292
elementary_rate          0.014806
junior_rate              0.013974
txn_dt                   0.011511
XIII_5000                0.010715
V_10000                  0.010198
village_income_median    0.008577
VII_1000                 0.005881
marriage_rate            0.005329
txn_floor                0.004675
highschool_rate          0.004605
V_5000                   0.003464
XIV_5000                 0.003384
II_5000                  0.003103
lat                      0.003060
II_1000                  0.002908
total_floor              0.002811
XII_1000                 0.002762
parking_price            0.002524
divorce_rate             0.002415
X_5000                   0.002396
VII_5000                 0.002351
VII_500                  0.002290
building_type            0.002275
XII_500                  0.002085
death_rate    

In [18]:
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.946078274922838

In [53]:
# 高重要性特徵，經測試取前37個效果為最佳
high_feature = list(feats[:37].index)
train_X_ = train_X[high_feature]
cross_val_score(estimator, train_X_, train_Y, cv=5).mean()

0.9492400844012309

In [68]:
high_feature = list(feats[:37].index)
estimator_ = RandomForestRegressor(max_depth=20)
estimator_.fit(train_X[high_feature], train_Y)

predict = estimator_.predict(test_X[high_feature])

test_price = np.expm1(predict)

submit = pd.DataFrame(ids)
submit['total_price'] = test_price.astype(np.int32)
submit.to_csv("2019-05-31.csv",index=False)

In [104]:
from sklearn.linear_model import Lasso
L1_Reg = Lasso(alpha=0.000005)
MMEncoder = MinMaxScaler()
train_X = MMEncoder.fit_transform(train_X)
L1_Reg.fit(train_X, train_Y)
#L1_Reg.coef_



Lasso(alpha=5e-06, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [105]:
from itertools import compress
L1_mask = list(abs(L1_Reg.coef_)>0)
L1_list = list(compress(list(train_X), list(L1_mask)))
#L1_list

In [81]:
train_X_ = train_X[L1_list]
cross_val_score(estimator, train_X_, train_Y, cv=5).mean()

0.9466210498100425

In [106]:
MMEncoder = MinMaxScaler()
train_X = MMEncoder.fit_transform(train_X[L1_list])
test_X = MMEncoder.fit_transform(test_X[L1_list])

estimator_ = RandomForestRegressor()
estimator_.fit(train_X, train_Y)

predict = estimator_.predict(test_X)

test_price = np.expm1(predict)

submit = pd.DataFrame(ids)
submit['total_price'] = test_price.astype(np.int32)
submit.to_csv("2019-05-31.csv",index=False)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices