# ***資料清整(training_data)***

In [None]:
import pandas as pd
import pyproj

# Load the raw training set
df = pd.read_csv('training_data.csv')

#-------------------------------------------------------------------------------
# Collapse the raw categorical labels into a few coarse groups.
#
# Each plan entry is (raw column, new column, {group: [raw labels in the group]}).
# The group tables are flattened into a raw-label -> group lookup and applied
# with Series.map, so a raw label that is not listed becomes NaN.
# Use value_counts() on the raw column to check its unique values.
#
# Raw value counts noted for this file:
#   主要用途: 住家用 8230, 集合住宅 2660, 其他 471, 商業用 263, 一般事務所 59,
#            國民住宅 29, 住商用 11, 工業用 11, 辦公室 11, 住工用 3, 店鋪 2, 廠房 1
#   主要建材: 鋼筋混凝土造 10923, 鋼骨造 419, 加強磚造 251, 其他 145,
#            鋼筋混凝土加強磚 12, 磚造 1
#   建物型態: 住宅大樓(11層含以上有電梯) 7148, 公寓(5樓含以下無電梯) 2437,
#            華廈(10層含以下有電梯) 2158, 透天厝 8
#-------------------------------------------------------------------------------
regroup_plan = [
    # main purpose of the building
    ('主要用途', 'main_purpose', {
        '住家用': ['住家用'],
        '集合住宅': ['集合住宅', '國民住宅'],
        '其他': ['其他'],
        '商業用': ['商業用', '一般事務所', '住商用', '辦公室', '店鋪'],
        '工業用': ['工業用', '住工用', '廠房'],
    }),
    # main construction material
    ('主要建材', 'building_material', {
        '鋼筋混凝土造': ['鋼筋混凝土造', '鋼筋混凝土加強磚'],
        '鋼骨造': ['鋼骨造'],
        '加強磚造': ['加強磚造', '磚造'],
        '其他': ['其他'],
    }),
    # building type: the parenthesised floor / elevator description is stripped
    ('建物型態', 'building_type', {
        '住宅大樓': ['住宅大樓(11層含以上有電梯)'],
        '公寓': ['公寓(5樓含以下無電梯)'],
        '華廈': ['華廈(10層含以下有電梯)'],
        '透天厝': ['透天厝'],
    }),
]

for raw_col, grouped_col, groups in regroup_plan:
    # Flatten {group: [labels]} into {label: group}
    dct = {label: group for group, labels in groups.items() for label in labels}
    df[grouped_col] = df[raw_col].map(dct)

#--------------------------------------------------------------------------
# One-hot encode the regrouped categorical columns:
#   main_purpose, building_material, building_type
#
# For each column the plan lists the indicator columns to KEEP.  The reference
# level (其他 for main_purpose and building_material, 透天厝 for building_type)
# is left out to avoid the dummy-variable trap.
#
# The indicators are re-indexed to that fixed list instead of dropping the
# reference column afterwards.  This gives the same columns as before when
# every level occurs, and still works when a level is absent from the data:
# drop() would raise KeyError and the feature set would silently change.
# A level that never occurs gets an all-zero column.
#--------------------------------------------------------------------------
df1 = df.copy()
one_hot_plan = [
    ('main_purpose', ['住家用', '集合住宅', '商業用', '工業用']),
    ('building_material', ['加強磚造', '鋼筋混凝土造', '鋼骨造']),
    ('building_type', ['公寓', '華廈', '住宅大樓']),
]
for source_col, kept_levels in one_hot_plan:
    dummy = pd.get_dummies(df1[source_col]).reindex(columns=kept_levels, fill_value=0)
    df1 = pd.concat((df1, dummy), axis=1)

#-------------------------------------------------------------------------------
# Coordinate conversion: TWD97 two-degree-zone coordinates -> WGS84 lon/lat
# Example:
#    input  "橫坐標, 縱坐標" = "305266, 2768378" (TWD97)
#    output "lon, lat"       = "121.5476,25.0225" (WGS84)
#-------------------------------------------------------------------------------
# Shared transformer, created on first use.  tran_coordination is called once
# per row through DataFrame.apply, so building a new Transformer on every call
# repeats the same setup for every record.
_twd97_to_wgs84 = None


def tran_coordination(x):
    """Convert one record's TWD97 coordinates to WGS84.

    Parameters
    ----------
    x : row-like object (e.g. a pandas Series) that can be indexed with
        '橫坐標' (easting) and '縱坐標' (northing).

    Returns
    -------
    tuple
        (lon, lat); always_xy=True makes the transformer use x/y
        (longitude first) ordering for both input and output.
    """
    global _twd97_to_wgs84
    if _twd97_to_wgs84 is None:
        # EPSG:3826 (TWD97 / 121 zone) -> EPSG:4326
        _twd97_to_wgs84 = pyproj.Transformer.from_crs(3826, 4326, always_xy=True)
    return _twd97_to_wgs84.transform(x['橫坐標'], x['縱坐標'])

# Convert every row's TWD97 pair and store the result as longitude / latitude columns
df1[['經度','緯度']] = df1.apply(tran_coordination, axis=1, result_type='expand')


#--------------------------------------------------------------------------
# Pick the columns to keep, fix their order and give them English names.
# Each entry is (source column, exported name); ID keeps its name.
#--------------------------------------------------------------------------
column_plan = [
    ('ID', 'ID'),
    # location of the house: longitude / latitude
    ('經度', 'lon'), ('緯度', 'lat'),
    ('屋齡', 'house_age'),
    # main purpose
    ('住家用', 'residence_housing'), ('集合住宅', 'congregate_housing'),
    ('商業用', 'commercial_use'), ('工業用', 'industrial_use'),
    # building type
    ('公寓', 'apartment'), ('華廈', 'building_low'), ('住宅大樓', 'building_high'),
    # main construction material
    ('加強磚造', 'RB'), ('鋼筋混凝土造', 'RC'), ('鋼骨造', 'SC'),
    ('土地面積', 'land_area'), ('建物面積', 'building_area'),
    ('主建物面積', 'main_building_area'), ('陽台面積', 'balcony_area'),
    ('附屬建物面積', 'auxiliary_area'),
    # floor of the unit being sold / total number of floors
    ('移轉層次', 'floor'), ('總樓層數', 'total_floor'),
    ('車位面積', 'parking_area'), ('車位個數', 'parking_number'),
    ('單價', 'unit_price'),
]
new_cols = [source for source, _ in column_plan]
renamed_cols = {source: target for source, target in column_plan if source != target}

df2 = df1[new_cols]                      # selected columns, original names
df3 = df2.rename(columns=renamed_cols)   # same data, English names

#------------------------------------------------------------------------
# Save the cleaned training set
#------------------------------------------------------------------------
df3.to_csv('clean_dataset.csv', index=False)




In [None]:
# Display the selected training columns (still under their original Chinese names)
df2

Unnamed: 0,ID,經度,緯度,屋齡,住家用,集合住宅,商業用,工業用,公寓,華廈,...,土地面積,建物面積,主建物面積,陽台面積,附屬建物面積,移轉層次,總樓層數,車位面積,車位個數,單價
0,TR-1,121.547608,25.022469,32.583333,1,0,0,0,0,0,...,-0.256716,-0.174154,0.393926,0.183700,-0.438452,11,11,-0.819326,0.0,4.627714
1,TR-2,121.502124,25.019127,24.166667,1,0,0,0,0,0,...,0.100134,0.314204,-0.316131,0.608577,-0.438452,7,12,-0.819326,0.0,1.887258
2,TR-3,120.365799,22.640966,6.166667,0,1,0,0,0,0,...,0.181921,0.423366,-0.098871,-0.360620,1.525881,10,15,0.161624,1.0,1.489072
3,TR-4,121.462402,25.058663,8.833333,0,1,0,0,0,0,...,0.085594,0.164249,-0.071147,0.315088,0.231984,9,14,0.524653,1.0,2.051217
4,TR-5,121.469444,25.023585,11.000000,1,0,0,0,0,0,...,-0.938116,0.985839,0.791954,1.719400,-0.438452,41,43,0.532377,1.0,3.269198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11746,TR-11747,121.298684,24.935576,28.333333,1,0,0,0,1,0,...,0.706189,-0.271825,0.146962,-0.181455,-0.171559,4,5,0.484488,1.0,0.903505
11747,TR-11748,120.973622,24.795109,29.250000,1,0,0,0,0,1,...,-0.225809,-0.325832,-0.181192,-0.067131,-0.163018,2,7,-0.819326,0.0,1.044041
11748,TR-11749,121.660968,25.072204,22.833333,1,0,0,0,0,0,...,-0.943908,-0.848662,-0.901999,-0.495421,-0.171559,15,17,-0.819326,0.0,2.144908
11749,TR-11750,121.453610,24.982953,25.083333,1,0,0,0,0,0,...,-1.147111,-1.199130,-1.333408,-0.708713,-0.438452,12,16,-0.819326,0.0,2.285444


In [None]:
# Display the same table after the columns were renamed to English
df3

Unnamed: 0,ID,lon,lat,house_age,residence_housing,congregate_housing,commercial_use,industrial_use,apartment,building_low,...,land_area,building_area,main_building_area,balcony_area,auxiliary_area,floor,total_floor,parking_area,parking_number,unit_price
0,TR-1,121.547608,25.022469,32.583333,1,0,0,0,0,0,...,-0.256716,-0.174154,0.393926,0.183700,-0.438452,11,11,-0.819326,0.0,4.627714
1,TR-2,121.502124,25.019127,24.166667,1,0,0,0,0,0,...,0.100134,0.314204,-0.316131,0.608577,-0.438452,7,12,-0.819326,0.0,1.887258
2,TR-3,120.365799,22.640966,6.166667,0,1,0,0,0,0,...,0.181921,0.423366,-0.098871,-0.360620,1.525881,10,15,0.161624,1.0,1.489072
3,TR-4,121.462402,25.058663,8.833333,0,1,0,0,0,0,...,0.085594,0.164249,-0.071147,0.315088,0.231984,9,14,0.524653,1.0,2.051217
4,TR-5,121.469444,25.023585,11.000000,1,0,0,0,0,0,...,-0.938116,0.985839,0.791954,1.719400,-0.438452,41,43,0.532377,1.0,3.269198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11746,TR-11747,121.298684,24.935576,28.333333,1,0,0,0,1,0,...,0.706189,-0.271825,0.146962,-0.181455,-0.171559,4,5,0.484488,1.0,0.903505
11747,TR-11748,120.973622,24.795109,29.250000,1,0,0,0,0,1,...,-0.225809,-0.325832,-0.181192,-0.067131,-0.163018,2,7,-0.819326,0.0,1.044041
11748,TR-11749,121.660968,25.072204,22.833333,1,0,0,0,0,0,...,-0.943908,-0.848662,-0.901999,-0.495421,-0.171559,15,17,-0.819326,0.0,2.144908
11749,TR-11750,121.453610,24.982953,25.083333,1,0,0,0,0,0,...,-1.147111,-1.199130,-1.333408,-0.708713,-0.438452,12,16,-0.819326,0.0,2.285444


***確認是否有缺失值***

In [None]:
# Count missing values per column of the cleaned training table
df3.isnull().sum()

ID                    0
lon                   0
lat                   0
house_age             0
residence_housing     0
congregate_housing    0
commercial_use        0
industrial_use        0
apartment             0
building_low          0
building_high         0
RB                    0
RC                    0
SC                    0
land_area             0
building_area         0
main_building_area    0
balcony_area          0
auxiliary_area        0
floor                 0
total_floor           0
parking_area          0
parking_number        0
unit_price            0
dtype: int64

# ***modeling***

這段程式碼使用了迴圈，依次對 regressors 中的每個回歸器進行以下操作：

1.
reg.fit(X_train, y_train): 使用訓練集 X_train 和 y_train 來訓練回歸模型 reg。

2.
y_pred=reg.predict(X_test): 使用訓練好的模型 reg 對測試集 X_test 進行預測，並將預測結果存儲在 y_pred 中。

3.
r2=r2_score(y_test, y_pred): 使用 r2_score 函數計算模型在測試集上的 R2 分數。

4.
mape=mean_absolute_percentage_error(y_test, y_pred): 使用 mean_absolute_percentage_error 函數計算模型在測試集上的平均絕對百分比誤差。

最後，使用 print 函數輸出每個模型的評估結果。

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct  4 14:51:50 2023

@author:
"""

import pandas as pd
from sklearn.model_selection import train_test_split

#--- Generative Ensemble Learning(集成式學習) Algorithm for House Price Prediction
# Bootstrap aggregation (bagging)
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
# Gradient boosting (梯度提升)
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

# Metric
from sklearn.metrics import r2_score                         #內部測試使用
from sklearn.metrics import mean_absolute_percentage_error   #比賽使用

#------------------------------------------------------------------------
# Load the cleaned training set
#------------------------------------------------------------------------
df = pd.read_csv('clean_dataset.csv')


#------------------------------------------------------------------------
# Data split: training & test data
#------------------------------------------------------------------------
y = df['unit_price']                        # label: the unit price
X = df.drop(columns=['unit_price', 'ID'])   # features: every other column except ID
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,       # share of rows held out as the test set (0.2 = 20%)
    random_state=2023,
)

#------------------------------------------------------------------------
# Modelling & performance evaluation
#------------------------------------------------------------------------
# (display name, estimator) pairs; each estimator is fitted in place below
regressors = [
    ('Bagging Regressor', BaggingRegressor(random_state=2023)),
    ('ExtraTrees Regressor', ExtraTreesRegressor(random_state=2023)),
    ('Random Forest', RandomForestRegressor(random_state=2023)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=2023)),
    ('XGB Regressor', xgb.XGBRegressor(random_state=2023)),
    ('LightGBM Regressor', lgb.LGBMRegressor(objective='regression', metric='l1', seed=2023)),
]

report_line = '{0}:\n\tR2 Score(testing)={1:.4f}, MAPE(testing)={2:.4f}'
for name, reg in regressors:
    reg.fit(X_train, y_train)        # train on the training split
    y_pred = reg.predict(X_test)     # predict the held-out split
    # Score the held-out predictions
    r2 = r2_score(y_test, y_pred)                            # R2 on the test split
    mape = mean_absolute_percentage_error(y_test, y_pred)    # MAPE on the test split
    print(report_line.format(name, r2, mape))







Bagging Regressor:
	R2 Score(testing)=0.9125, MAPE(testing)=0.1103
ExtraTrees Regressor:
	R2 Score(testing)=0.8873, MAPE(testing)=0.1089
Random Forest:
	R2 Score(testing)=0.9229, MAPE(testing)=0.1046
Gradient Boosting Regressor:
	R2 Score(testing)=0.8769, MAPE(testing)=0.1366
XGB Regressor:
	R2 Score(testing)=0.9172, MAPE(testing)=0.1102
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2392
[LightGBM] [Info] Number of data points in the train set: 9400, number of used features: 21
[LightGBM] [Info] Start training from score 1.995595
LightGBM Regressor:
	R2 Score(testing)=0.9175, MAPE(testing)=0.1136


# ***就結果來說，隨機森林較為適用，它具有最高的 R2 Score 和最低的 MAPE***

# **R2 分數（R-squared）：**

定義：R2 分數是一個統計量，表示模型所解釋的變異量相對於總變異量的比例。它衡量了模型對目標變數的解釋能力，**最大值為 1；在測試集上也可能出現負值（表示模型的預測比直接使用平均值還差）。一般來說，R2 分數越接近 1，表示模型的解釋能力越好。**

解釋：R2 分數可以理解為模型所預測的變異量占總變異量的比例。如果 **R2 = 1，表示模型完美地擬合了資料；如果 R2 = 0，表示模型無法解釋目標變數的變異性。**

注意事項：然而，R2 分數也可能因為模型過度擬合或者特徵選擇不當等原因而失效，因此在使用時需要結合其他指標進行綜合評估。

# **平均絕對百分比誤差（MAPE）：**

定義：MAPE 是一個衡量預測值與實際值之間相對誤差的指標，以百分比的形式表示。它計算了**預測值相對於實際值的平均相對誤差率。**

解釋：**MAPE的值越低越好，表示模型的預測值與實際值之間的相對誤差越小**。MAPE 的一個特點是，它可以直觀地評估模型的預測準確度，並且可以用百分比的形式來呈現。

注意事項：MAPE 也可能存在一些問題，特別是在實際值中包含零或接近零的情況下，因為這可能會導致 MAPE 的計算出現問題。

**綜合來說，R2 分數主要用於評估模型的解釋能力，而 MAPE 則用於評估模型的預測準確度。在實際應用中，建議綜合考慮這兩個指標以全面評估模型的性能。**

# ***資料清整(public_data)***

In [None]:
import pandas as pd
import pyproj

# Load the raw public set
df = pd.read_csv('/content/public_dataset.csv')

#-------------------------------------------------------------------------------
# Collapse the raw categorical labels into a few coarse groups.
#
# Each plan entry is (raw column, new column, {group: [raw labels in the group]}).
# The group tables are flattened into a raw-label -> group lookup and applied
# with Series.map, so a raw label that is not listed becomes NaN.
# Use value_counts() on the raw column to check its unique values.
#
# NOTE(review): the value counts that used to be quoted here (住家用 8230,
# 鋼筋混凝土造 10923, 住宅大樓 7148, ...) are identical to the ones in the
# training-data cell and appear to have been copied from it; re-run
# value_counts() to get the figures for this file.
#-------------------------------------------------------------------------------
regroup_plan = [
    # main purpose of the building
    ('主要用途', 'main_purpose', {
        '住家用': ['住家用'],
        '集合住宅': ['集合住宅', '國民住宅'],
        '其他': ['其他'],
        '商業用': ['商業用', '一般事務所', '住商用', '辦公室', '店鋪'],
        '工業用': ['工業用', '住工用', '廠房'],
    }),
    # main construction material
    ('主要建材', 'building_material', {
        '鋼筋混凝土造': ['鋼筋混凝土造', '鋼筋混凝土加強磚'],
        '鋼骨造': ['鋼骨造'],
        '加強磚造': ['加強磚造', '磚造'],
        '其他': ['其他'],
    }),
    # building type: the parenthesised floor / elevator description is stripped
    ('建物型態', 'building_type', {
        '住宅大樓': ['住宅大樓(11層含以上有電梯)'],
        '公寓': ['公寓(5樓含以下無電梯)'],
        '華廈': ['華廈(10層含以下有電梯)'],
        '透天厝': ['透天厝'],
    }),
]

for raw_col, grouped_col, groups in regroup_plan:
    # Flatten {group: [labels]} into {label: group}
    dct = {label: group for group, labels in groups.items() for label in labels}
    df[grouped_col] = df[raw_col].map(dct)

#--------------------------------------------------------------------------
# One-hot encode the regrouped categorical columns:
#   main_purpose, building_material, building_type
#
# For each column the plan lists the indicator columns to KEEP.  The reference
# level (其他 for main_purpose and building_material, 透天厝 for building_type)
# is left out to avoid the dummy-variable trap.
#
# The indicators are re-indexed to that fixed list instead of dropping the
# reference column afterwards.  The public set then always yields the same
# indicator columns, even when one of the levels does not occur in it:
# drop() would raise KeyError and the column selection further down would
# fail on the missing indicator.  A level that never occurs gets an
# all-zero column.
#--------------------------------------------------------------------------
df1 = df.copy()
one_hot_plan = [
    ('main_purpose', ['住家用', '集合住宅', '商業用', '工業用']),
    ('building_material', ['加強磚造', '鋼筋混凝土造', '鋼骨造']),
    ('building_type', ['公寓', '華廈', '住宅大樓']),
]
for source_col, kept_levels in one_hot_plan:
    dummy = pd.get_dummies(df1[source_col]).reindex(columns=kept_levels, fill_value=0)
    df1 = pd.concat((df1, dummy), axis=1)

#-------------------------------------------------------------------------------
# Coordinate conversion: TWD97 two-degree-zone coordinates -> WGS84 lon/lat
# Example:
#    input  "橫坐標, 縱坐標" = "305266, 2768378" (TWD97)
#    output "lon, lat"       = "121.5476,25.0225" (WGS84)
#-------------------------------------------------------------------------------
# Shared transformer, created on first use.  tran_coordination is called once
# per row through DataFrame.apply, so building a new Transformer on every call
# repeats the same setup for every record.
_twd97_to_wgs84 = None


def tran_coordination(x):
    """Convert one record's TWD97 coordinates to WGS84.

    Parameters
    ----------
    x : row-like object (e.g. a pandas Series) that can be indexed with
        '橫坐標' (easting) and '縱坐標' (northing).

    Returns
    -------
    tuple
        (lon, lat); always_xy=True makes the transformer use x/y
        (longitude first) ordering for both input and output.
    """
    global _twd97_to_wgs84
    if _twd97_to_wgs84 is None:
        # EPSG:3826 (TWD97 / 121 zone) -> EPSG:4326
        _twd97_to_wgs84 = pyproj.Transformer.from_crs(3826, 4326, always_xy=True)
    return _twd97_to_wgs84.transform(x['橫坐標'], x['縱坐標'])

# Convert every row's TWD97 pair and store the result as longitude / latitude columns
df1[['經度','緯度']] = df1.apply(tran_coordination, axis=1, result_type='expand')


#--------------------------------------------------------------------------
# Pick the columns to keep, fix their order and give them English names.
# Each entry is (source column, exported name); ID keeps its name.
# No price column is selected for the public set.
#--------------------------------------------------------------------------
column_plan = [
    ("ID", "ID"),
    # location of the house: longitude / latitude
    ('經度', 'lon'), ('緯度', 'lat'),
    ('屋齡', 'house_age'),
    # main purpose
    ('住家用', 'residence_housing'), ('集合住宅', 'congregate_housing'),
    ('商業用', 'commercial_use'), ('工業用', 'industrial_use'),
    # building type
    ('公寓', 'apartment'), ('華廈', 'building_low'), ('住宅大樓', 'building_high'),
    # main construction material
    ('加強磚造', 'RB'), ('鋼筋混凝土造', 'RC'), ('鋼骨造', 'SC'),
    ('土地面積', 'land_area'), ('建物面積', 'building_area'),
    ('主建物面積', 'main_building_area'), ('陽台面積', 'balcony_area'),
    ('附屬建物面積', 'auxiliary_area'),
    # floor of the unit being sold / total number of floors
    ('移轉層次', 'floor'), ('總樓層數', 'total_floor'),
    ('車位面積', 'parking_area'), ('車位個數', 'parking_number'),
]
new_cols = [source for source, _ in column_plan]
renamed_cols = {source: target for source, target in column_plan if source != target}

df2 = df1[new_cols]                      # selected columns, original names
df3 = df2.rename(columns=renamed_cols)   # same data, English names

#------------------------------------------------------------------------
# Save the cleaned public set
#------------------------------------------------------------------------
df3.to_csv('clean_public_dataset.csv', index=False)




In [None]:
# Count missing values per column of the cleaned public table
df3.isnull().sum()

ID                    0
lon                   0
lat                   0
house_age             0
residence_housing     0
congregate_housing    0
commercial_use        0
industrial_use        0
apartment             0
building_low          0
building_high         0
RB                    0
RC                    0
SC                    0
land_area             0
building_area         0
main_building_area    0
balcony_area          0
auxiliary_area        0
floor                 0
total_floor           0
parking_area          0
parking_number        0
dtype: int64

# ***predicted***

In [None]:
# Load the cleaned public set (it must contain every feature the models use).
# The cleaning cell writes this file with the relative path
# 'clean_public_dataset.csv', so it is read back from that same path; the
# previous absolute '/content/...' path only matched when the working
# directory happened to be /content.
df_public = pd.read_csv('clean_public_dataset.csv')
# Drop the ID column first: it was also dropped before the models were fitted
df_public_withoutID = df_public.drop(['ID'], axis=1)
# New DataFrame that collects the predictions
df_predictions = pd.DataFrame()

for name, model in regressors:
    y_pred_new_data = model.predict(df_public_withoutID)  # predict with this model

    # One column per model, named after the model
    df_predictions['predicted_price_{}'.format(name)] = y_pred_new_data


# Save all predictions to a single CSV file
df_predictions.to_csv('predicted_public_data.csv', index=False)


In [None]:
# Display the table with one prediction column per model
df_predictions

Unnamed: 0,predicted_price_Bagging Regressor,predicted_price_ExtraTrees Regressor,predicted_price_Random Forest,predicted_price_Gradient Boosting Regressor,predicted_price_XGB Regressor,predicted_price_LightGBM Regressor
0,1.648347,1.420116,1.739976,1.498928,1.628334,1.556677
1,1.756091,1.691163,1.711963,1.701257,1.625395,1.678118
2,2.662550,2.604696,2.711035,2.657292,2.389256,2.553599
3,1.718615,1.528657,1.594474,1.571107,1.442334,1.463020
4,3.939087,3.610466,3.871630,3.804075,3.703986,3.689169
...,...,...,...,...,...,...
5871,1.352285,1.380204,1.510715,1.338632,1.460928,1.458705
5872,1.827296,1.585855,1.785885,1.804351,1.836127,1.841921
5873,2.100405,2.120080,2.182150,2.456374,2.185474,2.306425
5874,2.782005,2.735394,2.799104,3.180606,2.909889,2.754383


# ***上傳格式***

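注意：這個 cell 執行完後，留在 df_predictions 裡的 predicted_price 並不是 6 種方法的平均值，而是 regressors 中最後一個模型 (LightGBM) 的預測值——迴圈每次都寫入同一個 predicted_price 欄位，後面的模型會覆蓋前面的結果（下方輸出與前面 predicted_price_LightGBM Regressor 欄位的數值完全相同）。若需要平均值，必須另外對各模型的預測結果取平均。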

In [None]:
# Drop the ID column first: it was also dropped before the models were fitted
df_public_withoutID = df_public.drop(['ID'], axis=1)
# Upload format: one row per public record with the columns ID, predicted_price
df_predictions = pd.DataFrame({'ID': df_public['ID']})

for name, model in regressors:
    y_pred_new_data = model.predict(df_public_withoutID)  # predict with this model

    # 'predicted_price' is one shared column that each model overwrites, so the
    # file has to be written inside the loop while the column still holds THIS
    # model's predictions.  Writing all files after the loop, as before, put
    # the last model's predictions into every file.
    df_predictions['predicted_price'] = y_pred_new_data

    file_name = '{}.csv'.format(name)  # the model name is used as the file name
    df_predictions[['ID', 'predicted_price']].to_csv(file_name, index=False)

# After the loop df_predictions holds ID plus the predictions of the last
# model in `regressors`.


In [None]:
# Keep only the two upload columns, ID first, and save them.
# NOTE(review): despite the file name, the frame holds a single
# 'predicted_price' column, not one column per model.
upload_columns = ['ID', 'predicted_price']
df_predictions = df_predictions.loc[:, upload_columns]
df_predictions.to_csv('predicted_public_data_withallthe_models.csv', index=False)

In [None]:
# Display the two-column (ID, predicted_price) upload table
df_predictions

Unnamed: 0,ID,predicted_price
0,PU-1,1.556677
1,PU-2,1.678118
2,PU-3,2.553599
3,PU-4,1.463020
4,PU-5,3.689169
...,...,...
5871,PU-5872,1.458705
5872,PU-5873,1.841921
5873,PU-5874,2.306425
5874,PU-5875,2.754383


這樣看來最好，會生成6個檔名為"模型名稱"的檔案，並且數值正常

In [None]:
# Features for prediction: the public set without its ID column
df_public_withoutID = df_public.drop(columns=['ID'])

# Frame that holds the upload columns; refilled for every model
df_predictions = pd.DataFrame()
for name, model in regressors:
    y_pred_new_data = model.predict(df_public_withoutID)  # predict with this model

    # Set the ID column and this model's predictions
    df_predictions = df_predictions.assign(
        ID=df_public['ID'],
        predicted_price=y_pred_new_data,
    )

    # Write the file immediately, named after the model
    file_name = '{}.csv'.format(name)
    df_predictions.to_csv(file_name, index=False)
