In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [2]:
train_dataset = pd.read_csv('/Users/wei7614/Documents/Programming/Python/ML/Kaggle/datasets/melbourne_housing_snapshot/train.csv')
test_dataset = pd.read_csv('/Users/wei7614/Documents/Programming/Python/ML/Kaggle/datasets/melbourne_housing_snapshot/test.csv')

train_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# 將dataset去除object的feature，只留下
def get_digit_dataset(dataset):
    return dataset[dataset.columns[dataset.dtypes != object]]

digit_train_data = get_digit_dataset(train_dataset)
digit_train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [4]:
# 用imputer補空值
from sklearn.impute import SimpleImputer

def fill_null_value_to_data(data):
    imputer = SimpleImputer()
    result = pd.DataFrame(imputer.fit_transform(data))
    result.columns = data.columns
    return result

train_data = fill_null_value_to_data(digit_train_data)
train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,208500.0
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,181500.0
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,223500.0
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,0.0,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,140000.0
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,250000.0


In [20]:
y_data = train_data.SalePrice
x_data = train_data.drop(['Id', 'SalePrice'], axis=1)

# 將資料分成訓練以及驗證
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, train_size=0.8, random_state=2)

# 找出最好的子樹數
val_scores = []
for tree_count in range(10, 310, 50):
    rf = RandomForestRegressor(n_estimators=tree_count)
    rf.fit(x_train, y_train)
    predict_val_y = rf.predict(x_val)
    mae = mean_absolute_error(predict_val_y, y_val)
    val_scores.append([tree_count, mae])

sorted_val = sorted(val_scores, key=lambda x : x[1])
best_tree_count = sorted_val[0][0]
print('best tree count = %d with mae = %.2f' %(best_tree_count, sorted_val[0][1]))

    

best tree count = 60 with mae = 18613.31


In [37]:
# 預測測試資料
digit_test_data = get_digit_dataset(test_dataset)
test_data = fill_null_value_to_data(digit_test_data)

x_test_data = test_data.drop('Id', axis=1)
x_test_data.head()

rf = RandomForestRegressor(n_estimators=best_tree_count)
rf.fit(x_train, y_train)
predict_y_test = rf.predict(x_test_data)

pd_predict_y = pd.DataFrame({'Id' : test_data['Id'].astype(int),
                             'SalePrice': predict_y_test})

#產生預測值
pd_predict_y.to_csv("./submission.csv", index=False)
pd_predict_y.head()

Unnamed: 0,Id,SalePrice
0,1461,125209.883333
1,1462,159289.166667
2,1463,185145.0
3,1464,181808.333333
4,1465,195148.15
