In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import sklearn.feature_selection
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/scikit-learn warnings
# raised by the cells below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Use the SimHei font so that Chinese characters are displayed correctly
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Make the minus sign display correctly on axes with the font set above
from matplotlib import rcParams
rcParams['axes.unicode_minus']=False
# --- Load the preprocessed listings and build the modelling frame `Xy` ---
properties = pd.read_csv('dataset/PropertiesAfterPreprocessed_For_ModelTraining.csv')

# Keep only "Built-up" listings and the feature/target columns used below.
# .copy() makes Xy an independent frame, so the column assignment that follows
# writes to Xy itself instead of to a slice of `properties`.
Xy = properties.loc[
    properties["Size Type"] == "Built-up",
    [
        "Location", "Bathrooms", "Car Parks", "Furnishing",
        "Rooms Num", "Property Type Supergroup", "Size Num",
        "Price", "Price per Area", "Price per Room",
    ],
].copy()

# A missing "Car Parks" value is filled with 0 rather than dropping the row.
Xy["Car Parks"] = Xy["Car Parks"].fillna(0)

# Every other missing value disqualifies the whole row.
Xy = Xy.dropna()

# Rows whose furnishing is recorded as "Unknown" are removed.
Xy = Xy.loc[Xy["Furnishing"] != "Unknown"]

# One-hot encode the remaining non-numeric columns. This runs before the size
# filter below, so the set of dummy columns does not depend on that filter.
Xy = pd.get_dummies(Xy)

# Keep rows whose "Size Num" lies in [250, 20000] (both bounds inclusive).
Xy = Xy.loc[Xy["Size Num"].between(250, 20000)]
# Trim extreme rows: a row survives only if each of the three count features
# lies inside that feature's own 0.1%–99.9% quantile band (bounds inclusive).
selectors = []
for feature in ("Bathrooms", "Car Parks", "Rooms Num"):
    band_low = Xy[feature].quantile(0.001)
    band_high = Xy[feature].quantile(0.999)
    selectors.append(Xy[feature].between(band_low, band_high))

# One boolean column per feature; a row is kept when all of them are True.
inside_every_band = pd.concat(selectors, axis=1).all(axis=1)
Xy = Xy.loc[inside_every_band]
# --- Hold-out split, feature scaling, and train/test split ---
# Reserve 25% of the rows for feature selection; the remaining 75% stay in Xy.
Xy, Xy_feature_selection = sklearn.model_selection.train_test_split(
    Xy, test_size=0.25, random_state=101)
# Independent copies, so the column assignments below never write into a view.
Xy = Xy.copy()
Xy_feature_selection = Xy_feature_selection.copy()

cols = ["Bathrooms", "Car Parks", "Rooms Num", "Size Num"]

# The feature-selection hold-out is min-max scaled on its own, as before.
Xy_feature_selection[cols] = sklearn.preprocessing.MinMaxScaler().fit_transform(
    Xy_feature_selection[cols])

# Decide train/test membership BEFORE fitting the scaler, then fit it on the
# training rows only. Fitting on all of Xy (as was done previously) lets the
# test rows influence the scaling applied to the training data.
# The split depends only on the row count and random_state, so the partition
# is the same one obtained by splitting after scaling.
train_rows, test_rows = sklearn.model_selection.train_test_split(
    Xy, test_size=0.2, random_state=101)
scaler = sklearn.preprocessing.MinMaxScaler().fit(train_rows[cols])
# Test rows may now fall slightly outside [0, 1]; that is expected.
Xy[cols] = scaler.transform(Xy[cols])

# Columns that are not used as model inputs or targets.
unused_cols = ["Bathrooms", "Rooms Num", "Price per Room"]
Xy = Xy.drop(unused_cols, axis=1)
Xy_feature_selection = Xy_feature_selection.drop(unused_cols, axis=1)

# Materialise the train/test frames from the scaled, column-pruned Xy,
# preserving the row order produced by the split above.
Xy_train = Xy.loc[train_rows.index]
Xy_test = Xy.loc[test_rows.index]

# Two regression targets; everything else is a feature.
target_cols = ["Price", "Price per Area"]
X_train = Xy_train.drop(target_cols, axis=1)
y_train = Xy_train[target_cols]
X_test = Xy_test.drop(target_cols, axis=1)
y_test = Xy_test[target_cols]

In [None]:
# The steps above are kept together in one cell so they can be moved into the system easily

In [8]:
# List the feature columns the model is trained on (targets already removed).
X_train.columns

Index(['Car Parks', 'Size Num', 'Location_ampang', 'Location_ampang hilir',
       'Location_bandar damai perdana', 'Location_bandar menjalara',
       'Location_bangsar', 'Location_bangsar south', 'Location_batu caves',
       'Location_brickfields', 'Location_bukit bintang',
       'Location_bukit jalil', 'Location_bukit tunku (kenny hills)',
       'Location_cheras', 'Location_city centre',
       'Location_country heights damansara', 'Location_damansara heights',
       'Location_desa pandan', 'Location_desa parkcity',
       'Location_desa petaling', 'Location_dutamas', 'Location_jalan ipoh',
       'Location_jalan klang lama (old klang road)', 'Location_jalan kuching',
       'Location_jalan sultan ismail', 'Location_kepong', 'Location_keramat',
       'Location_kl city', 'Location_kl eco city', 'Location_kl sentral',
       'Location_klcc', 'Location_kuchai lama', 'Location_mont kiara',
       'Location_oug', 'Location_pandan perdana', 'Location_pantai',
       'Location_salak s

In [9]:
# Show the distinct "Car Parks" values after min-max scaling.
Xy["Car Parks"].unique()

array([0.14285714, 0.        , 0.42857143, 0.28571429, 0.57142857,
       1.        , 0.71428571, 0.85714286])

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Fit one extremely-randomized regression tree on the training split and
# predict both targets ("Price", "Price per Area") for the test rows.
# random_state pins the tree's random split choices so a rerun reproduces the
# same predictions; 101 matches the seed used for the data splits.
model = ExtraTreeRegressor(random_state=101)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)

[[7.50000000e+05 4.50992183e+02]
 [7.40000000e+05 6.33561644e+02]
 [4.05000000e+05 2.79310345e+02]
 ...
 [5.92500000e+05 4.47845805e+02]
 [2.15000000e+06 6.82106599e+02]
 [1.30642857e+06 1.08869048e+03]]


In [12]:
# Disabled experiment: loop over several candidate models, fit each one on the
# training split and record its score on the test split.
# NOTE(review): `models_str`, `models`, `score_` and `scores` are not defined
# in the cells visible here — define them before re-enabling this loop.
# for name,model in zip(models_str,models):
#     print('Start to train the Model: '+name)
#     model=model   #creating model
#     model.fit(X_train,y_train)
    
#     y_pred=model.predict(X_test)  
#     score=model.score(X_test,y_test)
#     score_.append(str(score)[:5])
#     scores.append(round(score, 2))
#     print(name +' Score: '+str(score))
#     print('-----------------------------------------------------------------')