In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import sklearn.feature_selection
from sklearn.preprocessing import StandardScaler
import warnings

In [2]:
warnings.filterwarnings('ignore')
# 正常显示中文
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# 正常显示符号
from matplotlib import rcParams
rcParams['axes.unicode_minus']=False
properties = pd.read_csv('dataset/PropertiesAfterPreprocessed_For_ModelTraining.csv')
Xy = properties.loc[properties["Size Type"] == "Built-up"]

In [4]:
Xy = Xy.loc[:, [
    "Location", "Bathrooms", "Car Parks", "Furnishing", 
    "Rooms Num", "Property Type Supergroup", "Size Num", 
    "Price", "Price per Area", "Price per Room"]]

Xy.loc[:, "Car Parks"] = Xy["Car Parks"].fillna(0)

Xy = Xy.loc[Xy.isna().sum(axis=1) == 0]

Xy = Xy.loc[Xy["Furnishing"] != "Unknown"]

In [5]:
Xy

Unnamed: 0,Location,Bathrooms,Car Parks,Furnishing,Rooms Num,Property Type Supergroup,Size Num,Price,Price per Area,Price per Room
0,klcc,3.0,2.0,Fully Furnished,3.0,Serviced Residence,1335.0,1250000,936.329588,416666.666667
2,dutamas,4.0,2.0,Partly Furnished,3.0,Condominium,1875.0,1030000,549.333333,343333.333333
3,bukit jalil,3.0,2.0,Partly Furnished,5.0,Condominium,1513.0,900000,594.844679,180000.000000
7,sri petaling,2.0,1.0,Partly Furnished,3.0,Apartment,904.0,385000,425.884956,128333.333333
13,damansara heights,7.0,0.0,Partly Furnished,6.0,Bungalow,4842.0,4500000,929.368030,750000.000000
...,...,...,...,...,...,...,...,...,...,...
52940,batu caves,2.0,0.0,Partly Furnished,3.0,Condominium,1092.0,480000,439.560440,160000.000000
52941,jalan klang lama (old klang road),2.0,0.0,Partly Furnished,3.0,Serviced Residence,852.0,475000,557.511737,158333.333333
52942,sri hartamas,6.0,3.0,Partly Furnished,6.0,Condominium,3973.0,2700000,679.587214,450000.000000
52947,wangsa maju,2.0,0.0,Unfurnished,3.0,Condominium,1150.0,480000,417.391304,160000.000000


In [6]:
Xy = pd.get_dummies(Xy)
Xy["Size Num"].sort_values()
Xy["Size Num"].sort_values(ascending=False)
Xy = Xy.loc[Xy["Size Num"].between(250, 20000)]
selectors = []
for feature in ["Bathrooms", "Car Parks", "Rooms Num"]:
    selectors.append(Xy[feature].between(
        Xy[feature].quantile(0.001), 
        Xy[feature].quantile(0.999)))

In [7]:
Xy["Size Num"]

0        1335.0
2        1875.0
3        1513.0
7         904.0
13       4842.0
          ...  
52940    1092.0
52941     852.0
52942    3973.0
52947    1150.0
52948    1313.0
Name: Size Num, Length: 32403, dtype: float64

In [8]:
Xy = Xy.loc[(~pd.DataFrame(selectors).T).sum(axis=1) == 0]

In [9]:
Xy["Size Num"]

0        1335.0
2        1875.0
3        1513.0
7         904.0
13       4842.0
          ...  
52940    1092.0
52941     852.0
52942    3973.0
52947    1150.0
52948    1313.0
Name: Size Num, Length: 32347, dtype: float64

In [10]:
TestXy = Xy
# 模型边界0 - 7
def getMinMaxCarParks(carNum, Xy):
    up = carNum - Xy["Car Parks"].min()
    down = Xy["Car Parks"].max()-Xy["Car Parks"].min()
    res = up/down
    if res>0:
        return res
    else:
        return -res

# 模型边界11 - 820000
def getMinMaxSizeNum(sizeNum, Xy):
    up = sizeNum - Xy["Size Num"].mean()
    down = Xy["Size Num"].unique().std()
    res = up/down
    if res>0:
        return res
    else:
        return -res
    
print(getMinMaxCarParks(2, TestXy))
print(getMinMaxSizeNum(1335, TestXy))

0.2857142857142857
0.14785547282460298


In [11]:
Xy['Size Num'].max()

19180.0

In [12]:
Xy['Size Num'].mean()

1617.3873620428478

In [13]:
Xy['Size Num'].min()

250.0

In [15]:
Xy['Size Num'].unique().std()

1909.8877887180845

In [14]:
Xy, Xy_feature_selection = sklearn.model_selection.train_test_split(
    Xy, test_size=0.25, random_state=101)

In [15]:
cols = ["Bathrooms", "Car Parks", "Rooms Num", "Size Num"]

In [16]:
Xy_feature_selection[cols] = sklearn.preprocessing.MinMaxScaler().fit_transform(
    Xy_feature_selection[cols])

In [17]:
Xy["Size Num"]

4740      830.0
1073     1131.0
21186    4025.0
40465    1238.0
30856     819.0
          ...  
9376     3234.0
13062    1218.0
28996    1182.0
29319     986.0
21802    2070.0
Name: Size Num, Length: 24260, dtype: float64

In [18]:
Xy["Car Parks"]

4740     1.0
1073     0.0
21186    3.0
40465    0.0
30856    1.0
        ... 
9376     0.0
13062    2.0
28996    2.0
29319    2.0
21802    0.0
Name: Car Parks, Length: 24260, dtype: float64

In [19]:
Xy[cols] = sklearn.preprocessing.MinMaxScaler().fit_transform(Xy[cols])
Xy = Xy.drop(["Bathrooms", "Rooms Num"], axis=1)

In [20]:
Xy["Size Num"]

4740     0.030639
1073     0.046540
21186    0.199419
40465    0.052192
30856    0.030058
           ...   
9376     0.157633
13062    0.051136
28996    0.049234
29319    0.038880
21802    0.096144
Name: Size Num, Length: 24260, dtype: float64

In [21]:
Xy["Car Parks"]

4740     0.142857
1073     0.000000
21186    0.428571
40465    0.000000
30856    0.142857
           ...   
9376     0.000000
13062    0.285714
28996    0.285714
29319    0.285714
21802    0.000000
Name: Car Parks, Length: 24260, dtype: float64

In [18]:
Xy

Unnamed: 0,Car Parks,Size Num,Price,Price per Area,Price per Room,Location_ampang,Location_ampang hilir,Location_bandar damai perdana,Location_bandar menjalara,Location_bangsar,...,Furnishing_Unfurnished,Property Type Supergroup_Apartment,Property Type Supergroup_Bungalow,Property Type Supergroup_Condominium,Property Type Supergroup_Flat,Property Type Supergroup_Residential Land,Property Type Supergroup_Semi-detached House,Property Type Supergroup_Serviced Residence,Property Type Supergroup_Terrace/Link House,Property Type Supergroup_Townhouse
4740,0.142857,0.030639,350000,421.686747,1.166667e+05,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1073,0.000000,0.046540,1420000,1255.526083,1.420000e+06,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
21186,0.428571,0.199419,7200000,1788.819876,1.800000e+06,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
40465,0.000000,0.052192,693600,560.258481,2.312000e+05,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
30856,0.142857,0.030058,390000,476.190476,1.300000e+05,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9376,0.000000,0.157633,1600000,494.743352,4.000000e+05,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13062,0.285714,0.051136,920000,755.336617,3.066667e+05,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
28996,0.285714,0.049234,430000,363.790186,1.433333e+05,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
29319,0.285714,0.038880,600000,608.519270,2.000000e+05,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [17]:
Xy_feature_selection = Xy_feature_selection.drop(["Bathrooms", "Rooms Num"], axis=1)
Xy = Xy.drop("Price per Room", axis=1)
Xy_feature_selection = Xy_feature_selection.drop("Price per Room", axis=1)

In [18]:
Xy["Size Num"]

4740     0.030639
1073     0.046540
21186    0.199419
40465    0.052192
30856    0.030058
           ...   
9376     0.157633
13062    0.051136
28996    0.049234
29319    0.038880
21802    0.096144
Name: Size Num, Length: 24260, dtype: float64

In [19]:
Xy_train, Xy_test = sklearn.model_selection.train_test_split(Xy, test_size=0.2, random_state=101)

In [20]:
# 检查用

In [49]:
columns = ['Car Parks', 'Size Num', 'Location_ampang', 'Location_ampang hilir',
       'Location_bandar damai perdana', 'Location_bandar menjalara',
       'Location_bangsar', 'Location_bangsar south', 'Location_batu caves',
       'Location_brickfields', 'Location_bukit bintang',
       'Location_bukit jalil', 'Location_bukit tunku (kenny hills)',
       'Location_cheras', 'Location_city centre',
       'Location_country heights damansara', 'Location_damansara heights',
       'Location_desa pandan', 'Location_desa parkcity',
       'Location_desa petaling', 'Location_dutamas', 'Location_jalan ipoh',
       'Location_jalan klang lama (old klang road)', 'Location_jalan kuching',
       'Location_jalan sultan ismail', 'Location_kepong', 'Location_keramat',
       'Location_kl city', 'Location_kl eco city', 'Location_kl sentral',
       'Location_klcc', 'Location_kuchai lama', 'Location_mont kiara',
       'Location_oug', 'Location_pandan perdana', 'Location_pantai',
       'Location_salak selatan', 'Location_segambut', 'Location_sentul',
       'Location_seputeh', 'Location_setapak', 'Location_setiawangsa',
       'Location_sri hartamas', 'Location_sri petaling',
       'Location_sungai besi', 'Location_sunway spk', 'Location_taman desa',
       'Location_taman melawati', 'Location_taman tun dr ismail',
       'Location_titiwangsa', 'Location_wangsa maju',
       'Furnishing_Fully Furnished', 'Furnishing_Partly Furnished',
       'Furnishing_Unfurnished', 'Property Type Supergroup_Apartment',
       'Property Type Supergroup_Bungalow',
       'Property Type Supergroup_Condominium', 'Property Type Supergroup_Flat',
       'Property Type Supergroup_Residential Land',
       'Property Type Supergroup_Semi-detached House',
       'Property Type Supergroup_Serviced Residence',
       'Property Type Supergroup_Terrace/Link House',
       'Property Type Supergroup_Townhouse']

In [50]:
data_location = [0] * len([item for item in columns if item.find("Location") !=-1])
# 28
data_location[18] = 1
data_furnishing = [0] * len([item for item in columns if item.find("Furnishing") !=-1])
# 0
data_furnishing[1] = 1
data_property_type = [0] * len([item for item in columns if item.find("Property") !=-1])
# 6
data_property_type[2] = 1

In [51]:
a = getMinMaxCarParks(2, TestXy)
b = getMinMaxSizeNum(1875, TestXy)

In [52]:
data = [a, b]
data.extend(data_location)
data.extend(data_furnishing)
data.extend(data_property_type)

In [53]:
df = pd.DataFrame(columns=columns, data=[data])

In [54]:
df

Unnamed: 0,Car Parks,Size Num,Location_ampang,Location_ampang hilir,Location_bandar damai perdana,Location_bandar menjalara,Location_bangsar,Location_bangsar south,Location_batu caves,Location_brickfields,...,Furnishing_Unfurnished,Property Type Supergroup_Apartment,Property Type Supergroup_Bungalow,Property Type Supergroup_Condominium,Property Type Supergroup_Flat,Property Type Supergroup_Residential Land,Property Type Supergroup_Semi-detached House,Property Type Supergroup_Serviced Residence,Property Type Supergroup_Terrace/Link House,Property Type Supergroup_Townhouse
0,0.285714,0.134884,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [55]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

X_train = Xy_train.drop(["Price", "Price per Area"], axis=1)
y_train = Xy_train[["Price"]]
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(df)

print(y_pred)

[820000.]
