# Singapore Private Property Price Prediction

# 1.0 Loading Packages & Reading Dataset

## 1.1 Loading Packages

In [None]:
import pandas_profiling
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None
# Rebind warnings.warn so that every call made through it in this kernel is discarded.
warnings.warn = ignore_warn
%matplotlib inline

# Raw string: "\P", "\d" and "\p" are not valid escape sequences, so the
# non-raw literal triggers an invalid-escape warning on newer Python versions.
# NOTE(review): machine-specific absolute path - adjust for your environment.
os.chdir(r"C:\Python\data\project")

## 1.2 Reading Dataset

In [None]:
# Load the three transaction tables, one CSV per property type.
# index_col=False: the first column is not used as the row index.
apartment, condos, ec = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ("Apartments.csv", "Condos.csv", "Executive Condos.csv")
)

# 2.0 Data Exploration

## 2.1 Understanding the Size of Dataset

In [None]:
print(len(apartment), len(condos), len(ec))

## 2.2 Combining All Datasets into a Single Dataframe

In [None]:
df = pd.concat([apartment, condos, ec])
print("The total number of observations in the combined dataset is", len(df), "rows")

## 2.3 Preliminary Data Exploration Using ProfileReport

In [None]:
pandas_profiling.ProfileReport(df)

Based on the Profile Report, there are several important items to take note of, as follows:
1. There were several projects that had unknown completion dates, and their age was wrongly calculated as a result. To ensure the Age column has correct values, we manually found the completion dates of these 88 projects online and imputed these values into the dataset.

2. There are transactions that have more than 1 unit. In order to allow our model to be more robust and to focus on predicting property prices for transactions involving only 1 unit, we will remove transactions that have more than 1 unit.

3. Various transactions have missing values. In this case, we will remove transactions with no facilities information, impute missing values for Floor No, Floor No (Final) and Unit No as zero.

4. There are several variables that would not be useful in training our models. Therefore, these variables will be removed.

## 2.4 Impute Completion Dates for 88 Projects And Recalculate Property Age

### 2.4.1 Impute Completion Dates for 88 Projects

In [None]:
# Read in list of completion dates manually found
comp_date = pd.read_csv("Projects with Completion Date.csv", index_col = None, dtype = {'Completion Date': str})

In [None]:
# Check projects with currently unknown completion dates
df[df["Completion Date"] == "Unknown"]

In [None]:
# Set projects with unknown completion dates to a value of None
df.loc[df['Completion Date'] == "Unknown", 'Completion Date'] = None

In [None]:
# Index the dataframes using Project Name
df = df.set_index("Project Name")
comp_date = comp_date.set_index("Project Name")

In [None]:
# Check that the None values have been set correctly
df[df["Completion Date"].isna()]

In [None]:
# Impute values: fill only the missing completion dates. fillna with a Series
# aligns on the index, which both frames had set to "Project Name" in an
# earlier cell, so a missing row receives the looked-up date of its own
# project when that project is listed in comp_date (otherwise it stays missing).
df["Completion Date"] = df['Completion Date'].fillna(comp_date['Completion Date'])

In [None]:
df["Completion Date"].unique()

In [None]:
df = df.reset_index()

### 2.4.2 Recalculate Property Age

In [None]:
# Keep only the rows whose completion date is known (non-missing).
df = df[df["Completion Date"].notna()]

In [None]:
df["Completion Date"].unique()

#### In the case of uncompleted projects at time of sale, the age of the property should be set as 0.

In [None]:
df["Sale Date"] = pd.to_datetime(df["Sale Date"], format = "%d-%b-%y")

In [None]:
def _age_at_sale(row):
    """Sale year minus completion year; 0 when the project is marked "Uncompleted"."""
    if row["Completion Date"] == "Uncompleted":
        return 0
    return row["Sale Date"].year - int(row["Completion Date"])


df["Age"] = df.apply(_age_at_sale, axis=1)

In [None]:
df["Age"].describe()

In [None]:
# Negative ages (sale year earlier than completion year) are floored at 0.
df["Age"] = df["Age"].clip(lower=0)

In [None]:
df["Age"].describe()

## 2.5 Removing Transactions Involving More Than 1 Unit

In [None]:
df_new = df[df["No. of Units"] == 1] # 38 records
print("The total number of observations after removing transactions involving more than 1 unit is now", len(df_new))

## 2.6 Removing Transactions with No Facilities Information

In [None]:
# Rows with a missing "Carpark" value are dropped; this section uses that
# column as the marker for transactions without facilities information.
df_new = df_new[df_new["Carpark"].notna()]

In [None]:
df_new.columns[df_new.isna().any()].tolist()

## 2.7 Impute Missing Values for Floor No., Floor No. (Final) and Unit No.

In [None]:
df_new["Floor No"].unique()

In [None]:
# Impute 0 for units without Floor No.
# A single .loc assignment replaces the chained form df[col][mask] = 0, which
# can write to a temporary copy and leave df_new unchanged (it is a no-op
# under pandas Copy-on-Write).
df_new.loc[df_new['Floor No'].isna(), 'Floor No'] = 0

In [None]:
df_new["Floor No"] = df_new["Floor No"].astype(str)

In [None]:
# Impute 0 for units without Unit No.
# A single .loc assignment replaces the chained form df[col][mask] = 0, which
# can write to a temporary copy and leave df_new unchanged (it is a no-op
# under pandas Copy-on-Write).
df_new.loc[df_new['Unit No'].isna(), 'Unit No'] = 0

In [None]:
df_new["Unit No"] = df_new["Unit No"].astype(str)

In [None]:
df_new["Floor No (Final)"].unique()

In [None]:
# Impute 0 where "Floor No (Final)" is missing. A single .loc assignment is
# used because the chained form df[col][mask] = 0 can write to a temporary
# copy and leave df_new unchanged (it is a no-op under pandas Copy-on-Write).
df_new.loc[df_new["Floor No (Final)"].isna(), "Floor No (Final)"] = 0

In [None]:
df_new["Floor No (Final)"].unique()

In [None]:
df_new.dtypes

## 2.8 Remove Unwanted Variables

In [None]:
# Columns removed before modelling. "Unit Price ($ psf)" is deliberately NOT
# in this list, so it stays in the frame.
_unwanted_columns = [
    "SN", "Project Name", "Address", "No. of Units", "Type of Area",
    "Nett Price($)", "Unit Price ($ psm)", "Sale Date", "Address 1",
    "Address for Geocode", "Lease Start Date", "Latitude", "Longitude",
    "Completion Date", "Postal Sector", "Postal Code", "Planning Region",
    "Tenure", "Tenure (New)",
]
df_new = df_new.drop(columns=_unwanted_columns)

In [None]:
df_new.head()

In [None]:
df_new.columns[df_new.isna().any()].tolist()

In [None]:
len(df_new)

# 3.0 Exporting Preliminarily Cleaned Dataset

In [None]:
df_new.to_csv("Cleaned Dataset_v3.csv", index=None)

In [None]:
#Check Exported Dataset
df = pd.read_csv("Cleaned Dataset_v3.csv", index_col=None, dtype = {'Unit No': str, 'Floor No': str})
df.head()

# 4.0 Additional Data Exploration

## 4.1 Loading Additional Packages

In [None]:
import pandas_profiling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import  metrics, linear_model, ensemble, model_selection,feature_selection
from yellowbrick.regressor import PredictionError
%matplotlib inline

## 4.2 Reading New Dataset

In [None]:
df = pd.read_csv("Cleaned Dataset_v3.csv", index_col=None, dtype = {'Unit No': str, 'Floor No': str})

## 4.3 Data Exploration

### 4.3.1 Using ProfileReport

After loading our dataset, we will perform additional data exploration using the pandas_profiling package, which allows us to gain insights on the following:
1. Dataset Information
    a. Number of Variables
    b. Number of Observations
    c. Total Percentage of Missing Observations
2. Variable Types
3. Warnings on the following:
    a. Variables with large numbers of zeros
    b. Variables that are highly correlated
    c. Variables with unsupported data type
    d. Variables with high cardinality
    e. Presence and number of duplicated rows

In [None]:
import pandas_profiling as pp
pp.ProfileReport(df)

### 4.3.2 Discussion on ProfileReport
Based on the Profile Report above, we can see that there are several pairs of variables with high correlation. These variables are:
1. Condominiums and Private Flats Households (% of Total Households) AND Resident in Condominiums and Other Apartments (% of total residents)
Correlation Coefficient = 0.94806

2. Time(Month) AND Time(Quarter)
Correlation Coefficient = 0.99722

3. Uni Qualification (% of total) AND Monthly Household Income $8000 and above (% of Total Households)
Correlation Coefficient = 0.90933

In order to identify other possible highly correlated independent variables, we will plot an IV-IV correlation matrix.

### 4.3.3 Independent Variable - Independent Variable Correlation Matrix

In [None]:
# Pairwise correlation over numeric/boolean columns only. Selecting them
# explicitly keeps this cell working on pandas versions where DataFrame.corr()
# no longer drops text columns silently and raises instead.
corr = df.select_dtypes(include=["number", "bool"]).corr()
corr.style.background_gradient()

### 4.3.4 Discussion on IV-IV Correlation Matrix

Based on the IV-IV Correlation Matrix above, we can see that other than the IV pairs highlighted from the ProfileReport that are highly correlated, the following variable pair is also highly correlated:

#### 1. Condominiums and Private Flats Households (% of Total Households) AND Uni Qualification (% of total)
0.914189	


### 4.3.5 Summary of Data Exploration

Based on the above, we will be performing further data pre-processing as indicated below:

1. Changing Area (sqm) to Area (sqf)
2. Dropping Unnecessary columns
3. Dropping One Variable in Each Pair of Highly Correlated Variables
4. Dropping Variables Due to Business Domain Reasons
5. Log Distance
6. Create Expensive Area Indicator
7. Converting Categorical Data Types - Creating Dummy Variables
8. Create Lucky and Unlucky Unit No and Floor No
9. Re-arranging Target Variable, 'Unit Price (psf)', To the Last Column

# 5.0 Additional Data Pre-Processing

## 5.1. Changing Area (sqm) to Area (sqf)

In [None]:
df['Area (sqf)'] = df['Area (sqm)']*10.76

## 5.2. Dropping Unnecessary columns

In [None]:
# Three years of data are not enough for a time-series analysis, so the
# time-index columns are dropped together with the other listed columns.
df = df.drop(
    columns=[
        'Transacted Price ($)', 'LN (Price)', 'Area (sqm)',
        'Sale YM', 'Time (Quarter)', 'Time (Month)',
    ]
)

## 5.3. Dropping One Variable in Each Pair of Highly Correlated Variables

In [None]:


# Drop one member of each highly correlated pair identified in section 4.3.
df = df.drop(
    columns=[
        'Resident in Condominiums and Other Apartments (% of total residents)',
        'Monthly Household Income $8000 and above (% of Total Households)',
        'Condominiums and Private Flats Households (% of Total Households)',
    ]
)

## 5.4. Dropping Variables Due to Business Domain Reasons

In [None]:
# 'Postal District' is dropped in favour of keeping 'Planning Area'.
# 'Chill Out' is dropped in favour of food/dining: a contingency table showed
# that both follow a similar pattern.
df = df.drop(columns=['Postal District', 'Chill Out'])


## 5.5. Log Distance

Based on business intuition, it can be assumed that short and long distances are more important in predicting property prices. Therefore, we want to perform data transformation on these features to reflect this intuition. Additionally, when we visualize an example of one of the distance features, bus stop km, onto a histogram, we can see that the datapoints are typically skewed. Correspondingly, the data distribution does not closely follow the diagonal line in the probability plot that represents normal distribution.

In [None]:
from scipy.stats import norm
from scipy import stats


# Histogram with a fitted normal curve, followed by a normal probability plot
# of the raw bus-stop distance on a separate figure.
bus_stop_km = df['bus stop km']
sns.distplot(bus_stop_km, fit=norm)
fig = plt.figure()
res = stats.probplot(bus_stop_km, plot=plt)

Upon log transformation, we can see that the data distribution now follows a normal distribution more closely based on the plot below:

In [None]:
# Same two plots as for the raw distance, now on its natural logarithm.
log_bus_stop_km = np.log(df['bus stop km'])
sns.distplot(log_bus_stop_km, fit=norm)
fig = plt.figure()
res = stats.probplot(log_bus_stop_km, plot=plt)

Therefore, we want to perform a log transformation on all distance variables.

In [None]:
# Log-transform, in place, the columns at positions 14-34 (0-based; the slice
# end is exclusive). The +0.1 offset keeps a distance of zero finite.
# NOTE(review): the slice is positional - presumably these 21 columns are
# exactly the distance features; verify against df.columns before re-running.
df.iloc[:,14:35] = np.log(df.iloc[:,14:35]+0.1)

## 5.6 Create Expensive Area Indicator

In [None]:
fig = plt.figure(figsize=(20,10))
sns.boxplot(x = df["Planning Area"],y = df["CBD km"])
plt.title("Boxplot of CBD km Against Planning Area")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Price per square foot by planning area. The title previously repeated the
# CBD-distance plot's title, which mislabelled this figure.
fig = plt.figure(figsize=(20,10))
sns.boxplot(x = df["Planning Area"],y = df["Unit Price ($ psf)"])
plt.title("Boxplot of Unit Price ($ psf) Against Planning Area")
plt.xticks(rotation=90)
plt.show()

Based on the two plots above, we can see that properties in Planning Areas such as Newton, Orchard, River Valley, Downtown Core and Museum tend to command relatively higher prices than properties elsewhere, even though these areas are not necessarily the nearest to the CBD. For example, properties in Rochor appear to be nearer to the CBD, but their price range is not as high as that of properties in Museum. We therefore construct an additional feature, "Expensive_Area_Indicator", to be fed into our models to make them more robust.

In [None]:
def expensive_area_indicator(df):
    """Flag a row that lies in one of the premium planning areas.

    ``df`` is a single row (anything indexable by 'Planning Area'), as passed
    by ``DataFrame.apply(..., axis=1)``. Returns 1 for Newton, Orchard,
    River Valley, Downtown Core or Museum, and 0 for any other value.
    """
    premium_areas = ('Newton', 'Orchard', 'River Valley', 'Downtown Core', 'Museum')
    return 1 if df['Planning Area'] in premium_areas else 0


df['Expensive_Area_Indicator'] = df.apply(expensive_area_indicator, axis=1)

## 5.7. Converting Categorical Data Types - Creating Dummy Variables

In order to fit the categorical data into our models, we have to first create dummy variables for our categorical variables.

In [None]:
df_with_dummy = pd.get_dummies(df,columns = ['Purchaser Address Indicator','Planning Area'])

## 5.8. Create Lucky and Unlucky Unit No and Floor No

In Singapore, auspicious or inauspicious numbers are typically considered when one purchases new goods or services. This belief could therefore extend to the purchase of properties as well. Therefore, based on social-cultural factors in the local context, the following features were created as they are likely to be important in influencing buyers’ decision in purchasing a property:

In [None]:
def isunluckyunit(df):
    """Return 1 if the row's 'Unit No' is an all-digit string equal to 4, 44, 144, 244 or 344.

    ``df`` is a single row (anything indexable by 'Unit No'). Unit numbers
    that are not purely digits, and all other numbers, yield 0.
    """
    unit_no = df['Unit No']
    if not unit_no.isdigit():
        return 0
    return 1 if int(unit_no) in (4, 44, 144, 244, 344) else 0

def isunluckyfloor(df):
    """Return 1 if the row's 'Floor No' is an all-digit string equal to 4, otherwise 0.

    ``df`` is a single row (anything indexable by 'Floor No'). Floor labels
    that are not purely digits yield 0.
    """
    floor_no = df['Floor No']
    if not floor_no.isdigit():
        return 0
    return 1 if int(floor_no) == 4 else 0

def isluckyunit(df):
    """Return 1 if the row's 'Unit No' is an all-digit string equal to 8, 88, 188, 6 or 66.

    ``df`` is a single row (anything indexable by 'Unit No'). Unit numbers
    that are not purely digits, and all other numbers, yield 0.
    """
    unit_no = df['Unit No']
    if not unit_no.isdigit():
        return 0
    return 1 if int(unit_no) in (8, 88, 188, 6, 66) else 0

def isluckyfloor(df):
    """Return 1 if the row's 'Floor No' is an all-digit string equal to 8 or 6, otherwise 0.

    ``df`` is a single row (anything indexable by 'Floor No'). Floor labels
    that are not purely digits yield 0.
    """
    floor_no = df['Floor No']
    if not floor_no.isdigit():
        return 0
    return 1 if int(floor_no) in (8, 6) else 0

In [None]:
# New indicator column -> row-wise function that computes it (added in this order).
_flag_builders = [
    ('Isunlucky Unit No', isunluckyunit),
    ('Islucky Unit No', isluckyunit),
    ('Isunlucky Floor No', isunluckyfloor),
    ('Islucky Floor No', isluckyfloor),
]
for _flag_name, _flag_fn in _flag_builders:
    df_with_dummy[_flag_name] = df_with_dummy.apply(_flag_fn, axis=1)

# The raw identifiers are no longer needed once the indicators exist.
df_with_dummy = df_with_dummy.drop(columns=['Unit No', 'Floor No'])

## 5.9. Re-arranging target variable, 'Unit Price (psf)', to the last column

In [None]:
print('shape of dataset before re-arrangement:', "\n", df_with_dummy.shape, "\n")
cols = df_with_dummy.columns.tolist()
print('Cols before re-arrangement:', "\n", cols, "\n")
# Rotate the first column to the end. Open-ended slices replace the former
# hard-coded cols[1:86], which silently dropped every column past position 85
# whenever the frame had more than 86 columns.
# NOTE(review): assumes the first column is the target ('Unit Price ($ psf)') - confirm.
cols = cols[1:] + cols[:1]


df_with_dummy = df_with_dummy[cols]
print('shape of dataset after re-arrangement:', "\n", df_with_dummy.shape, "\n")
print('Cols after re-arrangement:', "\n", cols, "\n")

# 6.0 Splitting Dataset Into Train and Test

## 6.1. Split by Property Type and Sale Type

In [None]:
# For each property type: take its rows, split them into new sale (anything
# that is not "Resale") and resale, and drop the two type columns from the
# sale-type subsets. The per-type frame keeps 'Type of Sale'.
_type_cols = ['Property Type', 'Type of Sale']

condo = df_with_dummy[df_with_dummy['Property Type'] == "Condominium"]
condo_NS = condo[condo['Type of Sale'] != "Resale"].drop(columns=_type_cols)
condo_RS = condo[condo['Type of Sale'] == "Resale"].drop(columns=_type_cols)
condo = condo.drop(columns=['Property Type'])

apart = df_with_dummy[df_with_dummy['Property Type'] == "Apartment"]
apart_NS = apart[apart['Type of Sale'] != "Resale"].drop(columns=_type_cols)
apart_RS = apart[apart['Type of Sale'] == "Resale"].drop(columns=_type_cols)
apart = apart.drop(columns=['Property Type'])

Econdo = df_with_dummy[df_with_dummy['Property Type'] == "Executive Condominium"]
Econdo_NS = Econdo[Econdo['Type of Sale'] != "Resale"].drop(columns=_type_cols)
Econdo_RS = Econdo[Econdo['Type of Sale'] == "Resale"].drop(columns=_type_cols)
Econdo = Econdo.drop(columns=['Property Type'])

## 6.2. Split X and Y

In [None]:
# Features are every column except the last; the target is the last column.
x_apart_NS, y_apart_NS = apart_NS.iloc[:, :-1], apart_NS.iloc[:, -1]
x_apart_RS, y_apart_RS = apart_RS.iloc[:, :-1], apart_RS.iloc[:, -1]

x_condo_NS, y_condo_NS = condo_NS.iloc[:, :-1], condo_NS.iloc[:, -1]
x_condo_RS, y_condo_RS = condo_RS.iloc[:, :-1], condo_RS.iloc[:, -1]

x_Econdo_NS, y_Econdo_NS = Econdo_NS.iloc[:, :-1], Econdo_NS.iloc[:, -1]
x_Econdo_RS, y_Econdo_RS = Econdo_RS.iloc[:, :-1], Econdo_RS.iloc[:, -1]


## 6.3. Split Train and Test

In [None]:
# Seed NumPy's global generator so the masks below are reproducible. The six
# draws consume the random stream in this exact order, so reordering them
# would change every split.
np.random.seed(2018)
# One boolean mask per subset: each row is independently True (train) with
# probability 0.6 and False (test) with probability 0.4, so the realised
# split is only approximately 60/40.
train_apart_NS = np.random.choice([True, False], x_apart_NS.shape[0], replace=True, p=[0.6, 0.4])
train_apart_RS = np.random.choice([True, False], x_apart_RS.shape[0], replace=True, p=[0.6, 0.4])

train_condo_NS = np.random.choice([True, False], x_condo_NS.shape[0], replace=True, p=[0.6, 0.4])
train_condo_RS = np.random.choice([True, False], x_condo_RS.shape[0], replace=True, p=[0.6, 0.4])

train_Econdo_NS = np.random.choice([True, False], x_Econdo_NS.shape[0], replace=True, p=[0.6, 0.4])
train_Econdo_RS = np.random.choice([True, False], x_Econdo_RS.shape[0], replace=True, p=[0.6, 0.4])



# Rows where the mask is True form the training set; the inverted mask (~)
# selects the remaining rows as the test set. X and y use the same mask.
x_apart_NS_train, y_apart_NS_train = x_apart_NS.iloc[train_apart_NS, :], y_apart_NS.iloc[train_apart_NS]
x_apart_NS_test, y_apart_NS_test = x_apart_NS.iloc[~train_apart_NS, :], y_apart_NS.iloc[~train_apart_NS]

x_apart_RS_train, y_apart_RS_train = x_apart_RS.iloc[train_apart_RS, :], y_apart_RS.iloc[train_apart_RS]
x_apart_RS_test, y_apart_RS_test = x_apart_RS.iloc[~train_apart_RS, :], y_apart_RS.iloc[~train_apart_RS]

x_condo_NS_train, y_condo_NS_train = x_condo_NS.iloc[train_condo_NS, :], y_condo_NS.iloc[train_condo_NS]
x_condo_NS_test, y_condo_NS_test = x_condo_NS.iloc[~train_condo_NS, :], y_condo_NS.iloc[~train_condo_NS]

x_condo_RS_train, y_condo_RS_train = x_condo_RS.iloc[train_condo_RS, :], y_condo_RS.iloc[train_condo_RS]
x_condo_RS_test, y_condo_RS_test = x_condo_RS.iloc[~train_condo_RS, :], y_condo_RS.iloc[~train_condo_RS]

x_Econdo_NS_train, y_Econdo_NS_train = x_Econdo_NS.iloc[train_Econdo_NS, :], y_Econdo_NS.iloc[train_Econdo_NS]
x_Econdo_NS_test, y_Econdo_NS_test = x_Econdo_NS.iloc[~train_Econdo_NS, :], y_Econdo_NS.iloc[~train_Econdo_NS]

x_Econdo_RS_train, y_Econdo_RS_train = x_Econdo_RS.iloc[train_Econdo_RS, :], y_Econdo_RS.iloc[train_Econdo_RS]
x_Econdo_RS_test, y_Econdo_RS_test = x_Econdo_RS.iloc[~train_Econdo_RS, :], y_Econdo_RS.iloc[~train_Econdo_RS]


## 6.4 Back up all x of train and test data as original before standardization for linear regression 

In [None]:
# Snapshot the unscaled feature sets. .copy() makes each backup an independent
# object instead of a second name bound to the same data.
x_apart_NS_train_original = x_apart_NS_train.copy()
x_apart_NS_test_original = x_apart_NS_test.copy()

x_apart_RS_train_original = x_apart_RS_train.copy()
x_apart_RS_test_original = x_apart_RS_test.copy()

x_condo_NS_train_original = x_condo_NS_train.copy()
x_condo_NS_test_original = x_condo_NS_test.copy()

x_condo_RS_train_original = x_condo_RS_train.copy()
x_condo_RS_test_original = x_condo_RS_test.copy()

x_Econdo_NS_train_original = x_Econdo_NS_train.copy()
x_Econdo_NS_test_original = x_Econdo_NS_test.copy()

x_Econdo_RS_train_original = x_Econdo_RS_train.copy()
x_Econdo_RS_test_original = x_Econdo_RS_test.copy()

# Two names were misspelled ("orinigal") in the original notebook; the
# misspelled names are kept as aliases so any cell that uses them still runs.
x_condo_NS_test_orinigal = x_condo_NS_test_original
x_Econdo_NS_train_orinigal = x_Econdo_NS_train_original


# 7.0 Model Building

## 7.1. Linear Regression

### 7.1.1 MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
# A single scaler object is re-fitted for every subset below, so after this
# cell it only holds the parameters of the last fit (Executive Condo resale).
scaler = MinMaxScaler()

# Pattern for each subset: fit_transform learns the scaling from the training
# split and applies it; transform then applies that same training-derived
# scaling to the test split. Both calls return NumPy arrays, so the x_* names
# are rebound from DataFrames to arrays here.
x_apart_NS_train = scaler.fit_transform(x_apart_NS_train)
x_apart_NS_test = scaler.transform(x_apart_NS_test)

x_apart_RS_train = scaler.fit_transform(x_apart_RS_train)
x_apart_RS_test = scaler.transform(x_apart_RS_test)

x_condo_NS_train = scaler.fit_transform(x_condo_NS_train)
x_condo_NS_test = scaler.transform(x_condo_NS_test)

x_condo_RS_train = scaler.fit_transform(x_condo_RS_train)
x_condo_RS_test = scaler.transform(x_condo_RS_test)

x_Econdo_NS_train = scaler.fit_transform(x_Econdo_NS_train)
x_Econdo_NS_test = scaler.transform(x_Econdo_NS_test)

x_Econdo_RS_train = scaler.fit_transform(x_Econdo_RS_train)
x_Econdo_RS_test = scaler.transform(x_Econdo_RS_test)

### 7.1.2 Apartments New Sale

In [None]:
# Names of the feature columns, in the same order as the columns of the
# feature matrices. Taken from the feature frame (x_apart_NS) rather than from
# Econdo_NS, whose column list also contains the target as its last entry.
x_columns_all = x_apart_NS.columns

In [None]:
# Lasso regression: sweep the regularisation strength and record, per alpha,
# the test MSE, test R^2, L1 norm of the coefficients and MSE + alpha * L1.
alphas = [0.01, 0.1, 1, 10, 100]
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(x_apart_NS_train, y_apart_NS_train)
    apartment_y_pred = lasso.predict(x_apart_NS_test)
    test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_NS_test)
    l1_norm = np.linalg.norm(lasso.coef_, ord=1)
    MSE.append(test_mse)
    r2.append(lasso.score(x_apart_NS_test, y_apart_NS_test))
    complexity.append(l1_norm)
    cost.append(test_mse + (alpha * l1_norm))

In [None]:
print(MSE)
plt.plot(range(5), MSE)
plt.xticks(np.arange(5), alphas)

In [None]:
print(r2)
plt.plot(range(5), r2)
plt.xticks(np.arange(5), alphas)

In [None]:
print(complexity)
plt.plot(range(5), complexity)
plt.xticks(np.arange(5), alphas)

In [None]:
# Check variable importance from lasso regression
lasso.set_params(alpha = 1)
lasso.fit(x_apart_NS_train, y_apart_NS_train)
lasso.predict(x_apart_NS_test)
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
np.count_nonzero(lasso_coef)

In [None]:
# Try out top 10 features: the ten columns with the largest absolute lasso coefficient.
top10_idx = np.argsort(np.absolute(lasso_coef))[-10:]
x_apart_NS_train_reg = x_apart_NS_train[:, top10_idx]
x_apart_NS_test_reg = x_apart_NS_test[:, top10_idx]

In [None]:
# Linear regression with 10 features: report training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_apart_NS_train_reg, y_apart_NS_train)

apartment_y_pred = reg.predict(x_apart_NS_train_reg)
train_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_NS_train)
train_r2 = reg.score(x_apart_NS_train_reg, y_apart_NS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

apartment_y_pred = reg.predict(x_apart_NS_test_reg)
test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_NS_test)
test_r2 = reg.score(x_apart_NS_test_reg, y_apart_NS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
# Try out all selected features.
# NOTE(review): 41 is hard-coded - presumably the non-zero coefficient count
# shown by the preceding cell; confirm after any re-run.
ranked_idx = np.argsort(np.absolute(lasso_coef))
x_apart_NS_train_reg = x_apart_NS_train[:, ranked_idx[-41:]]
x_apart_NS_test_reg = x_apart_NS_test[:, ranked_idx[-41:]]
x_columns = x_columns_all[ranked_idx][-41:]

In [None]:
# Linear regression with all selected features: training metrics, test
# metrics, then the feature names and magnitudes sorted by |coefficient|.
reg = linear_model.LinearRegression().fit(x_apart_NS_train_reg, y_apart_NS_train)

apartment_y_pred = reg.predict(x_apart_NS_train_reg)
train_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_NS_train)
train_r2 = reg.score(x_apart_NS_train_reg, y_apart_NS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

apartment_y_pred = reg.predict(x_apart_NS_test_reg)
test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_NS_test)
test_r2 = reg.score(x_apart_NS_test_reg, y_apart_NS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

reg_coef = reg.coef_
abs_reg_coef = np.absolute(reg_coef)
print(x_columns[np.argsort(abs_reg_coef)])
print(np.sort(abs_reg_coef))

In [None]:
from yellowbrick.regressor import PredictionError

# Instantiate the linear model and visualizer
#lasso = Lasso()
visualizer = PredictionError(reg)

visualizer.fit(x_apart_NS_train_reg, y_apart_NS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_NS_test_reg, y_apart_NS_test)  # Evaluate the model on the test data
g = visualizer.poof()

### 7.1.3 Apartments Resale

In [None]:
# Lasso regression: sweep the regularisation strength and record, per alpha,
# the test MSE, test R^2, L1 norm of the coefficients and MSE + alpha * L1.
alphas = [0.01, 0.1, 1, 10, 100]
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(x_apart_RS_train, y_apart_RS_train)
    apartment_y_pred = lasso.predict(x_apart_RS_test)
    test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_RS_test)
    l1_norm = np.linalg.norm(lasso.coef_, ord=1)
    MSE.append(test_mse)
    r2.append(lasso.score(x_apart_RS_test, y_apart_RS_test))
    complexity.append(l1_norm)
    cost.append(test_mse + (alpha * l1_norm))

In [None]:
print(MSE)
plt.plot(range(5), MSE)
plt.xticks(np.arange(5), alphas)

In [None]:
print(r2)
plt.plot(range(5), r2)
plt.xticks(np.arange(5), alphas)

In [None]:
print(complexity)
plt.plot(range(5), complexity)
plt.xticks(np.arange(5), alphas)

In [None]:
# Check variable importance for lasso regression
lasso.set_params(alpha = 0.1)
lasso.fit(x_apart_RS_train, y_apart_RS_train)
lasso.predict(x_apart_RS_test)
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
np.count_nonzero(lasso_coef)

In [None]:
# Keep the ten columns with the largest absolute lasso coefficient.
top10_idx = np.argsort(np.absolute(lasso_coef))[-10:]
x_apart_RS_train_reg = x_apart_RS_train[:, top10_idx]
x_apart_RS_test_reg = x_apart_RS_test[:, top10_idx]

In [None]:
# Linear regression on the selected columns; only test-set metrics are reported here.
reg = linear_model.LinearRegression().fit(x_apart_RS_train_reg, y_apart_RS_train)
apartment_y_pred = reg.predict(x_apart_RS_test_reg)
test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_RS_test)
test_r2 = reg.score(x_apart_RS_test_reg, y_apart_RS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
# Keep the 69 columns with the largest absolute lasso coefficient.
# NOTE(review): 69 is hard-coded - presumably the non-zero coefficient count
# shown by the coefficient-inspection cell; confirm after any re-run.
ranked_idx = np.argsort(np.absolute(lasso_coef))
x_apart_RS_train_reg = x_apart_RS_train[:, ranked_idx[-69:]]
x_apart_RS_test_reg = x_apart_RS_test[:, ranked_idx[-69:]]
x_columns = x_columns_all[ranked_idx][-69:]

In [None]:
# Linear regression on the selected columns: training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_apart_RS_train_reg, y_apart_RS_train)

apartment_y_pred = reg.predict(x_apart_RS_train_reg)
train_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_RS_train)
train_r2 = reg.score(x_apart_RS_train_reg, y_apart_RS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

apartment_y_pred = reg.predict(x_apart_RS_test_reg)
test_mse = metrics.mean_squared_error(apartment_y_pred, y_apart_RS_test)
test_r2 = reg.score(x_apart_RS_test_reg, y_apart_RS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
from yellowbrick.regressor import PredictionError

# Instantiate the linear model and visualizer
#lasso = Lasso()
visualizer = PredictionError(reg)

visualizer.fit(x_apart_RS_train_reg, y_apart_RS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_RS_test_reg, y_apart_RS_test)  # Evaluate the model on the test data
g = visualizer.poof()

### 7.1.4 Condominiums New Sale

In [None]:
# Lasso regression: sweep the regularisation strength and record, per alpha,
# the test MSE, test R^2, L1 norm of the coefficients and MSE + alpha * L1.
alphas = [0.01, 0.1, 1, 10, 100]
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(x_condo_NS_train, y_condo_NS_train)
    condo_y_pred = lasso.predict(x_condo_NS_test)
    test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_NS_test)
    l1_norm = np.linalg.norm(lasso.coef_, ord=1)
    MSE.append(test_mse)
    r2.append(lasso.score(x_condo_NS_test, y_condo_NS_test))
    complexity.append(l1_norm)
    cost.append(test_mse + (alpha * l1_norm))

In [None]:
print(MSE)
plt.plot(range(5), MSE)
plt.xticks(np.arange(5), alphas)

In [None]:
print(r2)
plt.plot(range(5), r2)
plt.xticks(np.arange(5), alphas)

In [None]:
print(complexity)
plt.plot(range(5), complexity)
plt.xticks(np.arange(5), alphas)

In [None]:
# Check variable importance for lasso regression
lasso.set_params(alpha = 1)
lasso.fit(x_condo_NS_train, y_condo_NS_train)
lasso.predict(x_condo_NS_test)
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
np.count_nonzero(lasso_coef)

In [None]:
# Try out top 10 features: the ten columns with the largest absolute lasso coefficient.
top10_idx = np.argsort(np.absolute(lasso_coef))[-10:]
x_condo_NS_train_reg = x_condo_NS_train[:, top10_idx]
x_condo_NS_test_reg = x_condo_NS_test[:, top10_idx]

In [None]:
# Linear regression with 10 features: report training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_condo_NS_train_reg, y_condo_NS_train)

condo_y_pred = reg.predict(x_condo_NS_train_reg)
train_mse = metrics.mean_squared_error(condo_y_pred, y_condo_NS_train)
train_r2 = reg.score(x_condo_NS_train_reg, y_condo_NS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

condo_y_pred = reg.predict(x_condo_NS_test_reg)
test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_NS_test)
test_r2 = reg.score(x_condo_NS_test_reg, y_condo_NS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
# Try out all selected features.
# NOTE(review): 36 is hard-coded - presumably the non-zero coefficient count
# shown by the preceding cell; confirm after any re-run.
ranked_idx = np.argsort(np.absolute(lasso_coef))
x_condo_NS_train_reg = x_condo_NS_train[:, ranked_idx[-36:]]
x_condo_NS_test_reg = x_condo_NS_test[:, ranked_idx[-36:]]
x_columns = x_columns_all[ranked_idx][-36:]

In [None]:
# Linear regression with all selected features: training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_condo_NS_train_reg, y_condo_NS_train)

condo_y_pred = reg.predict(x_condo_NS_train_reg)
train_mse = metrics.mean_squared_error(condo_y_pred, y_condo_NS_train)
train_r2 = reg.score(x_condo_NS_train_reg, y_condo_NS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

condo_y_pred = reg.predict(x_condo_NS_test_reg)
test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_NS_test)
test_r2 = reg.score(x_condo_NS_test_reg, y_condo_NS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
from yellowbrick.regressor import PredictionError

# Instantiate the linear model and visualizer
#lasso = Lasso()
visualizer = PredictionError(reg)

visualizer.fit(x_condo_NS_train_reg, y_condo_NS_train)  # Fit the training data to the visualizer
visualizer.score(x_condo_NS_test_reg, y_condo_NS_test)  # Evaluate the model on the test data
g = visualizer.poof()

### 7.1.5 Condominiums Resale

In [None]:
# Lasso regression: sweep the regularisation strength and record, per alpha,
# the test MSE, test R^2, L1 norm of the coefficients and MSE + alpha * L1.
alphas = [0.01, 0.1, 1, 10, 100]
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(x_condo_RS_train, y_condo_RS_train)
    condo_y_pred = lasso.predict(x_condo_RS_test)
    test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_RS_test)
    l1_norm = np.linalg.norm(lasso.coef_, ord=1)
    MSE.append(test_mse)
    r2.append(lasso.score(x_condo_RS_test, y_condo_RS_test))
    complexity.append(l1_norm)
    cost.append(test_mse + (alpha * l1_norm))

In [None]:
print(MSE)
plt.plot(range(5), MSE)
plt.xticks(np.arange(5), alphas)

In [None]:
print(r2)
plt.plot(range(5), r2)
plt.xticks(np.arange(5), alphas)

In [None]:
print(complexity)
plt.plot(range(5), complexity)
plt.xticks(np.arange(5), alphas)

In [None]:
# Check variable importance for lasso regression
lasso.set_params(alpha = 1)
lasso.fit(x_condo_RS_train, y_condo_RS_train)
lasso.predict(x_condo_RS_test)
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
np.count_nonzero(lasso_coef)

In [None]:
# Keep the ten columns with the largest absolute lasso coefficient.
top10_idx = np.argsort(np.absolute(lasso_coef))[-10:]
x_condo_RS_train_reg = x_condo_RS_train[:, top10_idx]
x_condo_RS_test_reg = x_condo_RS_test[:, top10_idx]

In [None]:
# Linear regression on the selected columns: training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_condo_RS_train_reg, y_condo_RS_train)

condo_y_pred = reg.predict(x_condo_RS_train_reg)
train_mse = metrics.mean_squared_error(condo_y_pred, y_condo_RS_train)
train_r2 = reg.score(x_condo_RS_train_reg, y_condo_RS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

condo_y_pred = reg.predict(x_condo_RS_test_reg)
test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_RS_test)
test_r2 = reg.score(x_condo_RS_test_reg, y_condo_RS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
# Keep the 51 columns with the largest absolute lasso coefficient.
# NOTE(review): 51 is hard-coded - presumably the non-zero coefficient count
# shown by the coefficient-inspection cell; confirm after any re-run.
top51_idx = np.argsort(np.absolute(lasso_coef))[-51:]
x_condo_RS_train_reg = x_condo_RS_train[:, top51_idx]
x_condo_RS_test_reg = x_condo_RS_test[:, top51_idx]

In [None]:
# Linear regression on the selected columns: training metrics, then test metrics.
reg = linear_model.LinearRegression().fit(x_condo_RS_train_reg, y_condo_RS_train)

condo_y_pred = reg.predict(x_condo_RS_train_reg)
train_mse = metrics.mean_squared_error(condo_y_pred, y_condo_RS_train)
train_r2 = reg.score(x_condo_RS_train_reg, y_condo_RS_train)
print("MSE: ", train_mse, "\nR^2: ", train_r2, sep="")

condo_y_pred = reg.predict(x_condo_RS_test_reg)
test_mse = metrics.mean_squared_error(condo_y_pred, y_condo_RS_test)
test_r2 = reg.score(x_condo_RS_test_reg, y_condo_RS_test)
print("MSE: ", test_mse, "\nR^2: ", test_r2, sep="")

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error plot for the linear regression `reg` fitted in the
# previous cell.
visualizer = PredictionError(reg)

visualizer.fit(x_condo_RS_train_reg, y_condo_RS_train)  # Fit the training data to the visualizer
visualizer.score(x_condo_RS_test_reg, y_condo_RS_test)  # Evaluate the model on the test data
g = visualizer.poof()  # Render the figure

### 7.1.6 Executive Condominiums New Sale

In [None]:
# Lasso regression
alphas = [0.01, 0.1, 1, 10, 100]

# One entry per alpha: test MSE, test R^2, L1 norm of the coefficients and
# the penalised cost (MSE + alpha * L1 norm).
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for a in alphas:
    lasso.set_params(alpha=a).fit(x_Econdo_NS_train, y_Econdo_NS_train)
    Econdo_y_pred = lasso.predict(x_Econdo_NS_test)

    error = metrics.mean_squared_error(Econdo_y_pred, y_Econdo_NS_test)
    c = np.linalg.norm(lasso.coef_, ord=1)

    MSE.append(error)
    r2.append(lasso.score(x_Econdo_NS_test, y_Econdo_NS_test))
    complexity.append(c)
    cost.append(error + a * c)

In [None]:
print(MSE)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(MSE)), MSE)
plt.xlabel("alpha")
plt.ylabel("Test MSE")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
print(r2)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(r2)), r2)
plt.xlabel("alpha")
plt.ylabel("Test R^2")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
print(complexity)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(complexity)), complexity)
plt.xlabel("alpha")
plt.ylabel("L1 norm of coefficients")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
# Check variable importance from lasso regression
# Refit at alpha = 1 and rank the features by absolute coefficient size
# (ascending, so the most influential features are printed last).
lasso.set_params(alpha = 1)
lasso.fit(x_Econdo_NS_train, y_Econdo_NS_train)
# The former lasso.predict(...) call was dropped: its result was discarded.
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
# Cell output: number of features with a non-zero coefficient.
np.count_nonzero(lasso_coef)

In [None]:
# Try out top 10 features
# (the 10 columns with the largest absolute lasso coefficients, selected
# identically for the train and test matrices)
x_Econdo_NS_train_reg, x_Econdo_NS_test_reg = (
    design[:, np.argsort(np.abs(lasso_coef))[-10:]]
    for design in (x_Econdo_NS_train, x_Econdo_NS_test)
)

In [None]:
# Linear regression with 10 features
reg = linear_model.LinearRegression()
reg.fit(x_Econdo_NS_train_reg, y_Econdo_NS_train)

# Fit quality on the training data.
Econdo_y_pred = reg.predict(x_Econdo_NS_train_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_NS_train),
      "\nR^2: ", reg.score(x_Econdo_NS_train_reg, y_Econdo_NS_train),
      sep="")

# Fit quality on the held-out test data.
Econdo_y_pred = reg.predict(x_Econdo_NS_test_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_NS_test),
      "\nR^2: ", reg.score(x_Econdo_NS_test_reg, y_Econdo_NS_test),
      sep="")

In [None]:
# Try out all selected features
# NOTE(review): 15 is presumably the non-zero coefficient count reported by
# the lasso cell above - TODO confirm.
x_Econdo_NS_train_reg, x_Econdo_NS_test_reg = (
    design[:, np.argsort(np.abs(lasso_coef))[-15:]]
    for design in (x_Econdo_NS_train, x_Econdo_NS_test)
)
# Names of the same 15 columns, in ascending order of |coefficient|.
x_columns = x_columns_all[np.argsort(np.abs(lasso_coef))][-15:]

In [None]:
# Linear regression with all selected features
reg = linear_model.LinearRegression()
reg.fit(x_Econdo_NS_train_reg, y_Econdo_NS_train)

# Fit quality on the training data.
Econdo_y_pred = reg.predict(x_Econdo_NS_train_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_NS_train),
      "\nR^2: ", reg.score(x_Econdo_NS_train_reg, y_Econdo_NS_train),
      sep="")

# Fit quality on the held-out test data.
Econdo_y_pred = reg.predict(x_Econdo_NS_test_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_NS_test),
      "\nR^2: ", reg.score(x_Econdo_NS_test_reg, y_Econdo_NS_test),
      sep="")

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error plot for the linear regression `reg` fitted in the
# previous cell.
visualizer = PredictionError(reg)

visualizer.fit(x_Econdo_NS_train_reg, y_Econdo_NS_train)  # Fit the training data to the visualizer
visualizer.score(x_Econdo_NS_test_reg, y_Econdo_NS_test)  # Evaluate the model on the test data
g = visualizer.poof()  # Render the figure

### 7.1.7 Executive Condominiums Resale

In [None]:
# Lasso regression
alphas = [0.01, 0.1, 1, 10, 100]

# One entry per alpha: test MSE, test R^2, L1 norm of the coefficients and
# the penalised cost (MSE + alpha * L1 norm).
MSE, r2, complexity, cost = [], [], [], []
lasso = linear_model.Lasso()

for a in alphas:
    lasso.set_params(alpha=a).fit(x_Econdo_RS_train, y_Econdo_RS_train)
    Econdo_y_pred = lasso.predict(x_Econdo_RS_test)

    error = metrics.mean_squared_error(Econdo_y_pred, y_Econdo_RS_test)
    c = np.linalg.norm(lasso.coef_, ord=1)

    MSE.append(error)
    r2.append(lasso.score(x_Econdo_RS_test, y_Econdo_RS_test))
    complexity.append(c)
    cost.append(error + a * c)

In [None]:
print(MSE)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(MSE)), MSE)
plt.xlabel("alpha")
plt.ylabel("Test MSE")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
print(r2)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(r2)), r2)
plt.xlabel("alpha")
plt.ylabel("Test R^2")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
print(complexity)
# Tick positions are derived from the data rather than a hard-coded 5, so the
# plot keeps working if the alpha grid is changed.
plt.plot(range(len(complexity)), complexity)
plt.xlabel("alpha")
plt.ylabel("L1 norm of coefficients")
plt.xticks(np.arange(len(alphas)), alphas)

In [None]:
# Check variable importance for lasso regression
# Refit at alpha = 1 and rank the features by absolute coefficient size
# (ascending, so the most influential features are printed last).
lasso.set_params(alpha = 1)
lasso.fit(x_Econdo_RS_train, y_Econdo_RS_train)
# The former lasso.predict(...) call was dropped: its result was discarded.
lasso_coef = lasso.coef_
print(x_columns_all[np.argsort(np.absolute(lasso_coef))])
print(np.sort(np.absolute(lasso_coef)))
# Cell output: number of features with a non-zero coefficient.
np.count_nonzero(lasso_coef)

In [None]:
# Keep the 10 columns whose lasso coefficients are largest in absolute value,
# applying the same column selection to the train and test matrices.
x_Econdo_RS_train_reg, x_Econdo_RS_test_reg = (
    design[:, np.argsort(np.abs(lasso_coef))[-10:]]
    for design in (x_Econdo_RS_train, x_Econdo_RS_test)
)

In [None]:
# Linear regression
reg = linear_model.LinearRegression()
reg.fit(x_Econdo_RS_train_reg, y_Econdo_RS_train)

# Fit quality on the training data.
Econdo_y_pred = reg.predict(x_Econdo_RS_train_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_RS_train),
      "\nR^2: ", reg.score(x_Econdo_RS_train_reg, y_Econdo_RS_train),
      sep="")

# Fit quality on the held-out test data.
Econdo_y_pred = reg.predict(x_Econdo_RS_test_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_RS_test),
      "\nR^2: ", reg.score(x_Econdo_RS_test_reg, y_Econdo_RS_test),
      sep="")

In [None]:
# Keep the 16 columns whose lasso coefficients are largest in absolute value.
# NOTE(review): 16 is presumably the non-zero coefficient count reported by the
# lasso cell above - TODO confirm.
x_Econdo_RS_train_reg, x_Econdo_RS_test_reg = (
    design[:, np.argsort(np.abs(lasso_coef))[-16:]]
    for design in (x_Econdo_RS_train, x_Econdo_RS_test)
)

In [None]:
# Linear regression
reg = linear_model.LinearRegression()
reg.fit(x_Econdo_RS_train_reg, y_Econdo_RS_train)

# Fit quality on the training data.
Econdo_y_pred = reg.predict(x_Econdo_RS_train_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_RS_train),
      "\nR^2: ", reg.score(x_Econdo_RS_train_reg, y_Econdo_RS_train),
      sep="")

# Fit quality on the held-out test data.
Econdo_y_pred = reg.predict(x_Econdo_RS_test_reg)
print("MSE: ", metrics.mean_squared_error(Econdo_y_pred, y_Econdo_RS_test),
      "\nR^2: ", reg.score(x_Econdo_RS_test_reg, y_Econdo_RS_test),
      sep="")

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error plot for the linear regression `reg` fitted in the
# previous cell.
visualizer = PredictionError(reg)

visualizer.fit(x_Econdo_RS_train_reg, y_Econdo_RS_train)  # Fit the training data to the visualizer
visualizer.score(x_Econdo_RS_test_reg, y_Econdo_RS_test)  # Evaluate the model on the test data
g = visualizer.poof()  # Render the figure

## 7.2 Decision Tree

In [None]:
# Rebind every train/test feature matrix to its saved `*_original` counterpart
# before fitting the tree-based models. The names are rebound, not copied.
# NOTE(review): presumably this undoes preprocessing applied for the linear
# models in section 7.1 - TODO confirm against the cell that creates the
# `*_original` variables.
x_apart_NS_train = x_apart_NS_train_original
x_apart_NS_test = x_apart_NS_test_original

x_apart_RS_train =x_apart_RS_train_original
x_apart_RS_test =x_apart_RS_test_original

x_condo_NS_train =x_condo_NS_train_original
# NOTE(review): "orinigal" is misspelled here. Confirm the variable is defined
# with this exact spelling, otherwise this line raises NameError.
x_condo_NS_test =x_condo_NS_test_orinigal

x_condo_RS_train =x_condo_RS_train_original
x_condo_RS_test =x_condo_RS_test_original

# NOTE(review): "orinigal" is misspelled here as well - same check applies.
x_Econdo_NS_train =x_Econdo_NS_train_orinigal
x_Econdo_NS_test =x_Econdo_NS_test_original

x_Econdo_RS_train =x_Econdo_RS_train_original
x_Econdo_RS_test =x_Econdo_RS_test_original

### 7.2.1 Apartment New Sale

In [None]:
from sklearn import tree

# Fit a decision tree on the apartment new-sale training data and report the
# R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0)
regressor.fit(x_apart_NS_train, y_apart_NS_train)

R_Square = regressor.score(x_apart_NS_test, y_apart_NS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_apart_NS_train)
m, n = x_apart_NS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_apart_NS_test,
    y_pred=regressor.predict(x_apart_NS_test),
)

### 7.2.2 Apartment Resale

In [None]:
from sklearn import tree

# Fit a decision tree on the apartment resale training data and report the
# R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0, min_samples_split=20)
regressor.fit(x_apart_RS_train, y_apart_RS_train)

R_Square = regressor.score(x_apart_RS_test, y_apart_RS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_apart_RS_train)
m, n = x_apart_RS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_apart_RS_test,
    y_pred=regressor.predict(x_apart_RS_test),
)

### 7.2.3 Condo New Sale

In [None]:
from sklearn import tree

# Fit a decision tree on the condo new-sale training data and report the
# R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0)
regressor.fit(x_condo_NS_train, y_condo_NS_train)

R_Square = regressor.score(x_condo_NS_test, y_condo_NS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_condo_NS_train)
m, n = x_condo_NS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_condo_NS_test,
    y_pred=regressor.predict(x_condo_NS_test),
)

### 7.2.4 Condo Resale

In [None]:
from sklearn import tree

# Fit a decision tree on the condo resale training data and report the
# R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0, min_samples_split=30)
regressor.fit(x_condo_RS_train, y_condo_RS_train)

R_Square = regressor.score(x_condo_RS_test, y_condo_RS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_condo_RS_train)
m, n = x_condo_RS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_condo_RS_test,
    y_pred=regressor.predict(x_condo_RS_test),
)

### 7.2.5 Econdo New Sale

In [None]:
from sklearn import tree

# Fit a decision tree on the executive condo new-sale training data and
# report the R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0, min_samples_split=20)
regressor.fit(x_Econdo_NS_train, y_Econdo_NS_train)

R_Square = regressor.score(x_Econdo_NS_test, y_Econdo_NS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_Econdo_NS_train)
m, n = x_Econdo_NS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_Econdo_NS_test,
    y_pred=regressor.predict(x_Econdo_NS_test),
)

### 7.2.6 Econdo Resale

In [None]:
from sklearn import tree

# Fit a decision tree on the executive condo resale training data and report
# the R^2 on the test data.
regressor = tree.DecisionTreeRegressor(random_state=0, min_samples_split=30)
regressor.fit(x_Econdo_RS_train, y_Econdo_RS_train)

R_Square = regressor.score(x_Econdo_RS_test, y_Econdo_RS_test)
print("The R square is :", R_Square)
print("\n")

feature_importance = regressor.feature_importances_
all_features = list(x_Econdo_RS_train)
m, n = x_Econdo_RS_train.shape

# Build the importance table in one step instead of appending one row at a
# time with .loc, which is quadratic and leaves the columns as object dtype.
Important_Features = pd.DataFrame(
    {"Feature": all_features, "Importance": feature_importance},
    columns=["Feature", "Importance"],
)

# Cell output: the five most important features.
Important_Features_sorted = Important_Features.sort_values('Importance', ascending=False)
Important_Features_sorted.iloc[0:5, :]

In [None]:
from sklearn.metrics import mean_squared_error

# Cell output: test-set MSE of the decision tree fitted in the previous cell.
mean_squared_error(
    y_true=y_Econdo_RS_test,
    y_pred=regressor.predict(x_Econdo_RS_test),
)

## 7.3 Random Forest

### 7.3.1 Apartment New Sale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_apart_NS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_apart_NS.fit(x_apart_NS_train, y_apart_NS_train)

print("R2 squared for test")
print(regr_apart_NS.score(x_apart_NS_test, y_apart_NS_test))

print("R2 squared for train")
print(regr_apart_NS.score(x_apart_NS_train, y_apart_NS_train))
print("MSE Test", metrics.mean_squared_error(y_apart_NS_test, regr_apart_NS.predict(x_apart_NS_test)))
print("MSE Train", metrics.mean_squared_error(y_apart_NS_train, regr_apart_NS.predict(x_apart_NS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_apart_NS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_apart_NS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_apart_NS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_apart_NS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_apart_NS)
visualizer.fit(x_apart_NS_train, y_apart_NS_train)
visualizer.score(x_apart_NS_test, y_apart_NS_test)
g = visualizer.poof()

### 7.3.2 Apartment Resale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_apart_RS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_apart_RS.fit(x_apart_RS_train, y_apart_RS_train)

print("R2 squared for test")
print(regr_apart_RS.score(x_apart_RS_test, y_apart_RS_test))

print("R2 squared for train")
print(regr_apart_RS.score(x_apart_RS_train, y_apart_RS_train))
print("MSE Test", metrics.mean_squared_error(y_apart_RS_test, regr_apart_RS.predict(x_apart_RS_test)))
print("MSE Train", metrics.mean_squared_error(y_apart_RS_train, regr_apart_RS.predict(x_apart_RS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_apart_RS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_apart_RS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_apart_RS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_apart_RS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_apart_RS)
visualizer.fit(x_apart_RS_train, y_apart_RS_train)
visualizer.score(x_apart_RS_test, y_apart_RS_test)
g = visualizer.poof()

### 7.3.3 Condo New Sale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_condo_NS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_condo_NS.fit(x_condo_NS_train, y_condo_NS_train)

print("R2 squared for test")
print(regr_condo_NS.score(x_condo_NS_test, y_condo_NS_test))

print("R2 squared for train")
print(regr_condo_NS.score(x_condo_NS_train, y_condo_NS_train))
print("MSE Test", metrics.mean_squared_error(y_condo_NS_test, regr_condo_NS.predict(x_condo_NS_test)))
print("MSE Train", metrics.mean_squared_error(y_condo_NS_train, regr_condo_NS.predict(x_condo_NS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_condo_NS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_condo_NS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_condo_NS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_condo_NS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_condo_NS)
visualizer.fit(x_condo_NS_train, y_condo_NS_train)
visualizer.score(x_condo_NS_test, y_condo_NS_test)
g = visualizer.poof()

### 7.3.4 Condo Resale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_condo_RS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_condo_RS.fit(x_condo_RS_train, y_condo_RS_train)

print("R2 squared for test")
print(regr_condo_RS.score(x_condo_RS_test, y_condo_RS_test))

print("R2 squared for train")
print(regr_condo_RS.score(x_condo_RS_train, y_condo_RS_train))
print("MSE Test", metrics.mean_squared_error(y_condo_RS_test, regr_condo_RS.predict(x_condo_RS_test)))
print("MSE Train", metrics.mean_squared_error(y_condo_RS_train, regr_condo_RS.predict(x_condo_RS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_condo_RS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_condo_RS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_condo_RS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_condo_RS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_condo_RS)
visualizer.fit(x_condo_RS_train, y_condo_RS_train)
visualizer.score(x_condo_RS_test, y_condo_RS_test)
g = visualizer.poof()

### 7.3.5 Executive Condo New Sale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_Econdo_NS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_Econdo_NS.fit(x_Econdo_NS_train, y_Econdo_NS_train)

print("R2 squared for test")
print(regr_Econdo_NS.score(x_Econdo_NS_test, y_Econdo_NS_test))

print("R2 squared for train")
print(regr_Econdo_NS.score(x_Econdo_NS_train, y_Econdo_NS_train))
print("MSE Test", metrics.mean_squared_error(y_Econdo_NS_test, regr_Econdo_NS.predict(x_Econdo_NS_test)))
print("MSE Train", metrics.mean_squared_error(y_Econdo_NS_train, regr_Econdo_NS.predict(x_Econdo_NS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_Econdo_NS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_Econdo_NS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_Econdo_NS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_Econdo_NS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_Econdo_NS)
visualizer.fit(x_Econdo_NS_train, y_Econdo_NS_train)
visualizer.score(x_Econdo_NS_test, y_Econdo_NS_test)
g = visualizer.poof()

### 7.3.6 Executive Condo Resale

In [None]:
# Only the hyper-parameters that are actually being chosen are passed. The
# arguments removed here were all equal to the library defaults, and three of
# them (criterion='mse', max_features='auto', min_impurity_split) are rejected
# by newer scikit-learn releases, so dropping them keeps the same model while
# letting the cell run on both old and new versions.
regr_Econdo_RS = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
)
regr_Econdo_RS.fit(x_Econdo_RS_train, y_Econdo_RS_train)

print("R2 squared for test")
print(regr_Econdo_RS.score(x_Econdo_RS_test, y_Econdo_RS_test))

print("R2 squared for train")
print(regr_Econdo_RS.score(x_Econdo_RS_train, y_Econdo_RS_train))
print("MSE Test", metrics.mean_squared_error(y_Econdo_RS_test, regr_Econdo_RS.predict(x_Econdo_RS_test)))
print("MSE Train", metrics.mean_squared_error(y_Econdo_RS_train, regr_Econdo_RS.predict(x_Econdo_RS_train)))

# Forest-level importances, their spread across the individual trees, and the
# feature indices ordered from most to least important.
importances = regr_Econdo_RS.feature_importances_
std = np.std([est.feature_importances_ for est in regr_Econdo_RS.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking as (rounded importance, name) pairs, largest first
print("Feature ranking:")

names = x_Econdo_RS_train.columns
print(sorted(zip(map(lambda z: round(z, 4), regr_Econdo_RS.feature_importances_), names),
             reverse=True))

# Prediction-error plot: fit on the training data, score on the test data.
visualizer = PredictionError(regr_Econdo_RS)
visualizer.fit(x_Econdo_RS_train, y_Econdo_RS_train)
visualizer.score(x_Econdo_RS_test, y_Econdo_RS_test)
g = visualizer.poof()

## 7.4 Boosting - GBR

In [None]:
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.metrics import mean_squared_error
import time
from mpl_toolkits.axes_grid1.parasite_axes import host_subplot
from mpl_toolkits.axisartist.axislines import Axes
from scipy.sparse.csr import csr_matrix
from sklearn.utils import shuffle
from sklearn.svm.classes import NuSVR
import xgboost as xgb

### 7.4.1 Apartment New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For every value in ``conf['changing_param_values']`` a new estimator is
    built from ``conf['tuned_params']`` with ``conf['changing_param']`` set to
    that value, fitted on the training data, and then timed and scored on the
    test data. A progress line is printed for each value.

    Parameters
    ----------
    conf : dict
        Benchmark configuration. Keys read here: 'estimator' (callable that
        builds the model from keyword arguments), 'tuned_params',
        'changing_param', 'changing_param_values', 'x_train', 'y_train',
        'x_test', 'y_test', 'postfit_hook', 'complexity_computer',
        'prediction_performance_computer' and 'prediction_performance_label'.
        ``conf['tuned_params']`` is updated in place and ends up holding the
        last swept value.

    Returns
    -------
    tuple of three lists
        ``(prediction_powers, prediction_times, complexities)`` with one entry
        per swept value; the times are wall-clock seconds spent in
        ``predict``.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        conf['tuned_params'][conf['changing_param']] = param_value
        estimator = conf['estimator'](**conf['tuned_params'])
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # Only the predict call is timed; fitting is excluded on purpose.
        start_time = time.time()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = (time.time() - start_time)
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        # time.time() differences are seconds, so the unit suffix is "s"
        # (it was previously printed as "m").
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Plot influence of model complexity on both accuracy and latency.

    Draws the prediction error (left axis, blue) and the prediction time
    (right axis, red) against ``complexities`` on one shared x axis and shows
    the figure. Labels and the title are taken from ``conf``
    ('complexity_label', 'prediction_performance_label', 'estimator').
    """
    plt.figure(figsize=(12, 6))
    error_axis = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_axis = error_axis.twinx()

    error_axis.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_axis.set_ylabel(conf['prediction_performance_label'])
    latency_axis.set_ylabel("Time (s)")

    error_line, = error_axis.plot(complexities, mse_values, 'b-',
                                  label="prediction error")
    latency_line, = latency_axis.plot(complexities, prediction_times, 'r-',
                                      label="latency")
    error_axis.legend(loc='upper right')

    # Colour each y-axis label like the curve it belongs to.
    error_axis.axis["left"].label.set_color(error_line.get_color())
    latency_axis.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    """Return the number of non-zero entries in ``estimator.coef_``.

    Accepts both sparse coefficients (anything exposing ``toarray()``) and
    plain dense arrays; previously a dense ``coef_`` raised AttributeError.
    """
    coef = estimator.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    return np.count_nonzero(coef)

# Sweep the number of boosting stages for the apartment new-sale data and
# plot test MSE and prediction latency against it.
configurations = [
    dict(
        estimator=gbr,
        tuned_params=dict(loss='ls'),
        changing_param='n_estimators',
        changing_param_values=[100, 1000, 2000, 4000, 6000, 8000, 10000],
        complexity_label='n_trees',
        complexity_computer=lambda x: x.n_estimators,
        x_train=x_apart_NS_train,
        y_train=y_apart_NS_train,
        x_test=x_apart_NS_test,
        y_test=y_apart_NS_test,
        postfit_hook=lambda x: x,
        prediction_performance_computer=mean_squared_error,
        prediction_performance_label='MSE',
    ),
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = (
        benchmark_influence(conf)
    )
    plot_influence(
        conf, prediction_performances, prediction_times, complexities
    )


In [None]:
# Final model with the n_estimators value chosen from the sweep above.
# random_state is pinned (as in the decision tree and random forest cells) so
# that re-running the notebook reproduces the same fitted model.
gbr_apart_NS = ensemble.GradientBoostingRegressor(n_estimators=8000, random_state=0)

gbr_apart_NS.fit(x_apart_NS_train, y_apart_NS_train)

In [None]:
# R^2 on the test and training data (score() weights samples uniformly by
# default, so sample_weight is not spelled out).
print("R2 for test", gbr_apart_NS.score(x_apart_NS_test, y_apart_NS_test))
print("R2 for train", gbr_apart_NS.score(x_apart_NS_train, y_apart_NS_train))

# Mean squared error on the same two splits.
print("MSE Test",
      metrics.mean_squared_error(y_apart_NS_test,
                                 gbr_apart_NS.predict(x_apart_NS_test)))
print("MSE Train",
      metrics.mean_squared_error(y_apart_NS_train,
                                 gbr_apart_NS.predict(x_apart_NS_train)))


In [None]:
names = x_apart_NS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Cell output: the ten (rounded importance, feature name) pairs with the
# highest importance, largest first.
pd.DataFrame(
    sorted(
        ((round(importance, 4), feature)
         for importance, feature in zip(gbr_apart_NS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)


### 7.4.2 Apartment Resale

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    Builds, fits and scores one estimator per value in
    ``conf['changing_param_values']`` and prints a progress line for each.
    ``conf['tuned_params']`` is updated in place with the value being swept.

    Returns ``(prediction_powers, prediction_times, complexities)``, three
    lists with one entry per swept value; times are seconds spent in
    ``predict``.

    NOTE(review): this function is re-defined in each 7.4.x section; consider
    defining it once.
    """
    scores, latencies, sizes = [], [], []
    for value in conf['changing_param_values']:
        conf['tuned_params'][conf['changing_param']] = value
        model = conf['estimator'](**conf['tuned_params'])
        print("Benchmarking %s" % model)

        model.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](model)
        size = conf['complexity_computer'](model)
        sizes.append(size)

        # Only the predict call is timed.
        tic = time.time()
        predictions = model.predict(conf['x_test'])
        latency = time.time() - tic
        latencies.append(latency)

        score = conf['prediction_performance_computer'](conf['y_test'],
                                                        predictions)
        scores.append(score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            size, conf['prediction_performance_label'], score, latency))
    return scores, latencies, sizes


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Plot influence of model complexity on both accuracy and latency.

    Draws the prediction error (left axis, blue) and the prediction time
    (right axis, red) against ``complexities`` on one shared x axis and shows
    the figure. Labels and the title are taken from ``conf``
    ('complexity_label', 'prediction_performance_label', 'estimator').
    """
    plt.figure(figsize=(12, 6))
    error_axis = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_axis = error_axis.twinx()

    error_axis.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_axis.set_ylabel(conf['prediction_performance_label'])
    latency_axis.set_ylabel("Time (s)")

    error_line, = error_axis.plot(complexities, mse_values, 'b-',
                                  label="prediction error")
    latency_line, = latency_axis.plot(complexities, prediction_times, 'r-',
                                      label="latency")
    error_axis.legend(loc='upper right')

    # Colour each y-axis label like the curve it belongs to.
    error_axis.axis["left"].label.set_color(error_line.get_color())
    latency_axis.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    """Return the number of non-zero entries in ``estimator.coef_``.

    Accepts both sparse coefficients (anything exposing ``toarray()``) and
    plain dense arrays; previously a dense ``coef_`` raised AttributeError.
    """
    coef = estimator.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    return np.count_nonzero(coef)

# Sweep the number of boosting stages for the apartment resale data and plot
# test MSE and prediction latency against it.
configurations = [
    dict(
        estimator=gbr,
        tuned_params=dict(loss='ls'),
        changing_param='n_estimators',
        changing_param_values=[200, 400, 600, 800, 1000, 1200],
        complexity_label='n_trees',
        complexity_computer=lambda x: x.n_estimators,
        x_train=x_apart_RS_train,
        y_train=y_apart_RS_train,
        x_test=x_apart_RS_test,
        y_test=y_apart_RS_test,
        postfit_hook=lambda x: x,
        prediction_performance_computer=mean_squared_error,
        prediction_performance_label='MSE',
    ),
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = (
        benchmark_influence(conf)
    )
    plot_influence(
        conf, prediction_performances, prediction_times, complexities
    )


In [None]:
# Final model with the n_estimators value chosen from the sweep above.
# random_state is pinned (as in the decision tree and random forest cells) so
# that re-running the notebook reproduces the same fitted model.
gbr_apart_RS = ensemble.GradientBoostingRegressor(n_estimators=800, random_state=0)

gbr_apart_RS.fit(x_apart_RS_train, y_apart_RS_train)


In [None]:
# R^2 on the test and training data (score() weights samples uniformly by
# default, so sample_weight is not spelled out).
print("R2 for test", gbr_apart_RS.score(x_apart_RS_test, y_apart_RS_test))
print("R2 for train", gbr_apart_RS.score(x_apart_RS_train, y_apart_RS_train))

# Mean squared error on the same two splits.
print("MSE Test",
      metrics.mean_squared_error(y_apart_RS_test,
                                 gbr_apart_RS.predict(x_apart_RS_test)))
print("MSE Train",
      metrics.mean_squared_error(y_apart_RS_train,
                                 gbr_apart_RS.predict(x_apart_RS_train)))


In [None]:
names = x_apart_RS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Cell output: the ten (rounded importance, feature name) pairs with the
# highest importance, largest first.
pd.DataFrame(
    sorted(
        ((round(importance, 4), feature)
         for importance, feature in zip(gbr_apart_RS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)


### 7.4.3 Condo New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    Builds, fits and scores one estimator per value in
    ``conf['changing_param_values']`` and prints a progress line for each.
    ``conf['tuned_params']`` is updated in place with the value being swept.

    Returns ``(prediction_powers, prediction_times, complexities)``, three
    lists with one entry per swept value; times are seconds spent in
    ``predict``.

    NOTE(review): this function is re-defined in each 7.4.x section; consider
    defining it once.
    """
    scores, latencies, sizes = [], [], []
    for value in conf['changing_param_values']:
        conf['tuned_params'][conf['changing_param']] = value
        model = conf['estimator'](**conf['tuned_params'])
        print("Benchmarking %s" % model)

        model.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](model)
        size = conf['complexity_computer'](model)
        sizes.append(size)

        # Only the predict call is timed.
        tic = time.time()
        predictions = model.predict(conf['x_test'])
        latency = time.time() - tic
        latencies.append(latency)

        score = conf['prediction_performance_computer'](conf['y_test'],
                                                        predictions)
        scores.append(score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            size, conf['prediction_performance_label'], score, latency))
    return scores, latencies, sizes


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Plot influence of model complexity on both accuracy and latency.

    Draws the prediction error (left axis, blue) and the prediction time
    (right axis, red) against ``complexities`` on one shared x axis and shows
    the figure. Labels and the title are taken from ``conf``
    ('complexity_label', 'prediction_performance_label', 'estimator').
    """
    plt.figure(figsize=(12, 6))
    error_axis = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_axis = error_axis.twinx()

    error_axis.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_axis.set_ylabel(conf['prediction_performance_label'])
    latency_axis.set_ylabel("Time (s)")

    error_line, = error_axis.plot(complexities, mse_values, 'b-',
                                  label="prediction error")
    latency_line, = latency_axis.plot(complexities, prediction_times, 'r-',
                                      label="latency")
    error_axis.legend(loc='upper right')

    # Colour each y-axis label like the curve it belongs to.
    error_axis.axis["left"].label.set_color(error_line.get_color())
    latency_axis.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for the gradient-boosting regressor on the condo_NS
# split, then plot test MSE and prediction latency for every swept value.
configurations = [
    {'estimator': gbr,
     # No explicit loss: squared error is already the regressor's default,
     # and the legacy 'ls' alias is rejected by newer scikit-learn releases.
     # NOTE(review): assumes `gbr` is scikit-learn's GradientBoostingRegressor.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [100, 1000, 2000, 4000, 6000, 8000, 10000, 12000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_condo_NS_train,
     'y_train': y_condo_NS_train,
     'x_test': x_condo_NS_test,
     'y_test': y_condo_NS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)


In [None]:
# Final gradient-boosting model for the condo_NS split; 10000 trees is one of
# the sizes swept in the benchmark above.
gbr_condo_NS = ensemble.GradientBoostingRegressor(n_estimators=10000)

gbr_condo_NS.fit(X=x_condo_NS_train, y=y_condo_NS_train)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", gbr_condo_NS.score(x_condo_NS_test, y_condo_NS_test))
print("R2 for train", gbr_condo_NS.score(x_condo_NS_train, y_condo_NS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_condo_NS_test, y_pred=gbr_condo_NS.predict(x_condo_NS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_condo_NS_train, y_pred=gbr_condo_NS.predict(x_condo_NS_train)))

In [None]:
# Top-ten feature importances of the condo_NS gradient-boosting model.
names = x_condo_NS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(gbr_condo_NS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.4.4 Condo Resale

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for the gradient-boosting regressor on the condo_RS
# split, then plot test MSE and prediction latency for every swept value.
configurations = [
    {'estimator': gbr,
     # No explicit loss: squared error is already the regressor's default,
     # and the legacy 'ls' alias is rejected by newer scikit-learn releases.
     # NOTE(review): assumes `gbr` is scikit-learn's GradientBoostingRegressor.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [1000,3000,5000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_condo_RS_train,
     'y_train': y_condo_RS_train,
     'x_test': x_condo_RS_test,
     'y_test': y_condo_RS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final gradient-boosting model for the condo_RS split; 3000 trees is one of
# the sizes swept in the benchmark above.
gbr_condo_RS = ensemble.GradientBoostingRegressor(n_estimators=3000)

gbr_condo_RS.fit(X=x_condo_RS_train, y=y_condo_RS_train)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", gbr_condo_RS.score(x_condo_RS_test, y_condo_RS_test))
print("R2 for train", gbr_condo_RS.score(x_condo_RS_train, y_condo_RS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_condo_RS_test, y_pred=gbr_condo_RS.predict(x_condo_RS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_condo_RS_train, y_pred=gbr_condo_RS.predict(x_condo_RS_train)))

In [None]:
# Top-ten feature importances of the condo_RS gradient-boosting model.
names = x_condo_RS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(gbr_condo_RS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.4.5 Econdo New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for the gradient-boosting regressor on the Econdo_NS
# split, then plot test MSE and prediction latency for every swept value.
configurations = [
    {'estimator': gbr,
     # No explicit loss: squared error is already the regressor's default,
     # and the legacy 'ls' alias is rejected by newer scikit-learn releases.
     # NOTE(review): assumes `gbr` is scikit-learn's GradientBoostingRegressor.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [1000,3000,5000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_Econdo_NS_train,
     'y_train': y_Econdo_NS_train,
     'x_test': x_Econdo_NS_test,
     'y_test': y_Econdo_NS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final gradient-boosting model for the Econdo_NS split; 3000 trees is one of
# the sizes swept in the benchmark above.
gbr_Econdo_NS = ensemble.GradientBoostingRegressor(n_estimators=3000)

gbr_Econdo_NS.fit(X=x_Econdo_NS_train, y=y_Econdo_NS_train)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", gbr_Econdo_NS.score(x_Econdo_NS_test, y_Econdo_NS_test))
print("R2 for train", gbr_Econdo_NS.score(x_Econdo_NS_train, y_Econdo_NS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_Econdo_NS_test, y_pred=gbr_Econdo_NS.predict(x_Econdo_NS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_Econdo_NS_train, y_pred=gbr_Econdo_NS.predict(x_Econdo_NS_train)))

In [None]:
# Top-ten feature importances of the Econdo_NS gradient-boosting model.
names = x_Econdo_NS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(gbr_Econdo_NS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.4.6 Econdo Resale

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for the gradient-boosting regressor on the Econdo_RS
# split, then plot test MSE and prediction latency for every swept value.
configurations = [
    {'estimator': gbr,
     # No explicit loss: squared error is already the regressor's default,
     # and the legacy 'ls' alias is rejected by newer scikit-learn releases.
     # NOTE(review): assumes `gbr` is scikit-learn's GradientBoostingRegressor.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [140,160,180,200,220,240,260,280,300],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_Econdo_RS_train,
     'y_train': y_Econdo_RS_train,
     'x_test': x_Econdo_RS_test,
     'y_test': y_Econdo_RS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final gradient-boosting model for the Econdo_RS split; 220 trees is one of
# the sizes swept in the benchmark above.
gbr_Econdo_RS = ensemble.GradientBoostingRegressor(n_estimators=220)

gbr_Econdo_RS.fit(X=x_Econdo_RS_train, y=y_Econdo_RS_train)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", gbr_Econdo_RS.score(x_Econdo_RS_test, y_Econdo_RS_test))
print("R2 for train", gbr_Econdo_RS.score(x_Econdo_RS_train, y_Econdo_RS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_Econdo_RS_test, y_pred=gbr_Econdo_RS.predict(x_Econdo_RS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_Econdo_RS_train, y_pred=gbr_Econdo_RS.predict(x_Econdo_RS_train)))

In [None]:
# Top-ten feature importances of the Econdo_RS gradient-boosting model.
names = x_Econdo_RS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(gbr_Econdo_RS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

## 7.5 Boosting - XGB

### 7.5.1 Apartment New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for XGBRegressor on the apart_NS split, then plot test
# MSE and prediction latency for every swept value.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # XGBRegressor has no `loss` parameter ('ls' is a scikit-learn
     # gradient-boosting option); its default objective is already squared
     # error, so no fixed parameters are needed.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000,4200],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_apart_NS_train,
     'y_train': y_apart_NS_train,
     'x_test': x_apart_NS_test,
     'y_test': y_apart_NS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final XGBoost model for the apart_NS split; 3600 trees is one of the sizes
# swept in the benchmark above.
xg_reg_apart_NS = xgb.XGBRegressor(n_estimators=3600)

xg_reg_apart_NS.fit(X=x_apart_NS_train, y=y_apart_NS_train)

# Test-split predictions of the fitted model.
preds_apart_NS = xg_reg_apart_NS.predict(x_apart_NS_test)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", xg_reg_apart_NS.score(x_apart_NS_test, y_apart_NS_test))
print("R2 for train", xg_reg_apart_NS.score(x_apart_NS_train, y_apart_NS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_apart_NS_test, y_pred=xg_reg_apart_NS.predict(x_apart_NS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_apart_NS_train, y_pred=xg_reg_apart_NS.predict(x_apart_NS_train)))

In [None]:
# Top-ten feature importances of the apart_NS XGBoost model.
names = x_apart_NS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(xg_reg_apart_NS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.5.2 Apartment Resale

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for XGBRegressor on the apart_RS split, then plot test
# MSE and prediction latency for every swept value.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # XGBRegressor has no `loss` parameter ('ls' is a scikit-learn
     # gradient-boosting option); its default objective is already squared
     # error, so no fixed parameters are needed.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_apart_RS_train,
     'y_train': y_apart_RS_train,
     'x_test': x_apart_RS_test,
     'y_test': y_apart_RS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final XGBoost model for the apart_RS split; 600 trees is one of the sizes
# swept in the benchmark above.
xg_reg_apart_RS = xgb.XGBRegressor(n_estimators=600)

xg_reg_apart_RS.fit(X=x_apart_RS_train, y=y_apart_RS_train)

# Test-split predictions of the fitted model.
preds_apart_RS = xg_reg_apart_RS.predict(x_apart_RS_test)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", xg_reg_apart_RS.score(x_apart_RS_test, y_apart_RS_test))
print("R2 for train", xg_reg_apart_RS.score(x_apart_RS_train, y_apart_RS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_apart_RS_test, y_pred=xg_reg_apart_RS.predict(x_apart_RS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_apart_RS_train, y_pred=xg_reg_apart_RS.predict(x_apart_RS_train)))

In [None]:
# Top-ten feature importances of the apart_RS XGBoost model.
names = x_apart_RS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(xg_reg_apart_RS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.5.3 Condo - New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for XGBRegressor on the condo_NS split, then plot test
# MSE and prediction latency for every swept value.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # XGBRegressor has no `loss` parameter ('ls' is a scikit-learn
     # gradient-boosting option); its default objective is already squared
     # error, so no fixed parameters are needed.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [100, 1000, 2000, 4000, 6000, 8000, 10000, 12000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_condo_NS_train,
     'y_train': y_condo_NS_train,
     'x_test': x_condo_NS_test,
     'y_test': y_condo_NS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final XGBoost model for the condo_NS split; 8000 trees is one of the sizes
# swept in the benchmark above.
xg_reg_condo_NS = xgb.XGBRegressor(n_estimators=8000)

xg_reg_condo_NS.fit(X=x_condo_NS_train, y=y_condo_NS_train)

# Test-split predictions of the fitted model.
preds_condo_NS = xg_reg_condo_NS.predict(x_condo_NS_test)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", xg_reg_condo_NS.score(x_condo_NS_test, y_condo_NS_test))
print("R2 for train", xg_reg_condo_NS.score(x_condo_NS_train, y_condo_NS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_condo_NS_test, y_pred=xg_reg_condo_NS.predict(x_condo_NS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_condo_NS_train, y_pred=xg_reg_condo_NS.predict(x_condo_NS_train)))

In [None]:
# Top-ten feature importances of the condo_NS XGBoost model.
names = x_condo_NS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(xg_reg_condo_NS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.5.4 Condo - Resales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a fresh estimator is
    built from ``conf['tuned_params']`` plus the swept parameter, fitted on
    the training split and scored on the test split.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator' (callable that
        builds the model), 'tuned_params', 'changing_param',
        'changing_param_values', 'x_train', 'y_train', 'x_test', 'y_test',
        'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.

    Returns
    -------
    tuple of (list, list, list)
        Prediction scores, prediction times in seconds and complexities,
        one entry per swept value, in sweep order.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a copy so the caller's conf['tuned_params'] is not left
        # holding the last swept value after the benchmark.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter() is a monotonic high-resolution clock, so the measured
        # latency cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    The error curve is drawn on the left axis and the latency curve on a
    twinned right axis; each axis label takes the colour of its own curve.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # Each plot() call draws a single curve; keep its line handle.
    error_line = error_ax.plot(complexities, mse_values, 'b-',
                               label="prediction error")[0]
    latency_line = latency_ax.plot(complexities, prediction_times, 'r-',
                                   label="latency")[0]
    error_ax.legend(loc='upper right')

    # Tint each y-axis label to match the curve it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep n_estimators for XGBRegressor on the condo_RS split, then plot test
# MSE and prediction latency for every swept value.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # XGBRegressor has no `loss` parameter ('ls' is a scikit-learn
     # gradient-boosting option); its default objective is already squared
     # error, so no fixed parameters are needed.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [100, 1000, 2000, 4000, 6000, 8000, 10000, 12000, 14000],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_condo_RS_train,
     'y_train': y_condo_RS_train,
     'x_test': x_condo_RS_test,
     'y_test': y_condo_RS_test,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times, complexities)

In [None]:
# Final XGBoost model for the condo_RS split; 2000 trees is one of the sizes
# swept in the benchmark above.
xg_reg_condo_RS = xgb.XGBRegressor(n_estimators=2000)

xg_reg_condo_RS.fit(X=x_condo_RS_train, y=y_condo_RS_train)

# Test-split predictions of the fitted model.
preds_condo_RS = xg_reg_condo_RS.predict(x_condo_RS_test)

In [None]:
# R^2 on the test and training splits (score() uses unweighted samples by default).
print("R2 for test", xg_reg_condo_RS.score(x_condo_RS_test, y_condo_RS_test))
print("R2 for train", xg_reg_condo_RS.score(x_condo_RS_train, y_condo_RS_train))

# Mean squared error on the same two splits.
print("MSE Test", metrics.mean_squared_error(
    y_true=y_condo_RS_test, y_pred=xg_reg_condo_RS.predict(x_condo_RS_test)))
print("MSE Train", metrics.mean_squared_error(
    y_true=y_condo_RS_train, y_pred=xg_reg_condo_RS.predict(x_condo_RS_train)))

In [None]:
# Top-ten feature importances of the condo_RS XGBoost model.
names = x_condo_RS_train.columns

columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Sorting (rounded importance, name) pairs in reverse puts the largest
# importance first; equal importances fall back to the feature name.
pd.DataFrame(
    sorted(
        ((round(weight, 4), feature)
         for weight, feature in zip(xg_reg_condo_RS.feature_importances_, names)),
        reverse=True,
    )[:10],
    columns=columns,
    index=index,
)

### 7.5.5 ECondo - New Sales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a new estimator is
    built from ``conf['estimator']``, fitted on ``conf['x_train']`` /
    ``conf['y_train']``, and used to predict ``conf['x_test']``. The
    prediction call is timed and scored against ``conf['y_test']`` with
    ``conf['prediction_performance_computer']``.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator', 'tuned_params',
        'changing_param', 'changing_param_values', 'x_train', 'y_train',
        'x_test', 'y_test', 'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.
        The dict (including 'tuned_params') is not modified.

    Returns
    -------
    (prediction_powers, prediction_times, complexities) : tuple of lists
        One entry per swept value, in sweep order: the score, the elapsed
        prediction time in seconds, and the reported model complexity.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a per-run copy so the caller's 'tuned_params' dict is not
        # left holding the last swept value after the benchmark finishes.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # be too coarse to measure a short predict() call.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    Both series share the x axis (``complexities``); the error values use the
    left y axis and the latency values use a twinned right y axis. Axis
    labels and the title are taken from ``conf['complexity_label']``,
    ``conf['prediction_performance_label']`` and ``conf['estimator']``.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # One line per axis: blue for the error, red for the latency.
    (error_line,) = error_ax.plot(
        complexities, mse_values, 'b-', label="prediction error")
    (latency_line,) = latency_ax.plot(
        complexities, prediction_times, 'r-', label="latency")
    error_ax.legend(loc='upper right')

    # Colour each y-axis label like the line it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep the number of boosted trees for the Executive Condo - New Sales
# domain and record test MSE and prediction latency for each setting.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # No fixed overrides: the sweep uses the same library defaults as the
     # model trained afterwards. The former {'loss': 'ls'} entry is a
     # scikit-learn GradientBoostingRegressor option, not an XGBRegressor
     # parameter, so it has been dropped.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [100, 500, 1000, 1500, 2000, 2500, 3000, 3500],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_Econdo_NS_train,
     'y_train': y_Econdo_NS_train,
     'x_test': x_Econdo_NS_test,
     'y_test': y_Econdo_NS_test,
     'postfit_hook': lambda x: x,  # nothing to do after fitting
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = \
        benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times,
                   complexities)

In [None]:
# Executive Condo - New Sales model: 3000 boosted trees, every other
# hyper-parameter left at the library default. ``fit`` returns the fitted
# estimator itself.
xg_reg_Econdo_NS = xgb.XGBRegressor(n_estimators=3000).fit(
    x_Econdo_NS_train, y_Econdo_NS_train
)

# Predictions on the held-out test split.
preds_Econdo_NS = xg_reg_Econdo_NS.predict(x_Econdo_NS_test)

In [None]:
# Collect (label, value) pairs first, then print them in the same order:
# R2 on test/train followed by MSE on test/train.
_Econdo_NS_metrics = [
    ("R2 for test", xg_reg_Econdo_NS.score(x_Econdo_NS_test, y_Econdo_NS_test)),
    ("R2 for train", xg_reg_Econdo_NS.score(x_Econdo_NS_train, y_Econdo_NS_train)),
    ("MSE Test", metrics.mean_squared_error(
        y_Econdo_NS_test, xg_reg_Econdo_NS.predict(x_Econdo_NS_test))),
    ("MSE Train", metrics.mean_squared_error(
        y_Econdo_NS_train, xg_reg_Econdo_NS.predict(x_Econdo_NS_train))),
]
for _metric_label, _metric_value in _Econdo_NS_metrics:
    print(_metric_label, _metric_value)

In [None]:
names = x_Econdo_NS_train.columns
columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Pair every importance (rounded to 4 d.p.) with its feature name, rank the
# pairs from highest to lowest and show the top ten as a table.
_Econdo_NS_ranked = [
    (round(score, 4), feature)
    for score, feature in zip(xg_reg_Econdo_NS.feature_importances_, names)
]
_Econdo_NS_ranked.sort(reverse=True)
pd.DataFrame(_Econdo_NS_ranked[0:10], columns=columns, index=index)

### 7.5.6 ECondo - Resales

In [None]:
def benchmark_influence(conf):
    """
    Benchmark influence of :changing_param: on both MSE and latency.

    For each value in ``conf['changing_param_values']`` a new estimator is
    built from ``conf['estimator']``, fitted on ``conf['x_train']`` /
    ``conf['y_train']``, and used to predict ``conf['x_test']``. The
    prediction call is timed and scored against ``conf['y_test']`` with
    ``conf['prediction_performance_computer']``.

    Parameters
    ----------
    conf : dict
        Benchmark description. Keys read here: 'estimator', 'tuned_params',
        'changing_param', 'changing_param_values', 'x_train', 'y_train',
        'x_test', 'y_test', 'postfit_hook', 'complexity_computer',
        'prediction_performance_computer', 'prediction_performance_label'.
        The dict (including 'tuned_params') is not modified.

    Returns
    -------
    (prediction_powers, prediction_times, complexities) : tuple of lists
        One entry per swept value, in sweep order: the score, the elapsed
        prediction time in seconds, and the reported model complexity.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        # Work on a per-run copy so the caller's 'tuned_params' dict is not
        # left holding the last swept value after the benchmark finishes.
        params = dict(conf['tuned_params'])
        params[conf['changing_param']] = param_value
        estimator = conf['estimator'](**params)
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['x_train'], conf['y_train'])
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # be too coarse to measure a short predict() call.
        start_time = time.perf_counter()
        y_pred = estimator.predict(conf['x_test'])
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities


def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Draw prediction error and prediction latency against model complexity.

    Both series share the x axis (``complexities``); the error values use the
    left y axis and the latency values use a twinned right y axis. Axis
    labels and the title are taken from ``conf['complexity_label']``,
    ``conf['prediction_performance_label']`` and ``conf['estimator']``.
    """
    plt.figure(figsize=(12, 6))
    error_ax = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    latency_ax = error_ax.twinx()

    # Axis captions.
    error_ax.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    error_ax.set_ylabel(conf['prediction_performance_label'])
    latency_ax.set_ylabel("Time (s)")

    # One line per axis: blue for the error, red for the latency.
    (error_line,) = error_ax.plot(
        complexities, mse_values, 'b-', label="prediction error")
    (latency_line,) = latency_ax.plot(
        complexities, prediction_times, 'r-', label="latency")
    error_ax.legend(loc='upper right')

    # Colour each y-axis label like the line it belongs to.
    error_ax.axis["left"].label.set_color(error_line.get_color())
    latency_ax.axis["right"].label.set_color(latency_line.get_color())

    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)

# Sweep the number of boosted trees for the Executive Condo - Resales
# domain and record test MSE and prediction latency for each setting.
configurations = [
    {'estimator': xgb.XGBRegressor,
     # No fixed overrides: the sweep uses the same library defaults as the
     # model trained afterwards. The former {'loss': 'ls'} entry is a
     # scikit-learn GradientBoostingRegressor option, not an XGBRegressor
     # parameter, so it has been dropped.
     'tuned_params': {},
     'changing_param': 'n_estimators',
     'changing_param_values': [60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'x_train': x_Econdo_RS_train,
     'y_train': y_Econdo_RS_train,
     'x_test': x_Econdo_RS_test,
     'y_test': y_Econdo_RS_test,
     'postfit_hook': lambda x: x,  # nothing to do after fitting
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE'},
]
for conf in configurations:
    prediction_performances, prediction_times, complexities = \
        benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times,
                   complexities)

In [None]:
# Executive Condo - Resales model: 260 boosted trees, every other
# hyper-parameter left at the library default. ``fit`` returns the fitted
# estimator itself.
xg_reg_Econdo_RS = xgb.XGBRegressor(n_estimators=260).fit(
    x_Econdo_RS_train, y_Econdo_RS_train
)

# Predictions on the held-out test split.
preds_Econdo_RS = xg_reg_Econdo_RS.predict(x_Econdo_RS_test)

In [None]:
# Collect (label, value) pairs first, then print them in the same order:
# R2 on test/train followed by MSE on test/train.
_Econdo_RS_metrics = [
    ("R2 for test", xg_reg_Econdo_RS.score(x_Econdo_RS_test, y_Econdo_RS_test)),
    ("R2 for train", xg_reg_Econdo_RS.score(x_Econdo_RS_train, y_Econdo_RS_train)),
    ("MSE Test", metrics.mean_squared_error(
        y_Econdo_RS_test, xg_reg_Econdo_RS.predict(x_Econdo_RS_test))),
    ("MSE Train", metrics.mean_squared_error(
        y_Econdo_RS_train, xg_reg_Econdo_RS.predict(x_Econdo_RS_train))),
]
for _metric_label, _metric_value in _Econdo_RS_metrics:
    print(_metric_label, _metric_value)

In [None]:
names = x_Econdo_RS_train.columns
columns = ["Importance", "Feature Name"]
index = [
    "1st Important Feature",
    "2nd Important Feature",
    "3rd Important Feature",
    "4th Important Feature",
    "5th Important Feature",
    "6th Important Feature",
    "7th Important Feature",
    "8th Important Feature",
    "9th Important Feature",
    "10th Important Feature",
]

# Pair every importance (rounded to 4 d.p.) with its feature name, rank the
# pairs from highest to lowest and show the top ten as a table.
_Econdo_RS_ranked = [
    (round(score, 4), feature)
    for score, feature in zip(xg_reg_Econdo_RS.feature_importances_, names)
]
_Econdo_RS_ranked.sort(reverse=True)
pd.DataFrame(_Econdo_RS_ranked[0:10], columns=columns, index=index)

# 8.0 Discussion on Overall Results

Out of the predictive models employed for each domain, we can deduce that XGBRegressor is generally the best-performing model, as it achieved the highest R2 score and the lowest MSE for all domains.

To determine how well the models fit the data, we need to take a statistical measure of how close each data point fits the regression line. By taking XGBRegressor from the Apartment – New Sales and Resales domains as an example, we can visualize its goodness-of-fit as shown in the diagrams below:

In [None]:
from yellowbrick.regressor import PredictionError

# Wrap the Apartment - New Sales regressor (xg_reg_apart_NS) in a
# yellowbrick prediction-error visualizer.
visualizer = PredictionError(xg_reg_apart_NS)

visualizer.fit(x_apart_NS_train, y_apart_NS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_NS_test, y_apart_NS_test)  # Evaluate the model on the test data
g = visualizer.poof()  # Draw/show the figure

In [None]:
from yellowbrick.regressor import PredictionError

# Wrap the Apartment - Resales regressor (xg_reg_apart_RS) in a
# yellowbrick prediction-error visualizer.
visualizer = PredictionError(xg_reg_apart_RS)

visualizer.fit(x_apart_RS_train, y_apart_RS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_RS_test, y_apart_RS_test)  # Evaluate the model on the test data
g = visualizer.poof()  # Draw/show the figure

In general, the higher the R2, the better the model fits the data. However, R2 alone is not sufficient to determine whether the predictions are biased. To check for bias, we used residual plots to analyse the variance of the error. A random dispersion of data points around the horizontal axis tells us that our model is performing well. Additionally, from the histogram on the right, we can also see that the error is normally distributed around zero, providing another indication of a well-fitted model.

In [None]:
from yellowbrick.regressor import ResidualsPlot
# Residuals plot for the Apartment - New Sales regressor (xg_reg_apart_NS),
# using the New Sales train/test splits.
visualizer = ResidualsPlot(xg_reg_apart_NS)

visualizer.fit(x_apart_NS_train, y_apart_NS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_NS_test, y_apart_NS_test)  # Evaluate the model on the test data
visualizer.poof()                 # Draw/show/poof the data

In [None]:
from yellowbrick.regressor import ResidualsPlot
# Residuals plot for the Apartment - Resales regressor (xg_reg_apart_RS).
# The visualizer has to be fitted and scored on the Resales splits that
# belong to the wrapped model; using the New Sales splits here would plot
# residuals for data from a different domain.
visualizer = ResidualsPlot(xg_reg_apart_RS)

visualizer.fit(x_apart_RS_train, y_apart_RS_train)  # Fit the training data to the visualizer
visualizer.score(x_apart_RS_test, y_apart_RS_test)  # Evaluate the model on the test data
visualizer.poof()                 # Draw/show/poof the data

# 9.0 Discussion on Important Features

## 9.1 Overall Important Features

Based on the results above, we can see that both Area (sqf) and Floor No (Final) consistently appear throughout with a relatively high importance score. However, in order to truly understand how these features affect our target variable, we aim to visualize these relationships to gain further insights.

### 9.1.1 Area (sqf)

In [None]:
# 2 x 3 grid: New Sales on the top row, Resales on the bottom row; columns
# are Apartment, Condo and Executive Condo.
fig, axes = plt.subplots(nrows=2, ncols=3)
fig.set_size_inches(20, 10)

# (features, target, title) for each panel, listed in row-major order so
# they line up with ``axes.flat``.
_area_panels = [
    (x_apart_NS, y_apart_NS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Apartment New Sales"),
    (x_condo_NS, y_condo_NS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Condo New Sales"),
    (x_Econdo_NS, y_Econdo_NS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Executive Condo New Sales"),
    (x_apart_RS, y_apart_RS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Apartment Resales"),
    (x_condo_RS, y_condo_RS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Condo Resales"),
    (x_Econdo_RS, y_Econdo_RS,
     "Reg Plot Of Unit Price ($psf) Against Area (sqf) for Executive Condo Resales"),
]

# One loop instead of six copy-pasted calls; a loop is also the last
# statement, so the cell shows only the figure and no stray repr output.
for _panel_ax, (_features, _target, _title) in zip(axes.flat, _area_panels):
    sns.regplot(x=_features["Area (sqf)"], y=_target, ax=_panel_ax)
    _panel_ax.set(xlabel='Area (sqf)', ylabel='Unit Price ($psf)', title=_title)


### 9.1.2 Floor No (Final)

In [None]:
# 2 x 3 grid: New Sales on the top row, Resales on the bottom row; columns
# are Apartment, Condo and Executive Condo.
fig, axes = plt.subplots(nrows=2, ncols=3)
fig.set_size_inches(20, 10)

# (features, target, title) for each panel, listed in row-major order so
# they line up with ``axes.flat``.
_floor_panels = [
    (x_apart_NS, y_apart_NS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Apartment New Sales"),
    (x_condo_NS, y_condo_NS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Condo New Sales"),
    (x_Econdo_NS, y_Econdo_NS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Executive Condo New Sales"),
    (x_apart_RS, y_apart_RS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Apartment Resales"),
    (x_condo_RS, y_condo_RS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Condo Resales"),
    (x_Econdo_RS, y_Econdo_RS,
     "Box Plot Of Unit Price ($psf) Against Floor No (Final) for Executive Condo Resales"),
]

# One loop instead of six copy-pasted box/reg plot pairs; a loop is also the
# last statement, so the cell shows only the figure and no stray repr output.
# NOTE(review): sns.boxplot positions boxes by category order while
# sns.regplot uses the numeric floor values, so the trend line only lines up
# with the boxes if the floor numbers are consecutive starting at 0 - confirm.
for _panel_ax, (_features, _target, _title) in zip(axes.flat, _floor_panels):
    _floors = _features["Floor No (Final)"]
    sns.boxplot(x=_floors, y=_target, ax=_panel_ax)
    # Overlay only the fitted trend line (no scatter points) on the boxes.
    sns.regplot(x=_floors, y=_target, scatter=False, ax=_panel_ax)
    _panel_ax.set(xlabel='Floor No (Final)', ylabel='Unit Price ($psf)', title=_title)


### 9.1.3 Distance to Public Transportation

Even though the features representing distance to public transportation did not appear in the top three most important features across all six domains, it is important to note that these features do appear to be common across all six domains among the top ten most important features. 

Based on the plot below, we can see that Unit Price ($psf) is inversely related to two features representing distance to public transport – Bus Distance (log) and MRT Distance (log). As the distance to bus stop and to MRT station decreases, the Unit Price ($psf) increases.

In [None]:
from __future__ import print_function
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.datasets.california_housing import fetch_california_housing


def main():
    """
    Plot a 3-D two-way partial dependence surface for Apartment - New Sales.

    A GradientBoostingRegressor is fitted on ``x_apart_NS_train`` /
    ``y_apart_NS_train`` and the joint partial dependence of its prediction
    on two feature columns is drawn as a surface, with the axes labelled by
    the corresponding column names.
    """
    x_train = x_apart_NS_train
    y_train = y_apart_NS_train

    # Positional indices (into the columns of x_train) of the two features
    # to analyse. These must be passed to partial_dependence as integers;
    # an expression such as ``columns[17 & 27]`` is a bitwise AND (== 17)
    # and yields a single column name instead of two.
    # NOTE(review): columns 17 and 27 are presumed to be the bus and MRT
    # distance features discussed in the surrounding text - confirm.
    target_feature = (17, 27)
    names = x_train.columns

    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(x_train, y_train)
    print(" done.")

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    # pdp[0] holds the averaged prediction over the grid spanned by the two
    # feature axes; reshape it to the grid and transpose to match meshgrid.
    pdp, axes = partial_dependence(clf, target_feature,
                                   X=x_train, grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    # Title names the features actually plotted.
    plt.suptitle('Partial dependence of unit price on\n'
                 '%s and %s' % (names[target_feature[0]],
                                names[target_feature[1]]))
    plt.subplots_adjust(top=0.9)

    plt.show()


# Run main() when this code is executed as the top-level module.
# NOTE(review): the earlier comment cited plot_partial_dependence's use of
# multiprocessing on Windows, but main() never calls that function - confirm
# whether the guard is still needed.
if __name__ == '__main__':
    main()

## 9.2 Important Features across Type of Sale

Next, having visualized all important common features across all six domains, we want to identify features that are common across each type of sale to obtain further insights on whether there are features that are only unique to each particular sales type.

### 9.2.1 Resales - Age

Based on the results obtained above, we can see that Age is a significantly important feature that consistently appears across all property types as the second most important feature. 

In [None]:
# Apartment - Resales: unit price per Age value as box plots, with the
# fitted linear trend (no scatter points) drawn on the same axes.
fig, age_ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x=x_apart_RS["Age"], y=y_apart_RS, ax=age_ax)
sns.regplot(x=x_apart_RS["Age"], y=y_apart_RS, scatter=False, ax=age_ax)

In [None]:
# Condo - Resales: unit price per Age value as box plots, with the fitted
# linear trend (no scatter points) drawn on the same axes.
fig, age_ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x=x_condo_RS["Age"], y=y_condo_RS, ax=age_ax)
sns.regplot(x=x_condo_RS["Age"], y=y_condo_RS, scatter=False, ax=age_ax)

In [None]:
# Executive Condo - Resales: unit price per Age value as box plots, with
# the fitted linear trend (no scatter points) drawn on the same axes.
fig, age_ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x=x_Econdo_RS["Age"], y=y_Econdo_RS, ax=age_ax)
sns.regplot(x=x_Econdo_RS["Age"], y=y_Econdo_RS, scatter=False, ax=age_ax)

Based on the plots above, we can see that Age is inversely related to Unit Price ($psf): as Age increases, the Unit Price ($psf) decreases. However, it is also interesting to note that there is a spike in Unit Price ($psf) at around age 24 for Apartments and at ages 23 to 26 for Condominiums. Based on business domain knowledge, the sudden rise in price could be due to demand driven by property investors who are interested in profiting from a potential en bloc sale, which typically occurs for properties that were more than 20 years old at the point of sale.