In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore") 

In [2]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Drop duplicates & missing data

In [4]:
isDuplicated = train.duplicated()
print('Shape of duplicated data:', train[isDuplicated].shape)
train = train.drop_duplicates()

Shape of duplicated data: (315, 17)


In [5]:
print('Missing data of each field:')
print(train.isnull().sum())

Missing data of each field:
month                  0
town                   0
flat_type              0
block                  0
street_name            0
storey_range           0
floor_area_sqm         0
flat_model             0
eco_category           0
lease_commence_date    0
latitude               0
longitude              0
elevation              0
subzone                0
planning_area          0
region                 0
resale_price           0
dtype: int64


# Split X and Y

In [6]:
y_train = train.iloc[:,-1]
X_train = train.iloc[:,:-1]
X_test = test.copy()

# Feature Engineering

In [7]:
ylabel = 'resale_price'
all_features = list(X_train.columns)
numerical_features = list(X_train.select_dtypes(include = ['int64','float']))
nominal_features = list(X_train.select_dtypes(include=['object']))
print('Numerical Features: {}\n\nNominal Features: {}'.format(numerical_features, nominal_features))

Numerical Features: ['floor_area_sqm', 'lease_commence_date', 'latitude', 'longitude', 'elevation']

Nominal Features: ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 'flat_model', 'eco_category', 'subzone', 'planning_area', 'region']


In [8]:
print('Number of categories for nominal features:')
X_train[nominal_features].describe().loc['unique']

Number of categories for nominal features:


month             251
town               26
flat_type          12
block            2472
street_name      1103
storey_range       25
flat_model         20
eco_category        1
subzone           155
planning_area      32
region              5
Name: unique, dtype: object

Drop:
- elevation
- eco_category
- latitude
- longitude
- town
- block

Convert to number by rule:
- month
- flat_type
- storey_range

Target Encoding:
- street_name
- flat_model
- subzone
- planning_area

One-hot encoding:
- region

In [9]:
# Drop
drop_features = ['elevation', 'eco_category', 'latitude', 'longitude', 'town']
X_train.drop(drop_features, axis=1, inplace=True)
X_test.drop(drop_features, axis=1, inplace=True)

In [10]:
def block_to_num(blk):
    return int(''.join(list(filter(str.isdigit, blk))))

X_train['block'] = X_train['block'].apply(lambda x: block_to_num(x))
X_test['block'] = X_test['block'].apply(lambda x: block_to_num(x))
X_train.drop('block', axis=1, inplace=True)
X_test.drop('block', axis=1, inplace=True)

In [11]:
# month to month & year
def month_year(data):
    data['year'] = data['month'].apply(lambda x: x.split('-')[0]).astype('int')
    # data['month'] = data['month'].apply(lambda x: x.split('-')[1]).astype('int')
    data = data.drop('month', axis=1)
    return data

X_train = month_year(X_train)
X_test = month_year(X_test)

In [12]:
# https://www.hdb.gov.sg/residential/buying-a-flat/finding-a-flat/types-of-flats
# flat_type
def flat_type_to_num(ft):
    if ft=='executive':
        return 6
    elif ft=='multi generation':
        return 7
    else:
        return int(ft.replace('-',' ').split(' ')[0])

X_train['flat_type'] = X_train['flat_type'].apply(lambda x: flat_type_to_num(x))
X_test['flat_type'] = X_test['flat_type'].apply(lambda x: flat_type_to_num(x))

In [13]:
# storey_range
def storey_range_to_num(sr):
    temp = sr.split(' to ')
    return (int(temp[0])+int(temp[1]))//2

# print(X_train['storey_range'].unique())
X_train['storey_range'] = X_train['storey_range'].apply(lambda x: storey_range_to_num(x))
X_test['storey_range'] = X_test['storey_range'].apply(lambda x: storey_range_to_num(x))

In [14]:
# Target Encoding
target_features = ['street_name', 'flat_model', 'subzone', 'planning_area']

import category_encoders as ce
target_encoder = ce.TargetEncoder(cols=target_features)
target_encoder.fit(X_train, y_train)

# transform the training and test data
X_train = target_encoder.transform(X_train)
X_test = target_encoder.transform(X_test)


In [15]:
# One-hot encoding 
onehot_features = ['region']

X_train = pd.get_dummies(X_train, columns=onehot_features)
X_test = pd.get_dummies(X_test, columns=onehot_features)

# keep only the common columns in the training and test data
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Predict 'resale_price'

In [24]:
from sklearn.model_selection import KFold
def kfold_validation(model, n_folds=5):
    # define the k-fold cross-validation method
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    # create an empty array to store the predicted values for each fold
    y_pred_all = np.array([])
    # loop through each fold
    for train_index, test_index in kf.split(X_train):
        # extract the training and test sets for this fold
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        # train the model on the training set for this fold
        model.fit(X_train_fold, y_train_fold)
        # make predictions on the test set for this fold
        y_pred_fold = model.predict(X_test_fold)
        # append the predicted values for this fold to the array
        y_pred_all = np.concatenate((y_pred_all, y_pred_fold))
    # calculate the mean squared error across all folds
    mse = ((y_pred_all - y_train) ** 2).mean()
    # print the mean squared error
    print("Mean Squared Error:", mse)


In [20]:
def test_predict_save(model):
    # fit the model to the training data
    model.fit(X_train, y_train)
    # predict on the test data
    y_pred = model.predict(X_test)
    # convert the id column of test set to a pandas dataframe
    id_df = pd.Series([i for i in range(len(y_pred))], name='Predicted')
    # convert y_pred to a pandas series
    y_pred_series = pd.Series(y_pred, name='Predicted')
    # concatenate id_df and y_pred_series horizontally
    result_df = pd.concat([id_df, y_pred_series], axis=1)
    # rename the columns of the concatenated dataframe
    result_df.columns = ['Id', 'Predicted']
    # output the result dataframe to a CSV file
    result_df.to_csv('{}_result.csv'.format(type(model)), index=False)



## 1. Linear Regression

In [69]:
from sklearn.linear_model import LinearRegression

# create a LinearRegression object
linear_reg = LinearRegression()
kfold_validation(linear_reg)

Mean Squared Error: 30311413042.79659


## 2. Ridge Regression:

In [70]:
from sklearn.linear_model import Ridge

# create a Ridge object with a regularization parameter of alpha=1
ridge_reg = Ridge(alpha=1)
kfold_validation(ridge_reg)


Mean Squared Error: 30316886136.897026


## 3. Lasso Regression

In [71]:
from sklearn.linear_model import Lasso

# create a Lasso object with a regularization parameter of alpha=1
lasso_reg = Lasso(alpha=1)
kfold_validation(lasso_reg)


Mean Squared Error: 30316778117.19081


## 4. Elastic Net Regression:

In [25]:
from sklearn.linear_model import ElasticNet #50189

# create an ElasticNet object with a regularization parameter of alpha=1 and a mix parameter of l1_ratio=0.5
ela_reg = ElasticNet(alpha=1, l1_ratio=0.5)
kfold_validation(ela_reg)
test_predict_save(ela_reg)

Mean Squared Error: 17174751.59873881


## 5. Decision Tree Regression:

In [74]:
from sklearn.tree import DecisionTreeRegressor

# create a DecisionTreeRegressor object with a maximum depth of 5
dt_reg = DecisionTreeRegressor(max_depth=5)
kfold_validation(dt_reg)

Mean Squared Error: 31096689159.50097


## 6. Random Forest Regression:

In [75]:
from sklearn.ensemble import RandomForestRegressor

# create a RandomForestRegressor object with 100 trees and a maximum depth of 5
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5)
kfold_validation(rf_reg)


Mean Squared Error: 31000227055.841526


## 7. Gradient Boosting Regression:

In [76]:
from sklearn.ensemble import GradientBoostingRegressor

# create a GradientBoostingRegressor object with 100 trees and a learning rate of 0.1
gb_reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
kfold_validation(gb_reg)


Mean Squared Error: 31820465549.445984


## 8. Support Vector Regression:

In [None]:
from sklearn.svm import SVR

# create an SVR object with a radial basis function kernel and a regularization parameter of C=1
svr_reg = SVR(kernel='rbf', C=1)
kfold_validation(svr_reg)

## 9. Neural Network Regression:

In [None]:
from sklearn.neural_network import MLPRegressor

# create an MLPRegressor object with 2 hidden layers of size 10 each and a learning rate of 0.1
mlp_reg = MLPRegressor(hidden_layer_sizes=(10, 10), learning_rate_init=0.1)
kfold_validation(mlp_reg)
