In [1]:
import numpy as np
import matplotlib.pyplot as plt 

import pandas as pd  
import seaborn as sns 

import timeit
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing

%matplotlib inline

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [3]:
data = pd.read_csv("housepricing/HousePrices_HalfMil.csv")
data.shape

(500000, 16)

In [4]:
data.head(3)

Unnamed: 0,Area,Garage,FirePlace,Baths,White Marble,Black Marble,Indian Marble,Floors,City,Solar,Electric,Fiber,Glass Doors,Swiming Pool,Garden,Prices
0,164,2,0,2,0,1,0,0,3,1,1,1,1,0,0,43800
1,84,2,0,4,0,0,1,1,2,0,0,0,1,1,1,37550
2,190,2,4,4,1,0,0,0,2,0,0,1,0,0,0,49500


In [5]:
feature_cols = ['Area', 'Garage', 'FirePlace', 'Baths', 'White Marble', 'Black Marble','Indian Marble', 'Floors', 'City',
                'Solar', 'Electric', 'Fiber','Glass Doors', 'Swiming Pool', 'Garden']
cont_cols = ['Area', 'Garage', 'FirePlace', 'Baths', 'White Marble', 'Black Marble','Indian Marble', 'Floors', 'City',
             'Solar', 'Electric', 'Fiber','Glass Doors', 'Swiming Pool', 'Garden']
cat_cols = ['City']
target_col = 'Prices'

In [6]:
X = data[feature_cols]
Y = data[target_col]

In [7]:
def remove_outlier(df_in):
    for col_name in df_in.columns:
        if col_name in cont_cols:
            q1 = df_in[col_name].quantile(0.25)
            q3 = df_in[col_name].quantile(0.75)
            iqr = q3-q1 #Interquartile range
            fence_low  = q1-1.5*iqr
            fence_high = q3+1.5*iqr
            df_in[col_name] = df_in[col_name].clip(fence_low,fence_high)
        else:
            pass
    return df_in

In [8]:
X_wo = remove_outlier(X.copy())

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_wo, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(350000, 15)
(150000, 15)
(350000,)
(150000,)


In [10]:
# Scaling for KNN

scaler = preprocessing.MinMaxScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cont_cols]), columns=cont_cols, index = X_train.index)
test_scaled = pd.DataFrame(scaler.transform(X_test[cont_cols]), columns=cont_cols, index = X_test.index)

X_train = pd.concat([train_scaled, X_train[cat_cols]], axis =1)
X_test = pd.concat([test_scaled, X_test[cat_cols]], axis =1)

In [11]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [12]:
# PNN - Ensembling CART and KNN

start_time = timeit.default_timer()
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0, min_samples_leaf = 20, max_depth = 4)
  
# fit the regressor with X and Y data
regressor.fit(X_train, Y_train)
y_train_predict = regressor.predict(X_train)
y_test_predict = regressor.predict(X_test)

df_train = pd.concat([X_train,Y_train], axis = 1)
df_train['Tree_pred'] = y_train_predict
df_train.loc[:,'Leaf_node'] = regressor.apply(X_train) # To get leaf node for each training sample

df_test = pd.concat([X_test,Y_test], axis = 1)
df_test['Tree_pred'] = y_test_predict
df_test.loc[:,'Leaf_node'] = regressor.apply(X_test) # To get leaf node for each test sample

df_final = pd.DataFrame()
for i in df_test['Leaf_node'].unique():
    X_train = df_train[df_train['Leaf_node']==i]
    X_test = df_test[df_test['Leaf_node']==i]
    
    if(X_train.shape[0]>10):
        neigh = KNeighborsRegressor(n_neighbors=5)
        neigh.fit(X_train[feature_cols],X_train[target_col])
        X_test.loc[:,'pred'] = neigh.predict(X_test[feature_cols])
    elif(X_train.shape[0]>=3):
        neigh = KNeighborsRegressor(n_neighbors=3)
        neigh.fit(X_train[feature_cols],X_train[target_col])
        X_test.loc[:,'pred'] = neigh.predict(X_test[feature_cols])        
    else:
        X_test.loc[:,'pred'] = regressor.predict(X_test[feature_cols])
        
    df_final = pd.concat([df_final,X_test], axis = 0)

print("Time elapsed in seconds",timeit.default_timer() - start_time)

Time elapsed in seconds 45.6394326


In [13]:
# RMSE for Decision Tree Predictions

(np.sqrt(mean_squared_error(df_final[target_col], df_final['Tree_pred'])))

4515.305123160281

In [14]:
# RMSE for PNN predictions

(np.sqrt(mean_squared_error(df_final[target_col], df_final['pred'])))

650.611958210627

In [15]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [16]:
# MAPE for Decision Tree Predictions
mean_absolute_percentage_error(df_final[target_col], df_final['Tree_pred'])

9.869007019942575

In [17]:
# MAPE for PNN prediction
mean_absolute_percentage_error(df_final[target_col], df_final['pred'])

1.3363870624955665

In [18]:
# Standard deviation in prediction of Decision Tree and PNN
df_final['Tree_pred'].std(), df_final['pred'].std()

(11246.317456360479, 12046.41591821702)

### Random Forest

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_wo, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(350000, 15)
(150000, 15)
(350000,)
(150000,)


In [20]:
start_time = timeit.default_timer()

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regr = RandomForestRegressor(max_depth=3, random_state=0,
                             n_estimators=100)
regr.fit(X_train, Y_train)

# model evaluation for testing set
y_test_predict = regr.predict(X_test)

print("Time elapsed in seconds",timeit.default_timer() - start_time)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

print('MAPE for testing set is')
mean_absolute_percentage_error(Y_test, y_test_predict)

Time elapsed in seconds 54.973894200000004
The model performance for testing set
--------------------------------------
RMSE is 5151.020262189354
MAPE for testing set is


11.297011176630543


### KNN Regression

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_wo, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(350000, 15)
(150000, 15)
(350000,)
(150000,)


In [22]:
# Scaling for KNN

scaler = preprocessing.MinMaxScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cont_cols]), columns=cont_cols, index = X_train.index)
test_scaled = pd.DataFrame(scaler.transform(X_test[cont_cols]), columns=cont_cols, index = X_test.index)

X_train = pd.concat([train_scaled, X_train[cat_cols]], axis =1)
X_test = pd.concat([test_scaled, X_test[cat_cols]], axis =1)

In [23]:
start_time = timeit.default_timer()

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

neigh = KNeighborsRegressor(n_neighbors=5, metric="euclidean")
neigh.fit(X_train, Y_train)

# model evaluation for testing set
y_test_predict = neigh.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))
print("Time elapsed in seconds",timeit.default_timer() - start_time)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

print('MAPE for testing set is')
mean_absolute_percentage_error(Y_test, y_test_predict)

Time elapsed in seconds 392.84712240000005
The model performance for testing set
--------------------------------------
RMSE is 650.5401228978886
MAPE for testing set is


1.3361069764633093