In [1]:
import numpy as np
import matplotlib.pyplot as plt 

import pandas as pd  
import seaborn as sns 

from sklearn import preprocessing

%matplotlib inline

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [3]:
data = pd.read_csv("housesalesprediction/kc_house_data.csv")
data.shape

(21613, 21)

In [4]:
feature_cols = ['bedrooms', 'bathrooms', 'sqft_living','sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
             'sqft_above', 'sqft_basement','sqft_living15', 'sqft_lot15']
cont_cols = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','sqft_above','sqft_basement','sqft_living15','sqft_lot15']
cat_cols = ['waterfront', 'view', 'condition', 'grade']
target_col = 'price'

In [5]:
X = data[feature_cols]
Y = data[target_col]

In [6]:
def remove_outlier(df_in):
    for col_name in df_in.columns:
        if col_name in cont_cols:
            q1 = df_in[col_name].quantile(0.25)
            q3 = df_in[col_name].quantile(0.75)
            iqr = q3-q1 #Interquartile range
            fence_low  = q1-1.5*iqr
            fence_high = q3+1.5*iqr
            df_in[col_name] = df_in[col_name].clip(fence_low,fence_high)
        else:
            pass
    return df_in

In [7]:
X_wo = remove_outlier(X.copy())

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_wo, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(15129, 13)
(6484, 13)
(15129,)
(6484,)


In [9]:
# Scaling for KNN

scaler = preprocessing.MinMaxScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cont_cols]), columns=cont_cols, index = X_train.index)
test_scaled = pd.DataFrame(scaler.transform(X_test[cont_cols]), columns=cont_cols, index = X_test.index)

X_train = pd.concat([train_scaled, X_train[cat_cols]], axis =1)
X_test = pd.concat([test_scaled, X_test[cat_cols]], axis =1)

In [10]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor

# create a regressor object 
regressor = DecisionTreeRegressor(random_state = 0, min_samples_leaf = 20, max_depth = 4)
  
# fit the regressor with X and Y data
regressor.fit(X_train, Y_train) 

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=20,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [11]:
from sklearn.metrics import mean_squared_error

# model evaluation for training set
y_train_predict = regressor.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

# model evaluation for testing set
y_test_predict = regressor.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

The model performance for training set
--------------------------------------
RMSE is 235168.64368539164
The model performance for testing set
--------------------------------------
RMSE is 234357.19250368702


In [12]:
df_train = pd.concat([X_train,Y_train], axis = 1)
df_train['Tree_pred'] = y_train_predict
df_train.loc[:,'Leaf_node'] = regressor.apply(X_train)

In [13]:
df_test = pd.concat([X_test,Y_test], axis = 1)
df_test['Tree_pred'] = y_test_predict
df_test.loc[:,'Leaf_node'] = regressor.apply(X_test)

In [14]:
feature_cols = ['bedrooms', 'bathrooms', 'sqft_living','sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
             'sqft_above', 'sqft_basement','sqft_living15', 'sqft_lot15']
target_col = 'price'

In [15]:
from sklearn.neighbors import KNeighborsRegressor

df_final = pd.DataFrame()
for i in df_test['Leaf_node'].unique():
    X_train = df_train[df_train['Leaf_node']==i]
    X_test = df_test[df_test['Leaf_node']==i]
    
    if(X_train.shape[0]>10):
        # Need rule for auto selectio of neighbors
        neigh = KNeighborsRegressor(n_neighbors=5)
        neigh.fit(X_train[feature_cols],X_train[target_col])
        X_test.loc[:,'pred'] = neigh.predict(X_test[feature_cols])
    elif(X_train.shape[0]>=3):
        neigh = KNeighborsRegressor(n_neighbors=3)
        neigh.fit(X_train[feature_cols],X_train[target_col])
        X_test.loc[:,'pred'] = neigh.predict(X_test[feature_cols])        
    else:
        X_test.loc[:,'pred'] = regressor.predict(X_test[feature_cols])
        
    df_final = pd.concat([df_final,X_test], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
df_final.head(3)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,sqft_living15,sqft_lot15,waterfront,view,condition,grade,price,Tree_pred,Leaf_node,pred
17485,0.625,0.625,0.627838,0.336131,0.4,0.720238,0.0,0.49922,0.364889,0,0,4,7,365000.0,421004.808945,8,413921.8
615,0.375,0.541667,0.373589,0.443404,0.0,0.267857,0.385714,0.421217,0.405998,0,0,3,7,425000.0,421004.808945,8,347900.0
7763,0.375,0.458333,0.389156,0.426189,0.0,0.244048,0.485714,0.25897,0.501405,0,0,4,7,860000.0,421004.808945,8,362440.0


In [17]:
(np.sqrt(mean_squared_error(df_final[target_col], df_final['Tree_pred'])))

234357.19250368702

In [18]:
(np.sqrt(mean_squared_error(df_final[target_col], df_final['pred'])))

220270.5054191276

In [19]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [20]:
mean_absolute_percentage_error(df_final[target_col], df_final['Tree_pred'])

32.08694592006732

In [21]:
mean_absolute_percentage_error(df_final[target_col], df_final['pred'])

26.61722748426939

In [22]:
df_final['Tree_pred'].std(), df_final['pred'].std()

(289361.889141703, 313631.288087317)

### Random Forest

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_wo, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(15129, 13)
(6484, 13)
(15129,)
(6484,)


In [24]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(max_depth=3, random_state=0,
                             n_estimators=100)
regr.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
from sklearn.metrics import mean_squared_error

# model evaluation for training set
y_train_predict = regr.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

# model evaluation for testing set
y_test_predict = regr.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

print('MAPE for testing set is')
mean_absolute_percentage_error(Y_test, y_test_predict)

The model performance for training set
--------------------------------------
RMSE is 237825.81481016357
The model performance for testing set
--------------------------------------
RMSE is 239401.49754966956
MAPE for testing set is


33.158558855620804

### KNN Regression

In [32]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=5, metric="euclidean")
neigh.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='euclidean',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [33]:
from sklearn.metrics import mean_squared_error

# model evaluation for testing set
y_test_predict = neigh.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))

print('MAPE for testing set is')
mean_absolute_percentage_error(Y_test, y_test_predict)

The model performance for testing set
--------------------------------------
RMSE is 264172.13801753504
MAPE for testing set is


31.6711264245777