In [1]:
import numpy as np
import pandas as pd
import math

import sklearn
from sklearn.preprocessing import MinMaxScaler
import sklearn.model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
# Read the house-sales CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview its first rows as the cell output.

data = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Separate the regression target from the predictors.
# 'price' is the target; it is removed from the feature matrix together with
# the 'id' and 'date' columns, which are not used as model inputs.

Y = data['price']
X = data.drop(columns=['id', 'price', 'date'])

# Sanity-check the resulting shapes, then preview the feature matrix.
print(X.shape, Y.shape)
X.head(n=5)

(21613, 18) (21613,)


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Split the data into train and test sets, then min-max scale features and target

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# The scaler is fed a 2-D (n_samples, 1) column for the target.
# y_train / y_test come back from the split as pandas Series, and Series.reshape
# no longer exists in current pandas, so convert to a NumPy array before reshaping.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scalers on the training split only and reuse them on the test split,
# so that no statistics from the test data influence the transformation.
x_train_scaled = x_scaler.fit_transform(x_train)
y_train_scaled = y_scaler.fit_transform(y_train)

x_test_scaled = x_scaler.transform(x_test)
y_test_scaled = y_scaler.transform(y_test)

# Flatten the scaled targets back to 1-D, the shape passed to the estimators below.
y_test_scaled = y_test_scaled.ravel()
y_train_scaled = y_train_scaled.ravel()
print(y_test_scaled.shape)

(6484,)




In [5]:
# Linear Regression

# fit() returns the estimator itself, so construction and training can be chained.
lreg = LinearRegression().fit(x_train_scaled, y_train_scaled)

# NOTE: per the scikit-learn docs, a regressor's .score() is the coefficient of
# determination (R^2) on the given data, not a classification accuracy.
accuracy = lreg.score(x_test_scaled, y_test_scaled)
print('Accuracy of Linear Regression: ', accuracy)

Accuracy of Linear Regression:  0.6877902899299289


In [6]:
# Gradient Boosting Regression
#
# 'loss' is deliberately left at its default (squared error). The original
# passed loss='ls', an alias for the same loss that was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises an error. Omitting the
# argument selects the same loss on both old and new releases.
# random_state pins the estimator's internal randomness so the reported score
# is reproducible, matching the seeded train/test split.
gbreg = GradientBoostingRegressor(
    n_estimators=400,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=0,
)

gbreg.fit(x_train_scaled, y_train_scaled)

# NOTE: per the scikit-learn docs, .score() on a regressor is R^2 on the test split.
accuracy = gbreg.score(x_test_scaled, y_test_scaled)
print('Accuracy of GB Regression: ', accuracy)

Accuracy of GB Regression:  0.9020952439000731


In [7]:
# SVR

# RBF-kernel support vector regression; fit() returns the estimator, so the
# trained model is bound directly to `svreg`.
svreg = SVR(
    kernel='rbf',
    C=100,
    gamma=0.0001,
).fit(x_train_scaled, y_train_scaled)

# Evaluate on the held-out split (a regressor's .score() is R^2 per the scikit-learn docs).
accuracy = svreg.score(x_test_scaled, y_test_scaled)
print('Accuracy of SVR: ', accuracy)

Accuracy of SVR:  0.33800153606010563


In [8]:
# Decision Tree Regression

# A shallow tree: at most 4 levels and 17 leaves.
# random_state is fixed because scikit-learn permutes features at each split,
# so ties between equally good splits could otherwise be resolved differently
# from run to run; this also matches the seeded train/test split.
dtreg = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=2,
    max_leaf_nodes=17,
    min_samples_leaf=1,
    random_state=0,
)
dtreg.fit(x_train_scaled, y_train_scaled)

# NOTE: per the scikit-learn docs, .score() on a regressor is R^2 on the test split.
accuracy = dtreg.score(x_test_scaled, y_test_scaled)
print('Accuracy of Decision Tree Regression: ', accuracy)

Accuracy of Decision Tree Regression:  0.6845757056815174


In [9]:
# Random Forest Regression

# With bootstrap=False every tree is trained on the full training split, so the
# trees differ only through the random subset of 15 features considered at each
# split. Without a seed that sampling (and therefore the reported score) changes
# on every run; random_state makes the result reproducible.
rfreg = RandomForestRegressor(
    max_features=15,
    n_estimators=500,
    bootstrap=False,
    random_state=0,
)
rfreg.fit(x_train_scaled, y_train_scaled)

# NOTE: per the scikit-learn docs, .score() on a regressor is R^2 on the test split.
accuracy = rfreg.score(x_test_scaled, y_test_scaled)
print('Accuracy of Random Forest Regression: ', accuracy)

Accuracy of Random Forest Regression:  0.8614994900003762
