## Regression Algorithm Machine Learning - House Sales in King County USA



This dataset contains house sale prices for King County, which includes Seattle. 

It includes homes sold between May 2014 and May 2015.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import set_config
set_config(print_changed_only=False)

import warnings
warnings.filterwarnings('ignore')

df =pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# DATA EXPLORATION

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [3]:
df.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

Dataset is quiet good and looks clean, No Missing Values detected

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,21613.0,4580302000.0,2876566000.0,1000102.0,2123049000.0,3904930000.0,7308900000.0,9900000000.0
price,21613.0,540088.1,367127.2,75000.0,321950.0,450000.0,645000.0,7700000.0
bedrooms,21613.0,3.370842,0.9300618,0.0,3.0,3.0,4.0,33.0
bathrooms,21613.0,2.114757,0.7701632,0.0,1.75,2.25,2.5,8.0
sqft_living,21613.0,2079.9,918.4409,290.0,1427.0,1910.0,2550.0,13540.0
sqft_lot,21613.0,15106.97,41420.51,520.0,5040.0,7618.0,10688.0,1651359.0
floors,21613.0,1.494309,0.5399889,1.0,1.0,1.5,2.0,3.5
waterfront,21613.0,0.007541757,0.0865172,0.0,0.0,0.0,0.0,1.0
view,21613.0,0.2343034,0.7663176,0.0,0.0,0.0,0.0,4.0
condition,21613.0,3.40943,0.650743,1.0,3.0,3.0,4.0,5.0


In [5]:
df.describe(include='O')

Unnamed: 0,date
count,21613
unique,372
top,20140623T000000
freq,142


In [6]:
HouseSalesDesc=[]

for i in df.columns:
    HouseSalesDesc.append([
        i,
        df[i].dtypes,
        df[i].isna().sum(),
        (((df[i].isna().sum())/len(df))*100).round(2),
        df[i].nunique(),
        df[i].drop_duplicates().sample(2).values
    ])
    
pd.DataFrame(data=HouseSalesDesc,columns=[
    'Data Feature', 'Data Types', 'Null','Null Percentages','Unique','Unique Sample'
])

Unnamed: 0,Data Feature,Data Types,Null,Null Percentages,Unique,Unique Sample
0,id,int64,0,0.0,21436,"[7430500301, 1105000588]"
1,date,object,0,0.0,372,"[20140925T000000, 20150106T000000]"
2,price,float64,0,0.0,4028,"[377691.0, 649800.0]"
3,bedrooms,int64,0,0.0,13,"[8, 11]"
4,bathrooms,float64,0,0.0,30,"[0.0, 4.0]"
5,sqft_living,int64,0,0.0,1038,"[2050, 3504]"
6,sqft_lot,int64,0,0.0,9782,"[15624, 36276]"
7,floors,float64,0,0.0,6,"[1.0, 2.0]"
8,waterfront,int64,0,0.0,2,"[1, 0]"
9,view,int64,0,0.0,5,"[1, 4]"


# EXPLORATORY DATA ANALYSIS

In [7]:
#Drop Unessecary columns - Feature Selection

df.drop(columns=['id','date','sqft_living15','sqft_lot15'],inplace=True)

In [8]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045


In [9]:
((df[['bedrooms','price']].groupby(['bedrooms']).mean())*100).round(2).sort_values(by='bedrooms',ascending=False)

Unnamed: 0_level_0,price
bedrooms,Unnamed: 1_level_1
33,64000000.0
11,52000000.0
10,81933330.0
9,89399980.0
8,110507700.0
7,95118470.0
6,82552060.0
5,78659980.0
4,63541950.0
3,46623210.0


In [10]:
((df[['floors','price']].groupby(['floors']).mean())*100).round(2).sort_values(by='floors',ascending=False)

Unnamed: 0_level_0,price
floors,Unnamed: 1_level_1
3.5,93331250.0
3.0,58252600.0
2.5,106034600.0
2.0,64889120.0
1.5,55898060.0
1.0,44218060.0


In [11]:
((df[['grade','price']].groupby(['grade']).mean())*100).round(2).sort_values(by='grade',ascending=False)

Unnamed: 0_level_0,price
grade,Unnamed: 1_level_1
13,370961500.0
12,219122200.0
11,149684200.0
10,107177100.0
9,77351320.0
8,54285280.0
7,40259030.0
6,30191960.0
5,24852400.0
4,21438100.0


In [12]:
((df[['condition','price']].groupby(['condition']).mean())*100).round(2).sort_values(by='condition',ascending=False)

Unnamed: 0_level_0,price
condition,Unnamed: 1_level_1
5,61241808.94
4,52120039.0
3,54201257.81
2,32728714.53
1,33443166.67


In [13]:
((df[['waterfront','price']].groupby(['waterfront']).mean())*100).round(2).sort_values(by='waterfront',ascending=False)

Unnamed: 0_level_0,price
waterfront,Unnamed: 1_level_1
1,166187600.0
0,53156360.0


In [14]:
#sns.pairplot(df, hue='price', palette='icefire')

In [15]:
df.corr()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
price,1.0,0.30835,0.525138,0.702035,0.089661,0.256794,0.266369,0.397293,0.036362,0.667434,0.605567,0.323816,0.054012,0.126434,-0.053203,0.307003,0.021626
bedrooms,0.30835,1.0,0.515884,0.576671,0.031703,0.175429,-0.006582,0.079532,0.028472,0.356967,0.4776,0.303093,0.154178,0.018841,-0.152668,-0.008931,0.129473
bathrooms,0.525138,0.515884,1.0,0.754665,0.08774,0.500653,0.063744,0.187737,-0.124982,0.664983,0.685342,0.28377,0.506019,0.050739,-0.203866,0.024573,0.223042
sqft_living,0.702035,0.576671,0.754665,1.0,0.172826,0.353949,0.103818,0.284611,-0.058753,0.762704,0.876597,0.435043,0.318049,0.055363,-0.19943,0.052529,0.240223
sqft_lot,0.089661,0.031703,0.08774,0.172826,1.0,-0.005201,0.021604,0.07471,-0.008958,0.113621,0.183512,0.015286,0.05308,0.007644,-0.129574,-0.085683,0.229521
floors,0.256794,0.175429,0.500653,0.353949,-0.005201,1.0,0.023698,0.029444,-0.263768,0.458183,0.523885,-0.245705,0.489319,0.006338,-0.059121,0.049614,0.125419
waterfront,0.266369,-0.006582,0.063744,0.103818,0.021604,0.023698,1.0,0.401857,0.016653,0.082775,0.072075,0.080588,-0.026161,0.092885,0.030285,-0.014274,-0.04191
view,0.397293,0.079532,0.187737,0.284611,0.07471,0.029444,0.401857,1.0,0.04599,0.251321,0.167649,0.276947,-0.05344,0.103917,0.084827,0.006157,-0.0784
condition,0.036362,0.028472,-0.124982,-0.058753,-0.008958,-0.263768,0.016653,0.04599,1.0,-0.144674,-0.158214,0.174105,-0.361417,-0.060618,0.003026,-0.014941,-0.1065
grade,0.667434,0.356967,0.664983,0.762704,0.113621,0.458183,0.082775,0.251321,-0.144674,1.0,0.755923,0.168392,0.446963,0.014414,-0.184862,0.114084,0.198372


# SPLITTING DATA

In [16]:
from sklearn.model_selection import train_test_split

# Algorithm Model

# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor

In [17]:
X = df.drop(columns='price')  ### Features 
y = df['price'] ### Target 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .80, random_state = 42)

# Machine Learning Modelling

## Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
model_linreg = LinearRegression()

In [20]:
model_linreg.fit(X_train, y_train) 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
model_linreg.predict(X_test)

array([ 461291.61257276,  731704.09137437, 1217116.74900676, ...,
        315521.37054524,  462833.47300744,  692663.01833351])

In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [23]:
y_model_Linreg = model_linreg.predict(X_test)

In [24]:
r2_model_Linreg= (r2_score(y_test, y_model_Linreg)).round(2)
MAE_model_linreg = mean_absolute_error(y_test, y_model_Linreg)
MSE_model_linreg= mean_squared_error(y_test, y_model_Linreg)
RMSE_model_linreg = np.sqrt(MSE_model_linreg)

print("Evaluation Matrix Linear regression")
print("MAE Score: ", MAE_model_linreg)
print("MSE Score: ", MSE_model_linreg)
print("RMSE Score: ", RMSE_model_linreg)
print("R2 : ", r2_model_Linreg)

Evaluation Matrix Linear regression
MAE Score:  123562.25603768413
MSE Score:  42821101853.71838
RMSE Score:  206932.60220109925
R2 :  0.69


## K Nearest Neighbors

In [25]:
from sklearn.neighbors import KNeighborsRegressor

In [26]:
model_KNN = KNeighborsRegressor()

In [27]:
model_KNN.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [28]:
y_model_KNN = model_KNN.predict(X_test)

In [29]:
r2_model_KNN= (r2_score(y_test, y_model_KNN)).round(2)
MAE_model_KNN = mean_absolute_error(y_test, y_model_KNN)
MSE_model_KNN= mean_squared_error(y_test, y_model_KNN)
RMSE_model_KNN= np.sqrt(MSE_model_KNN)

print("Evaluation Matrix K Nearest Neighbors")
print("MAE Score: ", MAE_model_KNN)
print("MSE Score: ", MSE_model_KNN)
print("RMSE Score: ", RMSE_model_KNN)
print("R2 : ", r2_model_KNN)

Evaluation Matrix K Nearest Neighbors
MAE Score:  169800.04452027066
MSE Score:  76775666245.47299
RMSE Score:  277084.2222961694
R2 :  0.45


## Decision Tree

In [30]:
from sklearn.tree import DecisionTreeRegressor

In [31]:
model_desc_tree=DecisionTreeRegressor()

In [32]:
model_desc_tree.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [33]:
y_desc_tree = model_desc_tree.predict(X_test)

In [34]:
r2_model_desc_tree= (r2_score(y_test, y_desc_tree)).round(2)
MAE_model_desc_tree = mean_absolute_error(y_test, y_desc_tree)
MSE_model_desc_tree= mean_squared_error(y_test, y_desc_tree)
RMSE_mode_desc_tree= np.sqrt(MSE_model_desc_tree)

print("Evaluation Matrix Decission Tree")
print("MAE Score: ", MAE_model_desc_tree)
print("MSE Score: ", MSE_model_desc_tree)
print("RMSE Score: ", RMSE_mode_desc_tree)
print("R2 : ", r2_model_desc_tree)

Evaluation Matrix Decission Tree
MAE Score:  109260.97270256202
MSE Score:  40759547200.46099
RMSE Score:  201889.93833388772
R2 :  0.71


## Random Forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
model_RF = RandomForestRegressor()

In [37]:
model_RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [38]:
y_model_RF = model_RF.predict(X_test)

In [39]:
r2_model_RF= (r2_score(y_test, y_model_RF)).round(2)
MAE_model_RF = mean_absolute_error(y_test, y_model_RF)
MSE_model_RF= mean_squared_error(y_test, y_model_RF)
RMSE_mode_RF= np.sqrt(MSE_model_RF)

print("Evaluation Matrix Random Forest")
print("MAE Score: ", MAE_model_RF)
print("MSE Score: ", MSE_model_RF)
print("RMSE Score: ", RMSE_mode_RF)
print("R2 : ", r2_model_RF)

Evaluation Matrix Random Forest
MAE Score:  80077.08343222871
MSE Score:  27062934123.440372
RMSE Score:  164508.15822761002
R2 :  0.81


In [40]:
data = {
    "Linear Regression" : [MAE_model_linreg, MSE_model_linreg, RMSE_model_linreg, r2_model_Linreg],
    "K Nearest Neighbors" : [MAE_model_KNN, MSE_model_KNN, RMSE_model_KNN, r2_model_KNN],
    "Decission Tree" : [MAE_model_desc_tree, MSE_model_desc_tree, RMSE_mode_desc_tree, r2_model_desc_tree],
    "Random Forest" : [MAE_model_RF, MSE_model_RF, RMSE_mode_RF, r2_model_RF]
}

pd.DataFrame(data=data, index=['MAE', 'MSE', 'RMSE', 'R2'])

Unnamed: 0,Linear Regression,K Nearest Neighbors,Decission Tree,Random Forest
MAE,123562.3,169800.0,109261.0,80077.08
MSE,42821100000.0,76775670000.0,40759550000.0,27062930000.0
RMSE,206932.6,277084.2,201889.9,164508.2
R2,0.69,0.45,0.71,0.81


### Polynomial 


In [41]:
from sklearn.preprocessing import PolynomialFeatures

In [42]:
apoli = PolynomialFeatures(degree=5, include_bias = False)

In [43]:
apoli.fit_transform(X_train,y_train)

array([[ 4.00000000e+00,  2.50000000e+00,  2.99000000e+03, ...,
        -4.07385845e+09,  1.04988759e+10, -2.70570019e+10],
       [ 3.00000000e+00,  2.25000000e+00,  1.23000000e+03, ...,
        -4.16230954e+09,  1.06900269e+10, -2.74551119e+10],
       [ 4.00000000e+00,  2.75000000e+00,  4.43000000e+03, ...,
        -4.11983908e+09,  1.05495250e+10, -2.70137922e+10],
       ...,
       [ 3.00000000e+00,  2.50000000e+00,  2.12000000e+03, ...,
        -4.13153939e+09,  1.05740235e+10, -2.70625456e+10],
       [ 1.00000000e+00,  7.50000000e-01,  3.80000000e+02, ...,
        -4.12633122e+09,  1.06304672e+10, -2.73867576e+10],
       [ 4.00000000e+00,  2.50000000e+00,  3.13000000e+03, ...,
        -4.08690235e+09,  1.05311888e+10, -2.71369188e+10]])

In [44]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045


In [47]:
poli = apoli.fit_transform(df[['price']])

In [49]:
x_poly = df.drop(columns='price')  ### Features 
y_poly = df['price'] ### Target 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .80, random_state = 42)

In [57]:
model_poly=model_linreg.fit(x_poly,y_poly)

y_linreg_poly=model_poly.predict(x_poly)

In [58]:
r2_linreg_poly= r2_score(y_poly, y_linreg_poly)
MAE_linreg_poly = mean_absolute_error(y_poly, y_linreg_poly)
MSE_linreg_poly= mean_squared_error(y_poly, y_linreg_poly)
RMSE_linreg_poly= np.sqrt(MSE_linreg_poly)

print("Evaluation Matrix Linear Regression- Polynomial")
print("MAE Score: ", MAE_linreg_poly)
print("MSE Score: ", MSE_linreg_poly)
print("RMSE Score: ", RMSE_linreg_poly)
print("R2 : ", r2_linreg_poly)

Evaluation Matrix Linear Regression- Polynomial
MAE Score:  126148.38213053216
MSE Score:  40586297407.84716
RMSE Score:  201460.41151513407
R2 :  0.698861410204793


In [59]:
data = {
    "Linear Regression" : [MAE_model_linreg, MSE_model_linreg, RMSE_model_linreg, r2_model_Linreg],
    "K Nearest Neighbors" : [MAE_model_KNN, MSE_model_KNN, RMSE_model_KNN, r2_model_KNN],
    "Decission Tree" : [MAE_model_desc_tree, MSE_model_desc_tree, RMSE_mode_desc_tree, r2_model_desc_tree],
    "Random Forest" : [MAE_model_RF, MSE_model_RF, RMSE_mode_RF, r2_model_RF],
    "Linear Regression - polynomial" : [MAE_linreg_poly, MSE_linreg_poly, RMSE_linreg_poly, r2_linreg_poly]
}

pd.DataFrame(data=data, index=['MAE', 'MSE', 'RMSE', 'R2'])

Unnamed: 0,Linear Regression,K Nearest Neighbors,Decission Tree,Random Forest,Linear Regression - polynomial
MAE,123562.3,169800.0,109261.0,80077.08,126148.4
MSE,42821100000.0,76775670000.0,40759550000.0,27062930000.0,40586300000.0
RMSE,206932.6,277084.2,201889.9,164508.2,201460.4
R2,0.69,0.45,0.71,0.81,0.6988614


In [None]:
Random Forest Regressor is commended in this case, because its R2 Score is the highest among the others.