In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Display up to 50 columns when a DataFrame is shown
pd.set_option('display.max_columns', 50)

In [2]:
# Load the dataset from 'final.csv' into a DataFrame and preview the first rows
df = pd.read_csv('final.csv')
df.head()

Unnamed: 0,price,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age,property_type_Condo
0,295850,2013,234,81,1,1,584,2013,0,0,0,1,0,1
1,216500,2006,169,51,1,1,612,1965,0,0,0,0,41,1
2,279900,2012,216,74,1,1,615,1963,0,0,0,1,49,1
3,379900,2005,265,92,1,1,618,2000,33541,1,0,0,5,0
4,340000,2002,88,30,1,1,634,1992,0,0,0,0,10,1


In [3]:
# Preview the last rows of the dataset
df.tail()

Unnamed: 0,price,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age,property_type_Condo
1855,760000,2008,881,249,5,5,7842,2003,436035,1,0,0,5,0
1856,690000,2015,1553,473,5,6,6501,1956,23086,1,0,0,59,0
1857,600000,1999,942,287,5,6,7064,1995,217800,1,0,0,4,0
1858,759900,2009,803,245,5,6,7500,2006,8886,1,0,0,3,0
1859,735000,2015,1459,444,5,6,7515,1958,10497,1,0,0,57,0


In [4]:
# (rows, columns) of the full dataset
df.shape

(1860, 14)

## Linear Regression Model

In [5]:
# import linear regression model
from sklearn.linear_model import LinearRegression

In [6]:
# Step 1: separate the target from the input features

# target variable: the sale price column
y = df['price']

# input features: every column except the target
x = df.drop(columns='price')

In [7]:
# import module
from sklearn.model_selection import train_test_split

In [8]:
# Disabled on purpose: property_type_Condo is left as-is (no cast to object).
#df['property_type_Condo'] = df['property_type_Condo'].astype(object) we do not need this one

In [9]:
# Hold out 20% of the rows for testing, stratified on property_type_Condo so
# both subsets keep the same condo / non-condo ratio.
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across kernel restarts (same seed as the decision tree).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=x.property_type_Condo, random_state=567
)

In [10]:
# Count of each property_type_Condo value in the training split
x_train.property_type_Condo.value_counts()

property_type_Condo
0    1208
1     280
Name: count, dtype: int64

In [11]:
#step 2
# Split the dataset: 80% train / 20% test, stratified on property_type_Condo.
# random_state makes the split deterministic, so re-running this cell (or the
# whole notebook) always yields the same partition and the same metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=x.property_type_Condo, random_state=567
)

In [12]:
# Preview the first rows of the training features
x_train.head()

Unnamed: 0,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age,property_type_Condo
25,2005,244,85,1,1,705,1985,0,0,0,0,20,1
764,2007,561,171,3,3,1628,1971,1694,1,0,0,36,0
640,2005,402,103,3,2,2121,2005,2178,1,0,0,0,0
89,2007,258,89,1,2,1489,1940,0,0,0,0,67,1
336,2011,477,158,2,2,2310,1955,3062,1,1,1,56,0


In [13]:
# Shapes of the four split pieces: train features/target, test features/target
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1488, 13), (1488,), (372, 13), (372,))

In [14]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so `model` and `lrmodel` are two names
# for the same fitted object.
lrmodel = model = LinearRegression().fit(x_train, y_train)

In [15]:
# Fitted coefficients of the linear model
lrmodel.coef_

array([ 8.21072108e+03, -1.13085670e+02,  1.81325769e+03, -4.33326948e+02,
        1.05060441e+03,  1.41581301e+01,  4.57505551e+03,  1.53504510e-01,
        1.57061341e+04,  2.40131760e+03, -4.93696735e+04,  3.63566557e+03,
       -1.57061341e+04])

In [16]:
# Fitted intercept of the linear model
lrmodel.intercept_

-25446624.18668623

In [17]:
# Show the first row of the training features
x_train.head(1)

Unnamed: 0,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age,property_type_Condo
25,2005,244,85,1,1,705,1985,0,0,0,0,20,1


In [18]:
#step 4
# make predictions on the train set
train_pred = lrmodel.predict(x_train)

In [19]:
# Display the linear model's predictions for the training rows
train_pred

array([291496.71521594, 468104.84706286, 377034.08034989, ...,
       626123.81891583, 364923.68465375, 421992.17954483])

In [20]:
#step 5
# evaluate the model with the mean absolute error (MAE)
from sklearn.metrics import mean_absolute_error

# Documented argument order is (y_true, y_pred); MAE is symmetric, so the
# value is unchanged, but the call now matches the scikit-learn signature.
train_mae = mean_absolute_error(y_train, train_pred)
print('Train error is', train_mae)

Train error is 84029.3437786043


In [21]:
# Display the linear model's fitted coefficients
lrmodel.coef_

array([ 8.21072108e+03, -1.13085670e+02,  1.81325769e+03, -4.33326948e+02,
        1.05060441e+03,  1.41581301e+01,  4.57505551e+03,  1.53504510e-01,
        1.57061341e+04,  2.40131760e+03, -4.93696735e+04,  3.63566557e+03,
       -1.57061341e+04])

In [22]:
# Display the linear model's fitted intercept
lrmodel.intercept_

-25446624.18668623

In [23]:
# make predictions on the test set
ypred = lrmodel.predict(x_test)

# evaluate the model; documented argument order is (y_true, y_pred)
test_mae = mean_absolute_error(y_test, ypred)
print('Test error is', test_mae)

Test error is 84756.35024264638


### Our model is still not good enough because we need a model with Mean Absolute Error < $70,000

Note - We have not scaled the features and not tuned the model.

## Decision Tree Model

In [24]:
# import decision tree model
from sklearn.tree import DecisionTreeRegressor

In [25]:
# create an instance of the class: tree depth capped at 3, 10 features
# considered when looking for each split, and a fixed seed for repeatable fits
dt = DecisionTreeRegressor(max_depth=3, max_features=10, random_state=567)

In [26]:
# train the decision tree on the training split
dtmodel = dt.fit(x_train,y_train)

In [27]:
# make predictions using the train set
ytrain_pred = dtmodel.predict(x_train)

# evaluate the model on the train set; documented argument order is (y_true, y_pred)
train_mae = mean_absolute_error(y_train, ytrain_pred)
train_mae

64390.54239084183

In [28]:
# make predictions with the decision tree using the test set
ytest_pred = dtmodel.predict(x_test)

In [29]:
# evaluate the model on the test set; documented argument order is (y_true, y_pred)
test_mae = mean_absolute_error(y_test, ytest_pred)
test_mae

67093.10971218246

## Plot the tree

In [32]:
# get the feature names the tree recorded when it was fitted
dtmodel.feature_names_in_

array(['year_sold', 'property_tax', 'insurance', 'beds', 'baths', 'sqft',
       'year_built', 'lot_size', 'basement', 'popular', 'recession',
       'property_age', 'property_type_Condo'], dtype=object)

In [33]:
# plot the tree
from sklearn import tree

# Draw the fitted tree on an explicit figure/axes pair, labelling the nodes
# with the feature names recorded at fit time
fig, ax = plt.subplots()
tree.plot_tree(dtmodel, feature_names=dtmodel.feature_names_in_, ax=ax)

# Save the plot to a file
fig.savefig('tree.png', dpi=300)

## Random Forest Model

In [34]:
# import random forest model
from sklearn.ensemble import RandomForestRegressor

In [35]:
# create an instance of the model: 200 trees, splits scored by absolute error.
# random_state fixes the forest's internal randomness so the reported MAE and
# the pickled model are reproducible across runs.
rf = RandomForestRegressor(n_estimators=200, criterion='absolute_error', random_state=567)

In [36]:
# train the random forest on the training split
rfmodel = rf.fit(x_train,y_train)

In [37]:
# make predictions on the train set
# NOTE(review): no visible cell evaluates these train predictions for the
# random forest; only the test MAE is reported.
ytrain_pred = rfmodel.predict(x_train)

In [38]:
# make predictions with the random forest on the x_test values
ytest_pred = rfmodel.predict(x_test)

In [39]:
# evaluate the random forest on the test set; documented argument order is (y_true, y_pred)
test_mae = mean_absolute_error(y_test, ytest_pred)
test_mae

46640.51133064516

In [44]:
import pickle
# Serialize the fitted random forest to 'Re_Model.pkl' in the working directory
with open('Re_Model.pkl', 'wb') as f:
    pickle.dump(rfmodel, f)

In [41]:
# Load the pickled model from the file the dump cell writes ('Re_Model.pkl').
# The previous path 'RE_Model' did not match that file name, so a fresh run
# would fail or silently load a stale file. The context manager also closes
# the file handle once the model is read.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('Re_Model.pkl', 'rb') as f:
    RE_Model = pickle.load(f)

In [42]:
# Use the loaded pickled model to make predictions.
# The model was fitted on a DataFrame, so pass a one-row DataFrame carrying the
# same column names instead of a bare list: this avoids scikit-learn's
# "X does not have valid feature names" warning and makes the column order explicit.
# NOTE(review): property_age is 6 here although year_sold - year_built = 12;
# in the rows displayed in this notebook property_age equals that difference.
# Confirm the intended value.
sample = pd.DataFrame(
    [[2012, 216, 74, 1, 1, 618, 2000, 600, 1, 0, 0, 6, 0]],
    columns=RE_Model.feature_names_in_,
)
RE_Model.predict(sample)



array([266654.0959596])

In [43]:
# Show the first row of the test features
x_test.head(1)

Unnamed: 0,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age,property_type_Condo
1193,2009,442,134,4,3,1600,1998,1600,1,0,0,11,0
