In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_absolute_percentage_error
import joblib

In [2]:
# Load the prepared housing dataset from the project's Resources folder
housing = "Resources/Modified_housing.csv"
housing_df = pd.read_csv(filepath_or_buffer=housing)

# Bare last expression so the notebook renders a preview of the loaded frame
housing_df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,yr_renovated,zipcode,lat,long,renovation_category,renovation_category_numeric
0,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,1987,0,98074,47.6168,-122.045,Never Renovated,0
1,7237550310,1225000.0,4,4.0,5420,101930,1.0,0,0,2001,0,98053,47.6561,-122.005,Never Renovated,0
2,1321400060,257500.0,3,2.0,1715,6819,2.0,0,0,1995,0,98003,47.3097,-122.327,Never Renovated,0
3,3793500160,323000.0,3,2.0,1890,6560,2.0,0,0,2003,0,98038,47.3684,-122.031,Never Renovated,0
4,1875500060,395000.0,3,2.0,1890,14040,2.0,0,0,1994,0,98019,47.7277,-121.962,Never Renovated,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,263000018,360000.0,3,2.0,1530,1131,3.0,0,0,2009,0,98103,47.6993,-122.346,Never Renovated,0
8288,6600060120,400000.0,4,2.0,2310,5813,2.0,0,0,2014,0,98146,47.5107,-122.362,Never Renovated,0
8289,1523300141,402101.0,2,1.0,1020,1350,2.0,0,0,2009,0,98144,47.5944,-122.299,Never Renovated,0
8290,291310100,400000.0,3,2.0,1600,2388,2.0,0,0,2004,0,98027,47.5345,-122.069,Never Renovated,0


In [3]:
# One-hot encode the categorical columns (zipcode and renovation_category)
# into integer 0/1 indicator columns.
# NOTE: the next cell rebinds `dummies` from `housing_df`, so this encoded
# frame is only displayed here and is not what the model is trained on.
dummies = pd.get_dummies(
    data=housing_df,
    columns=['zipcode', 'renovation_category'],
    dtype=int,
)
dummies.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,...,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000
0,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,1987,...,0,0,0,0,0,0,1,0,0,0
1,7237550310,1225000.0,4,4.0,5420,101930,1.0,0,0,2001,...,0,0,0,0,0,0,1,0,0,0
2,1321400060,257500.0,3,2.0,1715,6819,2.0,0,0,1995,...,0,0,0,0,0,0,1,0,0,0
3,3793500160,323000.0,3,2.0,1890,6560,2.0,0,0,2003,...,0,0,0,0,0,0,1,0,0,0
4,1875500060,395000.0,3,2.0,1890,14040,2.0,0,0,1994,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Drop the columns that are not kept for modelling: the listing id, lot size,
# waterfront flag and all renovation-related columns.
# NOTE: this starts again from the raw `housing_df`, so `dummies` no longer
# holds the one-hot encoded frame built in the previous cell.
dummies = housing_df.drop(
    labels=[
        'yr_renovated',
        'id',
        'sqft_lot',
        'waterfront',
        'renovation_category_numeric',
        'renovation_category',
    ],
    axis='columns',
)
dummies.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,yr_built,zipcode,lat,long
0,510000.0,3,2.0,1680,1.0,0,1987,98074,47.6168,-122.045
1,1225000.0,4,4.0,5420,1.0,0,2001,98053,47.6561,-122.005
2,257500.0,3,2.0,1715,2.0,0,1995,98003,47.3097,-122.327
3,323000.0,3,2.0,1890,2.0,0,2003,98038,47.3684,-122.031
4,395000.0,3,2.0,1890,2.0,0,1994,98019,47.7277,-121.962


In [5]:
#Create the model
# A random forest is stochastic; pin the seed so that the fitted model, and
# therefore every score and prediction printed below, is reproducible after
# Restart Kernel -> Run All.
model = RandomForestRegressor(random_state=42)

In [6]:
#Train the model
# Fit on every row except the last one; the final row is kept back so a later
# cell can use it as a single example to predict.
X = dummies[['bedrooms','bathrooms','sqft_living', 'floors', 'lat','long','yr_built']].iloc[:-1]
y = dummies['price'].iloc[:-1]
model.fit(X, y)


In [7]:
#Test the model
# NOTE: X and y are the same rows the model was fitted on, so both the
# predictions and the score below are in-sample (training) figures, not an
# estimate of performance on unseen data.
predictions = model.predict(X=X)
print('The model score is:', model.score(X=X, y=y))

The model score is: 0.9764604232431253


In [8]:
#Make the predictions
# Predict the price of the final row of `dummies` (the one row excluded from
# fitting) and compare it with the recorded price.

new_data = dummies[['bedrooms','bathrooms','sqft_living', 'floors', 'lat','long','yr_built']].iloc[-1:]
prediction = model.predict(new_data)

print('The model predicts the last row or day to be:', prediction)
print('Actual value is:', dummies['price'].iloc[-1])


The model predicts the last row or day to be: [355125.54]
Actual value is: 325000.0


In [9]:
# Calculate Mean Absolute Error (MAE) and Mean Squared Error (MSE)
# Both are computed between the training targets `y` and the model's
# predictions for those same rows.
mae = mean_absolute_error(y_true=y, y_pred=predictions)
mse = mean_squared_error(y_true=y, y_pred=predictions)

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)

Mean Absolute Error (MAE): 28277.614511001604
Mean Squared Error (MSE): 3829810132.8187976


In [10]:
# marginal error
# sqlite - trend of model accuracy over time

In [11]:
# Calculate Mean Absolute Percentage Error (MAPE)
# Use the scikit-learn metric imported in the first cell; it returns MAPE as a
# fraction (e.g. 0.04 for 4%). Do not define a function with the same name in
# this cell: shadowing the import makes the cell print different values when it
# is re-run and silently changes the metric's meaning for later cells.
mape_fraction = mean_absolute_percentage_error(y, predictions)
print(mape_fraction)

# Express the same error as a percentage; `mape` holds the percentage value.
mape = mape_fraction * 100
print('Mean Absolute Percentage Error (MAPE):', mape)

0.04198658513560681
Mean Absolute Percentage Error (MAPE): 4.198658513560681
