Topic: House Price Prediction using Machine Learning

Description: This project aims to build a model of house prices that predicts median house values in the United States using the provided dataset. The model should learn from the data and be able to predict the median house price in any city given all the other metrics. Predicting house prices can help determine the selling price of a house in a particular city and can help people find the right time to buy a home.

Name: Oluwaseun Ojo

Student Number: 202194682

Tools: Scikit-learn, Matplotlib, Pandas, Seaborn, Numpy, Scipy




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw housing records from the CSV file next to the notebook.
csv_path = './data.csv'
data = pd.read_csv(csv_path)

# Wrap the loaded table in a DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Exploratory Data Analysis & Data Visualization

In [None]:
import seaborn as sns
from scipy import stats
from matplotlib import style
from matplotlib.gridspec import GridSpec

from outliers import outliers

# Overview of the raw table: column dtypes / non-null counts, then one
# histogram per numeric column.
data.info()
# DataFrame.hist draws a grid on every panel by default, so no extra
# plt.grid() call is made here: a bare plt.grid() only toggles the grid of the
# current axes and would leave the panels inconsistent.
data.hist(bins=50, figsize=(15, 10))

#PRICE VISUALIZATION
# remove_outliers comes from the project-local `outliers` module; the exact
# rule it applies is not visible from this notebook.
data = outliers.remove_outliers(data, 'price') #remove outliers from price

# Two stacked panels: distribution of price on top, normal Q-Q plot below.
fig = plt.figure(figsize=(15, 10))
grid = GridSpec(ncols=1, nrows=2, figure=fig)

# Price Histogram (with a kernel density estimate overlaid)
ax1 = fig.add_subplot(grid[0, :])
ax1.set_title('Price Histogram')
sns.histplot(data['price'], ax=ax1, kde=True)

# Price QQ plot
ax2 = fig.add_subplot(grid[1, :])
stats.probplot(data['price'], plot=ax2)

In [None]:
#BEDROOM VISUALIZATION
# sns.catplot is a figure-level function: it builds its own figure, so calling
# plt.figure() first would only render an additional empty figure. There is
# also no hue/label in this plot, so no legend is requested (plt.legend() would
# just warn that there are no labelled artists).
bedrooms_grid = sns.catplot(x='bedrooms', y='price', data=data, height=5, aspect=2)
bedrooms_grid.ax.set_title('Price vs Number of Bedrooms')

# Keep only listings with fewer than 8 bedrooms; larger counts are treated as
# outliers for the rest of the notebook.
data = data[data.bedrooms < 8] #remove outliers in bedrooms

# histplot is axes-level, so it is given an explicit axes to draw on.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(x=data.bedrooms, ax=ax)
ax.set_title('Bedrooms Distribution')
ax.set_ylabel('Amount')

In [None]:
#BATHROOM VISUALIZATION

# Keep only listings with fewer than 8 bathrooms; larger counts are treated as
# outliers for the rest of the notebook.
data = data[data.bathrooms < 8] #remove outliers in bathrooms

# sns.catplot is a figure-level function and creates its own figure; a
# preceding plt.figure() would only render an empty extra figure.
bathrooms_grid = sns.catplot(x='bathrooms', y='price', data=data, height=5, aspect=2)
bathrooms_grid.ax.set_title('Price vs. Number of Bathrooms')

# countplot is axes-level, so it is given an explicit axes to draw on.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x=data['bathrooms'], ax=ax)
ax.set_title('Bathrooms Distribution')

In [None]:
#FLOOR VISUALIZATION
# Collapse half-floor values into three whole-floor buckets.
# NOTE(review): 1.5 is bucketed down to 1 while 2.5 is bucketed up to 3 --
# confirm this asymmetry is intended.
# Any floors value that is not a key of this mapping becomes NaN after .map().
mask_floors = {1: 1, 1.5:1, 2:2, 2.5:3, 3:3, 3.5:3}

# `data` may be a filtered slice from an earlier cell; assign() returns a new
# frame instead of writing into that slice, which avoids pandas'
# SettingWithCopyWarning.
data = data.assign(floors=data.floors.map(mask_floors))

# sns.catplot is a figure-level function and creates its own figure; a
# preceding plt.figure() would only render an empty extra figure.
floors_grid = sns.catplot(x='floors', y='price', data=data, height=5, aspect=2)
floors_grid.ax.set_title('Price vs. Number of Floors')

In [None]:
#VIEWS AND WATERFRONT VISUALIZATION
# sns.catplot is a figure-level function: each call creates its own figure, so
# no plt.figure() is issued beforehand (it would only render empty figures).

# Price distribution per `view` value, drawn as a boxen (letter-value) plot.
view_grid = sns.catplot(x='view', y='price', data=data, kind='boxen', height=5, aspect=2)
view_grid.ax.set_title('Price vs. Number of Views')

# Price distribution per `waterfront` value, drawn as a box plot.
waterfront_grid = sns.catplot(x='waterfront', y='price', data=data, kind='box', height=5, aspect=2)
waterfront_grid.ax.set_title('Price vs. Number of Waterfronts')

In [None]:
#YEAR BUILT AND YEAR RENOVATED VISUALIZATION
# histplot is axes-level, so it is given an explicit axes to draw on.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data.yr_built, kde=True, ax=ax)
ax.set_title('Year Built Distribution')

# Turn yr_renovated into a 0/1 flag: 0 stays 0, anything else becomes 1.
# The comparison is vectorised (no per-row lambda) and assign() returns a new
# frame rather than writing into a possibly filtered slice, which avoids
# pandas' SettingWithCopyWarning. Re-running the cell leaves the flag unchanged.
data = data.assign(yr_renovated=(data.yr_renovated != 0).astype(int))

# catplot is figure-level and creates its own figure; bars show the mean price
# for each value of the renovated flag.
renovated_grid = sns.catplot(x='yr_renovated', y='price', data=data, kind='bar', height=5, aspect=2)
renovated_grid.ax.set_title('Year Renovated Distribution')

In [None]:
#CITY VISUALIZATION
# Pie chart of how many listings fall in each of the ten most frequent cities.
fig, ax = plt.subplots(figsize=(12, 8))
top_cities = data.city.value_counts().head(10) #top 10 cities
top_cities.plot.pie(ax=ax)
ax.set_title('City Distribution')

Training our Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score

from geocoder import geocoder

# data['lat'] = df.apply(lambda x: geocoder.geocode(x.street)[0], axis=1)
# data['long'] = df.apply(lambda x: geocoder.geocode(x.street)[1], axis=1)

# new_data = data.copy()
# new_data.to_csv('data_updated.csv')




In [None]:
# Single seed shared by the train/test split and the tree-based models so that
# the reported metrics are reproducible across Restart & Run All.
RANDOM_STATE = 42


def _report_metrics(label, y_true, y_pred):
    """Print and return (MAE, MSE, R2) for one model's test-set predictions.

    label  -- model name inserted into the printed lines
    y_true -- ground-truth target values
    y_pred -- predictions for the same rows, in the same order
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f'MAE of {label} model:', mae)
    print(f'MSE of {label} model:', mse)
    print(f'R2 of {label} model:', r2)
    return mae, mse, r2


new_data = pd.read_csv('./data_updated.csv')

df = pd.DataFrame(new_data)

df.info()

# Features are every column except the target and the address text columns.
# NOTE(review): if data_updated.csv was written together with its index (as a
# plain to_csv('data_updated.csv') call would do), an 'Unnamed: 0' column ends
# up among the features -- check the df.info() output above.
X= new_data.drop(['price', 'street', 'city', 'country', 'statezip'], axis=1)
y = new_data['price']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#Transform the data using StandardScaler
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

#Train the model using LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg_pred = lin_reg.predict(X_test)

#Calculate mean squared error, mean absolute error, and r2 score
lin_reg_mae, lin_reg_mse, lin_reg_r2 = _report_metrics('linear regression', y_test, lin_reg_pred)

#Train the model using DecisionTreeRegressor
# Seeded: an unseeded tree can give different results on every run.
dec_tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
dec_tree.fit(X_train, y_train)
dec_tree_pred = dec_tree.predict(X_test)

#Calculate mean squared error, mean absolute error, and r2 score
dec_tree_mae, dec_tree_mse, dec_tree_r2 = _report_metrics('decision tree', y_test, dec_tree_pred)

#Train the model using RandomForestRegressor
# Seeded: bootstrap sampling makes an unseeded forest differ between runs.
ran_forest = RandomForestRegressor(random_state=RANDOM_STATE)
ran_forest.fit(X_train, y_train)
ran_forest_pred = ran_forest.predict(X_test)

#Calculate mean squared error, mean absolute error, and r2 score
ran_forest_mae, ran_forest_mse, ran_forest_r2 = _report_metrics('random forest', y_test, ran_forest_pred)



In [18]:
from prediction import prediction

# One hand-written example listing to score with the trained random forest.
test_data = {
    'bedrooms': 3,
    'bathrooms': 2,
    'sqft_living': 2000,
    'sqft_lot': 1000,
    'floors': 1,
    'waterfront': 1,
    'view': 0,
    'condition': 3,
    'grade': 10,
    'sqft_above': 1000,
    'sqft_basement': 0,
    'yr_built': 2000,
    'yr_renovated': 0,
    'address': '123 Main St California',
}

# prediction.transform comes from the project-local `prediction` module.
# NOTE(review): ran_forest was fitted on StandardScaler-transformed features --
# confirm that transform() applies the same scaling and column order.
example_features = prediction.transform(test_data)
predicted_value = ran_forest.predict(example_features)
print('Predicted value for house at address:', test_data['address'], 'is:', predicted_value)

Predicted value for house at address: 123 Main St California is: [540431.211207]


