In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset

* The dataset I used is https://www.kaggle.com/harlfoxem/housesalesprediction

In [None]:
# Read the King County house-sales CSV from the Kaggle input directory.
csv_path = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(csv_path)

# (rows, columns) of the raw frame
df.shape

In [None]:
# List the column labels of the raw frame.
df.columns

In [None]:
# True if at least one cell anywhere in the frame is missing.
has_nulls = df.isnull().any().any()
print('Null values present:', has_nulls)

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Preview the first rows of the raw frame.
df.head()

# Feature Description

1. id: Unique ID for each home sold
2. date: Date of the home sale
3. price: Price of each home sold
4. bedrooms: Number of bedrooms
5. bathrooms: Number of bathrooms, where .5 accounts for a room with a toilet but no shower
6. sqft_living: Square footage of the apartment's interior living space
7. sqft_lot: Square footage of the land space
8. floors: Number of floors
9. waterfront: A dummy variable for whether the apartment was overlooking the waterfront or not
10. view: An index from 0 to 4 of how good the view of the property was
11. condition: An index from 1 to 5 on the condition of the apartment
12. grade: An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
13. sqft_above: The square footage of the interior housing space that is above ground level
14. sqft_basement: The square footage of the interior housing space that is below ground level
15. yr_built: The year the house was initially built
16. yr_renovated: The year of the house’s last renovation
17. zipcode: What zipcode area the house is in
18. lat: Latitude
19. long: Longitude
20. sqft_living15: The square footage of interior housing living space for the nearest 15 neighbors
21. sqft_lot15: The square footage of the land lots of the nearest 15 neighbors

### Categorical Features

* Categorical: id, waterfront, zipcode

### Numerical Features

* Continuous: price, bathrooms, floors, lat, long
* Discrete:  date, bedrooms, sqft_living, sqft_lot, view, condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, sqft_living15, sqft_lot15

# Exploratory Data Analysis

### Let's first look at the correlation of the independent variables with the target variable

In [None]:
# Correlation heatmap of every numeric column against every other.
# Only numeric columns are passed to corr(): `date` is not parsed when the CSV
# is read, so it stays non-numeric, and pandas >= 2.0 raises instead of
# silently skipping such columns.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(17,10))
sns.heatmap(numeric_corr,annot=True,fmt='.2f',cmap='OrRd')
plt.show()

Let's see which features are most highly correlated with price (the target variable).

In [None]:
# Pairwise correlations over numeric columns only; non-numeric columns such as
# the unparsed `date` make a bare df.corr() raise on pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()

# Correlation of each feature with price, strongest positive first
# (price itself is dropped because its self-correlation is trivially 1).
corr['price'].drop(['price']).sort_values(ascending=False)

## Exploring categorical variables

* House price distribution based on waterfront

### House prices with and without waterfront

In [None]:
fig,ax = plt.subplots(ncols=2,figsize=(18,6))

# Left panel: price density for houses without / with a waterfront.
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(df[df['waterfront']==0]['price'],color='r',fill=True,label='Without waterfront',ax=ax[0])
sns.kdeplot(df[df['waterfront']==1]['price'],color='b',fill=True,label='Waterfront',ax=ax[0])
# The labels passed above are only drawn once a legend is requested.
ax[0].legend()

# Right panel: number of rows in each waterfront class.
sns.countplot(x='waterfront',data=df,ax=ax[1])
# Hide the y ticks of the count panel. The original used plt.yticks([]), which
# acts on the current axes (the last subplot created); the target is now explicit.
ax[1].set_yticks([])
plt.show()

#### Insight

* Houses with a waterfront tend to have higher average prices than those without one.
* Prices of houses with waterfront are more spread out.
* Outliers are present in both categories.
* Imbalanced distribution of data between houses with and without waterfront

## Exploring numerical variables

* House price based on number of bedrooms and bathrooms
* House price based on living space and land space (square ft.)
* House price based on the number of floors
* House price based on view and condition
* House price based on grade
* House price based on year built and renovated

### House price based on number of bedrooms and bathrooms

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(18,6))
bedrooms_ax, bathrooms_ax = ax

# Price spread per bedroom count (left) and mean price per bathroom count (right).
sns.boxplot(data=df, x='bedrooms', y='price', ax=bedrooms_ax)
sns.pointplot(data=df, x='bathrooms', y='price', estimator=np.mean, ax=bathrooms_ax)

# Tilt the tick labels of the bathrooms panel (the current axes) by 45 degrees.
for tick_label in bathrooms_ax.get_xticklabels():
    tick_label.set_rotation(45)
plt.show()

#### Insight

* Houses with higher number of bedrooms and bathrooms tend to have higher prices as well.

### House price based on living space and land space (square ft.)

In [None]:
sns.set()

fig, ax = plt.subplots(ncols=2, figsize=(18,6))

# One entry per panel: (feature column, panel title, x-axis label).
panels = [
    ('sqft_living', 'House price based on living space (square ft.)', 'Living space in square feet'),
    ('sqft_lot', 'House price based on lot space (square ft.)', 'Lot space in square feet'),
]

# Scatter each feature against price and label its panel.
for axis, (column, title, xlabel) in zip(ax, panels):
    axis.scatter(df[column], df['price'])
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Price')

plt.show()

#### Insight

* We can see that there is a roughly linear relationship between living space and house price.
* For lot space and price, we cannot see much of a relationship between the two variables.

### House price based on the number of floors

In [None]:
# Price distribution for each floor count, drawn on an explicitly created axes.
fig, floors_ax = plt.subplots(figsize=(10,7))

sns.boxplot(data=df, x='floors', y='price', ax=floors_ax)
floors_ax.set_title('House price based on the number of floors')
floors_ax.set_xlabel('Number of floors')
plt.show()

#### Insight

* We can see that the number of floors affects the house price.

### House price based on view and condition

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(18,6))

# One boxplot of price per rating column, left to right.
for axis, rating_column in zip(ax, ['view', 'condition']):
    sns.boxplot(data=df, x=rating_column, y='price', ax=axis)

plt.show()

#### Insight

* We can see that the higher the rate for view and condition, the higher the price.

### House price based on grade

In [None]:
# Price distribution for each construction grade, drawn on an explicitly created axes.
fig, grade_ax = plt.subplots(figsize=(10,7))
sns.boxplot(data=df, x='grade', y='price', ax=grade_ax)

grade_ax.set_title('House price based on grade')
plt.show()

#### Insight

* Just like the view and condition, higher grade gives higher price.

### House price based on year built and renovated

In [None]:
def five_year_mean_price(frame, year_column):
    """Return the mean price per value of `year_column`, averaged into 5-year bins.

    The yearly means are indexed by a datetime built from the year so that
    pandas can resample them.
    """
    yearly_mean = frame.groupby(year_column)['price'].mean()
    yearly_mean.index = pd.to_datetime(yearly_mean.index, format='%Y')
    # NOTE(review): the 'A' (year-end) alias is deprecated in recent pandas in
    # favour of 'YE'; it is kept here so older pandas versions still work.
    return yearly_mean.resample('5A').mean()


year_built_resampled = five_year_mean_price(df, 'yr_built')

# yr_renovated == 0 is excluded explicitly (the original dropped the first
# group by position, which assumes a 0 group always exists and sorts first).
renovated_only = df[df['yr_renovated'] > 0]
year_renovated_resampled = five_year_mean_price(renovated_only, 'yr_renovated')

fig,ax = plt.subplots(ncols=2,figsize=(18,6))
ax[0].plot(year_built_resampled,lw=3)
ax[0].set_title('Mean house price by year built (5-year bins)')
ax[1].plot(year_renovated_resampled,lw=3)
ax[1].set_title('Mean house price by year renovated (5-year bins)')

for axis in ax:
    axis.set_xlabel('Year')
    axis.set_ylabel('Price')
plt.show()

#### Insight

* Newer houses tend to have higher prices.
* Houses that were renovated in later years also tend to have higher prices.

# Feature Selection

* Dropping unnecessary features

### Dropping unnecessary features

Following are the features that we will drop from the dataframe.

* id - IDs are not needed for training.
* date - The dates in this particular dataset are limited to 2014 and 2015 and are unlikely to contribute to price.
* zipcode,sqft_living15,sqft_lot15 - these features are also not needed

In [None]:
# Columns excluded from modelling (see the list above).
dropped_columns = ['id','date','zipcode','sqft_living15','sqft_lot15']
df2 = df.drop(columns=dropped_columns)

df2.head()

In [None]:
# Only columns were dropped above; no outlier rows are removed in this
# notebook, so the message no longer claims they were.
print('Shape after removing features:',df2.shape)

# Model Building

* Importing necessary libraries
* Splitting the dataset into training and testing data
* Feature Scaling
* Creating ANN model
* Training the model
* Evaluating the model

### Importing necessary libraries

In [None]:
# Splitting and scaling the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Creating ANN model
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense

# Evaluating the model
from sklearn.metrics import r2_score

### Splitting the dataset into training and testing data

In [None]:
# Features are every remaining column except the target; the target is price.
X = df2.drop('price',axis='columns')
y = df2['price']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Messages previously misspelled "Length" as "Lenght".
print('Length of training data:',len(X_train))
print('Length of testing data:',len(X_test))

### Feature Scaling

* We will be using MinMaxScaler from sklearn library.
* This scales your data between 0 and 1.

In [None]:
scaler = MinMaxScaler()

# Fit on the training split only, then apply that same fitted scaler to both
# splits so the test data does not influence the scaling.
# NOTE: this cell overwrites X_train / X_test in place, so run it exactly once
# after the split cell; re-running it alone would rescale already-scaled data.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Creating ANN model

In [None]:
# Seed TensorFlow's global RNG so weight initialisation is repeatable across
# fresh runs (the data split is already seeded with random_state=0).
tf.random.set_seed(0)

# Take the input width from the data instead of hard-coding 15, so the model
# stays valid if the set of dropped features changes.
n_features = X_train.shape[1]

# Units of each ReLU hidden layer, in order.
hidden_units = [20, 30, 30, 20, 20, 30, 30, 20]

model = Sequential()

# input layer
model.add(Input(shape=(n_features,)))

# hidden layers
for units in hidden_units:
    model.add(Dense(units,activation='relu'))

# output layer: a single linear unit for the regression target
model.add(Dense(1,activation='linear'))

model.compile(optimizer='adam',loss='mse')

### Training the model

In [None]:
# Train for 400 epochs in mini-batches of 256.
# NOTE(review): the test split is also passed as validation data, so the R²
# reported later is measured on data that was monitored during training.
model.fit(
    x=X_train,
    y=y_train.to_numpy(),
    validation_data=(X_test, y_test.to_numpy()),
    batch_size=256,
    epochs=400,
)

### Evaluating the model

In [None]:
# Predict prices for the held-out rows and score them against the true prices.
y_predicted = model.predict(X_test)

test_r2 = r2_score(y_test, y_predicted)
print('r2 score:', test_r2)

In [None]:
fig,ax = plt.subplots(ncols=2,figsize=(18,6))

# Flatten both arrays to 1-D so they can be subtracted element-wise.
y_test_values = y_test.values
y_predicted_values = y_predicted.reshape(len(y_predicted))
errors = y_test_values - y_predicted_values

# Left panel: distribution of the residuals. histplot replaces the deprecated
# distplot; stat='density' with kde=True keeps the same density-scaled
# histogram-plus-KDE look.
sns.histplot(errors,kde=True,stat='density',ax=ax[0])
ax[0].set_title('Error histogram')
ax[0].set_xlabel('Error')

# Right panel: predictions against true prices, with the y = x reference line.
# The line is drawn from its two end points rather than through every
# (unsorted) test value, which gives the same line with far fewer segments.
ax[1].scatter(y_test_values,y_predicted_values)
fit_limits = [y_test_values.min(), y_test_values.max()]
ax[1].plot(fit_limits,fit_limits,'r')
ax[1].set_title('Model Prediction and Perfect fit')
ax[1].set_xlabel('Y test')
ax[1].set_ylabel('Model Prediction')
plt.show()

# Saving the model

* Save the model
* Load model and check r2 score

### Save the model

In [None]:
# Persist the trained network as an HDF5 file in the Kaggle working directory.
model.save(filepath='/kaggle/working/house_price_prediction_model.h5')

### Load model and check r2 score

In [None]:
# Load from the same absolute path the model was saved to. The previous
# relative path ('./...') only resolved when the working directory happened
# to be /kaggle/working.
loaded_model = tf.keras.models.load_model('/kaggle/working/house_price_prediction_model.h5')
loaded_model.summary()

In [None]:
# Re-score the reloaded model on the same test split to confirm the save/load round trip.
loaded_model_y_predicted = loaded_model.predict(X_test)

loaded_r2 = r2_score(y_test, loaded_model_y_predicted)
print('Loaded model r2 score:', loaded_r2)

This [notebook](https://www.kaggle.com/tomasmantero/predicting-house-prices-keras-ann) helped me in constructing this notebook.