# House Price Prediction
In this project, I'll use the California House dataset available in sklearn and use Jupyter Kernel Gateway to expose its cells as Endpoints.

## Import libraries

In [292]:
import json
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

## Dataset import
We get the features inside .data and labels inside .target. We split it into test and train data using trsin_test_split with test size of 33%.

In [293]:
fetched_data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(fetched_data.data, fetched_data.target, test_size = 0.33)


Now we will get its desription using .DESC and get the complete information on the same.

In [294]:
print(fetched_data.DESCR)

California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/datasets/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.




## Data Analysis
Here, we will analyse the dataset and creata a GET endpoint to fetch the basic stats.
We first concatenate the features and labels and then combine them as columns with specific column names.

In [295]:
dataset = pd.concat([pd.DataFrame(fetched_data.data, columns = fetched_data.feature_names), 
                     pd.DataFrame(fetched_data.target*100000, columns = ['Price'])], axis = 1)

Let's analyse the dataset.

In [296]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
MedInc        20640 non-null float64
HouseAge      20640 non-null float64
AveRooms      20640 non-null float64
AveBedrms     20640 non-null float64
Population    20640 non-null float64
AveOccup      20640 non-null float64
Latitude      20640 non-null float64
Longitude     20640 non-null float64
Price         20640 non-null float64
dtypes: float64(9)
memory usage: 1.4 MB


We see that we have a total of 20640 houses. There are total of 8 features and 1 label column. There are no null values.

In [297]:
dataset.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
Price,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


We see that the price is mainly dependant on Median Income with a correlation of approximately ~0.7.

### GET Endpoint
This endpoint will extract important information about our dataset and return the same when the endpoint is called.

In [298]:
# GET /housing_stats
total_houses = len(dataset)
max_value = dataset['Price'].describe()['max']
min_value = dataset['Price'].describe()['min']
print(json.dumps({
    'total_houses': total_houses,
    'max_value': max_value,
    'min_value': min_value,
    'most_imp_feature': 'Median Income'
}))

{"max_value": 500000.99999999994, "min_value": 14999.000000000002, "most_imp_feature": "Median Income", "total_houses": 20640}


## Scatter Plot

In [299]:
import plotly
print (plotly.__version__)
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot 
init_notebook_mode() 
#iplot([go.Scatter(x=[1, 2, 3], y=[3, 2, 6])])
# We can also download an image of the plot by setting the image to theformat you want. e.g. `image='png'`
#iplot([{'x': [1, 2, 3], 'y': [5, 2, 7]}], image='png')
# Create a plotly graph locally as an HTML document or string
#plot([go.Scatter(x=[1, 2, 3], y=[3, 2, 6])], filename='my-graph.html')


1.12.5


In [356]:
x=dataset.MedInc
y=dataset.Price
z=dataset.AveBedrms

# for categorical data in z
groups=sorted(z.unique())
data=[]
for g in groups:
    group=dataset[z==g]
    i=1
    data.append(
        go.Scattergl(
            x=group[x.name],
            y=group[y.name],
            mode='markers',
            text=['{}: {}<br>{}: {}<br>{}: {}'.format(x.name,group.at[i,x.name],y.name, group.at[i,y.name],z.name, group.at[i,z.name]) for i in group.index],
            marker=dict(
                opacity=0.75,
            ),
            name=z.name+': '+str(g)
        )
    )

#for numerical data in z
data=[]
group = dataset
dataPoints = go.Scattergl(
    x=x,
    y=y,
    mode='markers',
    text=['{}: {}<br>{}: {}<br>{}: {}'.format(x.name,group.at[i,x.name],y.name, group.at[i,y.name],z.name, group.at[i,z.name]) for i in group.index],
    marker=dict(
        opacity=0.75,
        color=z,
        showscale=True,
        colorscale='Plasma',
        #colorbar=dict(title=z.name),
    ),
    name='Data points'
)
data.append(dataPoints)

# lin regression
m,b = np.polyfit(x, y, 1)
bestfit_y = (x * m + b)

lineOfBestFit=go.Scattergl(
    x=x,
    y=bestfit_y,
    name='Line of best fit',
    line=dict(
        color='red',
    )
)
data.append(lineOfBestFit)

figure = go.Figure()
figure.data=data

#figure.layout.xaxis=dict(type="log")
#figure.layout.yaxis=dict(type="log")

iplot(figure)

In [349]:
# geoscatter
figure = go.Figure()

#data
data=figure.data
c=dataset.Price
geo_data=go.Scattergeo(
    lat = dataset.Latitude,
    lon = dataset.Longitude,
    text = c.round().tolist(),
    marker = dict(
        color = c,
        colorscale = 'Plasma',
        opacity = 0.7,
        size = 5,
        colorbar = dict(
            titleside = "right",
            title = dict(text=c.name),
        ) 
    )
)
data.append(geo_data)

#layout
figure.layout.geo.resolution = 50
figure.layout.geo.scope = 'usa'
figure.layout.geo.showland = True
figure.layout.geo.showocean = True
figure.layout.geo.showlakes=True
figure.layout.geo.showrivers=True
figure.layout.geo.showcountries=True
figure.layout.geo.showsubunits = True
figure.layout.geo.lataxis= dict(
            range = [dataset.Latitude.min(),dataset.Latitude.max()],
            showgrid = True,
            dtick = 20
        )
figure.layout.geo.lonaxis= dict(
            range = [dataset.Longitude.min(),dataset.Longitude.max()],
            showgrid = True,
            dtick = 20
        )
figure.layout.geo.fitbounds="locations"

iplot(figure)

## Machine Learning
Let's now directly train our dataset on the train data and analyse our Mean Absolute Error on the test data.

In [302]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

clf = RandomForestRegressor(n_estimators = 100, max_depth = 50)
clf.fit(X_train, y_train)
print("Mean Absolute Error: {}".format(mean_squared_error(y_test, clf.predict(X_test))))

Mean Absolute Error: 0.269859849956


### POST Endpoint
Here, I'll train the model on the complete dataset and then simply use the post endpoint to get the price.

In [303]:
endpoint_classifier = RandomForestRegressor(n_estimators = 100, max_depth = 50)
endpoint_classifier.fit(fetched_data.data, fetched_data.target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Let's define a random REQUEST object for our POST Endpoint with the mean values from our dataset.

In [304]:
features = pd.DataFrame(fetched_data.data)
mean_values = features.describe().iloc[1, :]

REQUEST = json.dumps({
    'body': {
        'MedInc': mean_values[0],
        'HouseAge': mean_values[1],
        'AveRooms': mean_values[2],
        'AveBedrms': mean_values[3],
        'Population': mean_values[4],
        'AveOccup': mean_values[5],
        'Latitude': mean_values[6],
        'Longitude': mean_values[7]
    }
})


The endpoint will accept all the values from the user and return the predicted price. The data received is in the body part of the request.

In [305]:
# POST /get_price
req = json.loads(REQUEST)
req = np.array(list(req['body'].values()))
predicted_price = endpoint_classifier.predict(req.reshape(1, -1))[0]
predicted_price = "{0:.2f}".format(predicted_price*100000)
print(json.dumps({
    'result': 'The price of the house with your specifications should be approximately: $' + predicted_price
}))

{"result": "The price of the house with your specifications should be approximately: $146081.03"}


In [357]:
iplot(figure)

f = go.FigureWidget([go.Scatter(x=x, y=y, mode='markers')])

scatter = f.data[0]
colors = ['#a3a7e4'] * 100
scatter.marker.color = colors
scatter.marker.size = [10] * 100
f.layout.hovermode = 'closest'


# create our callback function
def update_point(trace, points, selector):
    c = list(scatter.marker.color)
    s = list(scatter.marker.size)
    for i in points.point_inds:
        c[i] = '#bae2be'
        s[i] = 20
        with f.batch_update():
            scatter.marker.color = c
            scatter.marker.size = s


scatter.on_click(update_point)
