In [99]:
from sklearn.datasets import fetch_california_housing
import requests
from collections import namedtuple
from deployment_example.app.models.payload import HousePredictionPayload

import altair as alt
import pandas as pd

# Load data 

In [108]:
data = fetch_california_housing(as_frame=False)

In [136]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [126]:
df = pd.DataFrame(data.data, columns=HousePredictionPayload.schema()["required"])
target = pd.Series(data.target, name="median_house_value")

In [127]:
df.head()

Unnamed: 0,median_income_in_block,median_house_age_in_block,average_rooms,average_bedrooms,population_per_block,average_house_occupancy,block_latitude,block_longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [134]:
alt.Chart(df.assign(median_house_value=target).sample(1000)).mark_circle(size=60).encode(
    x='median_income_in_block',
    y='median_house_value',
    color='average_rooms',
)

### Query model

In [173]:
url = 'http://localhost:8000/api/model/predict'
def predict(request_body):
    return requests.post(
        url, 
        json=request_body,
        headers={'Content-Type': 'application/json',
                 "token": "sample_api_key"}
    )

# Test
response = predict(df.iloc[0].to_dict())
print(f"Status code: {response.status_code}")
print(response.json())

Status code: 200
{'median_house_value': 4.2240132718208585, 'currency': 'USD'}


Test multiple samples

In [169]:
df_sample = df.sample(100)
predictions = list()
for idx, sample in df_sample.iterrows():
    predictions.append(predict(sample.to_dict()).json()['median_house_value'])
    
result = (target.loc[df_sample.index].to_frame()
          .assign(prediction=predictions,
                  error=lambda x: x["median_house_value"] - x["prediction"]))

In [170]:
result.head()

Unnamed: 0,median_house_value,prediction,error
4151,1.758,2.347683,-0.589683
10947,1.833,1.492022,0.340978
12737,1.752,1.504842,0.247158
358,2.791,2.565331,0.225669
17159,5.00001,4.354074,0.645936


In [171]:
alt.Chart(result).mark_circle(size=60).encode(
    x='median_house_value',
    y='prediction'
)

In [172]:
alt.Chart(result).mark_bar().encode(
    alt.X("error:Q", bin=True),
    y='count()',
)