In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

In [2]:
# Load the Boston housing data. NOTE(review): scikit-learn emits a deprecation
# warning for load_boston (captured in this cell's output) and recommends
# alternative datasets such as fetch_california_housing.
boston_dataset = load_boston()

# Wrap the raw feature matrix in a labelled DataFrame.
data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# The model is trained on every column except INDUS and AGE.
features = data.drop(columns=['INDUS', 'AGE'])

# The regression target is the natural log of the price column.
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [3]:
# Positional indices into the columns of `features` (the frame left after
# INDUS and AGE are dropped).
# NOTE(review): presumably these line up with the CRIM, ZN, CHAS, RM and
# PTRATIO columns -- confirm against features.columns.
CRIME_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
RM_IDX = 4
PTRATIO_IDX = 8

# Template property: a single row holding the mean of every feature column.
# reshape(1, -1) derives the width from `features` instead of hard-coding it,
# and the previous uninitialised np.ndarray placeholder (which was overwritten
# immediately) is gone.
property_stats = features.mean().values.reshape(1, -1)
property_stats


array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [4]:
# Fit an ordinary least-squares model of the log price on the selected features.
regr = LinearRegression()
regr.fit(features, target)

# In-sample fit quality: the RMSE is computed on the same rows the model was
# trained on and is later used as the half-width unit for estimate ranges.
fitted_vals = regr.predict(features)
mse = mean_squared_error(y_true=target, y_pred=fitted_vals)
rmse = np.sqrt(mse)


In [5]:
def get_log_estimate(nr_rooms,
                     student_per_classroom,
                     next_to_river=False,
                     high_confidence=True):
    """Predict the log price of a property and a symmetric range around it.

    The prediction starts from the module-level ``property_stats`` template
    row, overrides the RM, PTRATIO and CHAS entries with the arguments, and
    feeds the result to the fitted model ``regr``.

    Parameters
    ----------
    nr_rooms : number
        Value written to the ``RM_IDX`` position.
    student_per_classroom : number
        Value written to the ``PTRATIO_IDX`` position.
    next_to_river : bool, default False
        When truthy the ``CHAS_IDX`` position is set to 1, otherwise 0.
    high_confidence : bool, default True
        When truthy the range is +/- 2 * ``rmse`` and is labelled 95;
        otherwise it is +/- 1 * ``rmse`` and is labelled 68.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)``.
    """
    # Work on a copy so repeated calls never alter the shared template row.
    stats = property_stats.copy()
    stats[0][RM_IDX] = nr_rooms
    stats[0][PTRATIO_IDX] = student_per_classroom
    stats[0][CHAS_IDX] = 1 if next_to_river else 0

    # predict() returns a 2-D result here; take the single scalar out of it.
    log_estimate = regr.predict(stats)[0][0]

    # The range is symmetric in both modes. The narrow mode previously
    # subtracted 2 * rmse on the lower side while adding only 1 * rmse above.
    if high_confidence:
        spread, interval = 2 * rmse, 95
    else:
        spread, interval = rmse, 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread
    return log_estimate, upper_bound, lower_bound, interval


In [6]:
# Example: 3 rooms, 20 students per classroom, next to the river,
# using the default (wide) range.
get_log_estimate(nr_rooms=3, student_per_classroom=20, next_to_river=True)



(2.7767581914803987, 3.1517824618746593, 2.401733921086138, 95)

In [10]:
def get_dollar_estimate(rm, ptratio, chas=False, large_range=True):
    """Print a rounded price estimate and its range for a property.

    The log-scale values from ``get_log_estimate`` are exponentiated,
    multiplied by 1000 and by a scale factor (``ZILLOW_MEDIAN_PRICE`` divided
    by the median of ``boston_dataset.target``), then rounded to the nearest
    thousand.

    Parameters
    ----------
    rm : number
        Passed to ``get_log_estimate`` as the number of rooms.
    ptratio : number
        Passed to ``get_log_estimate`` as the students-per-classroom value.
    chas : bool, default False
        Forwarded as ``next_to_river``.
    large_range : bool, default True
        Forwarded as ``high_confidence``.

    Returns
    -------
    None
        Results are printed. If ``rm`` or ``ptratio`` is below 1 a message is
        printed instead and nothing is estimated.
    """
    ZILLOW_MEDIAN_PRICE = 583.3
    SCALE_FACTOR = ZILLOW_MEDIAN_PRICE / np.median(boston_dataset.target)

    # Reject inputs below 1 before asking the model for anything.
    if rm < 1 or ptratio < 1:
        print("that is unrealistic. Try again")
        return None

    log_est, log_hi, log_lo, conf = get_log_estimate(
        rm,
        ptratio,
        next_to_river=chas,
        high_confidence=large_range,
    )

    # Undo the log transform, rescale, then round to the nearest thousand.
    rounded_est, rounded_hi, rounded_low = (
        np.around(np.e ** log_value * 1000 * SCALE_FACTOR, -3)
        for log_value in (log_est, log_hi, log_lo)
    )

    print(f'Estimated property value: {rounded_est}')
    print(f'valuation confidence range: {conf}')
    print(f'lower_end:{rounded_low}, higher_end:{rounded_hi}')
     
     

In [12]:
# NOTE(review): rm=0 is below the rm < 1 guard in get_dollar_estimate, so this
# call prints the "unrealistic" message; the estimate recorded as this cell's
# output appears to come from an earlier run -- re-run to refresh it.
get_dollar_estimate(rm=0, ptratio=30, chas=True)

Estimated property value: 232000.0
valuation confidence range: 95
lower_end:159000.0, higher_end:337000.0


