In [5]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [6]:
# Load the Boston housing data and assemble the modelling inputs.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn to run -- confirm the pinned version.
boston_dataset = load_boston()
data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# INDUS and AGE are left out of the feature set used for the regression.
features = data.drop(columns=['INDUS', 'AGE'])

# The regression target is the natural log of the raw target values.
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})

In [29]:
# Column positions inside `features` (i.e. after INDUS and AGE were dropped).
# They are looked up by column name instead of being hard-coded, so they stay
# correct if the set of dropped columns ever changes.
CRIME_IDX = features.columns.get_loc('CRIM')        # previously hard-coded as 0
ZN_IDX = features.columns.get_loc('ZN')             # previously hard-coded as 1
CHAS_IDX = features.columns.get_loc('CHAS')         # previously hard-coded as 2
RM_IDX = features.columns.get_loc('RM')             # previously hard-coded as 4
PTRATIO_IDX = features.columns.get_loc('PTRATIO')   # previously hard-coded as 8

# Template input row for predictions: every feature set to its column mean,
# shaped as a single sample. -1 lets numpy infer the number of features rather
# than assuming exactly 11 columns.
property_stats = features.mean().values.reshape(1 , -1)

In [30]:
# Fit a linear regression of log price on the selected features
regr = LinearRegression()
regr.fit(features , target)
fitted_vals = regr.predict(features)

# Challenge: Calculate the MSE and RMSE using sklearn
# Both are in-sample errors measured in log-price units.
MSE = mean_squared_error(y_true = target , y_pred = fitted_vals)
RMSE = np.sqrt(MSE)


In [72]:
def get_log_estimate(nr_rooms,
                    students_per_classroom,
                    next_to_river = False,
                    high_confidence = True):
    """Estimate the log price of a property with the fitted regression.

    The prediction starts from the module-level ``property_stats`` template
    row and overrides the RM, PTRATIO and CHAS entries with the arguments.

    Parameters
    ----------
    nr_rooms : number
        Value written to the RM column of the input row.
    students_per_classroom : number
        Value written to the PTRATIO column of the input row.
    next_to_river : bool, default False
        If true, CHAS is set to 1, otherwise to 0.
    high_confidence : bool, default True
        If true, the bounds are the estimate +/- 2 * RMSE and the interval is
        reported as 95; otherwise +/- 1 * RMSE and 68.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)`` -- note that
        the upper bound comes before the lower bound.
    """
    # Work on a copy: writing into the shared module-level array would
    # permanently overwrite the template row for every later user of it.
    stats = property_stats.copy()
    stats[0][RM_IDX] = nr_rooms
    stats[0][PTRATIO_IDX] = students_per_classroom
    stats[0][CHAS_IDX] = 1 if next_to_river else 0

    # predict() returns a 2-D result here; take the single value out of it.
    log_estimate = regr.predict(stats)[0][0]

    if high_confidence:
        spread = 2*RMSE
        interval = 95
    else:
        spread = 1*RMSE
        interval = 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread

    return log_estimate, upper_bound , lower_bound , interval

In [73]:
# Example: 3 rooms, 20 students per classroom, next to the river, default interval
get_log_estimate(nr_rooms=3, students_per_classroom=20, next_to_river=True)

(2.776758191480399, 3.1517824618746597, 2.4017339210861386, 95)

In [74]:
# Median of the raw (not log-transformed) target values; get_dollar_estimate
# divides today's median price by this same quantity to build its scale factor.
np.median(boston_dataset.target)

21.2

In [84]:
def get_dollar_estimate(rm , ptratio , chas = False , large_range = True):
    """Print a dollar valuation and range for a property.

    The log-price estimate and bounds from ``get_log_estimate`` are converted
    back from logs, multiplied by 1000, scaled by the ratio of
    ``today_median_price`` to the median of the dataset target, and rounded to
    the nearest thousand.

    Parameters
    ----------
    rm : number
        Number of rooms; must be at least 1.
    ptratio : number
        Students per classroom; must be at least 1.
    chas : bool, default False
        Whether the property is next to the river.
    large_range : bool, default True
        True for the wider (95) interval, False for the narrower (68) one.

    Returns
    -------
    None
        The result is printed, not returned. For invalid input a message is
        printed and nothing else happens.
    """
    if rm < 1 or ptratio < 1:
        print('That is unrealistic. Try again.')
        return

    # NOTE(review): presumably today's median price in thousands of USD, like
    # the dataset target -- confirm the source of this figure.
    today_median_price = 583.3
    scale_factor = today_median_price / np.median(boston_dataset.target)

    log_est , upper , lower , conf = get_log_estimate(rm,
                                                      ptratio,
                                                      next_to_river = chas,
                                                      high_confidence = large_range)

    # Undo the log transform, then convert to (scaled) dollars.
    dollar_est = np.exp(log_est) * 1000 * scale_factor
    high_est = np.around(np.exp(upper) * 1000 * scale_factor , -3)
    lowr_est = np.around(np.exp(lower) * 1000 * scale_factor , -3)

    rounded_est = np.around(dollar_est , -3)
    print(f'The estimated property value is {rounded_est}.')
    # The confidence sentence introduces the range, so it is printed first.
    print(f'At {conf}% confidence the valuation range is')
    print(f'USD {lowr_est} at the lower end to USD {high_est} at the high end.')

In [89]:
# Example: 2 rooms, 1 student per classroom, next to the river, default (wide) range
get_dollar_estimate(rm=2, ptratio=1, chas=True)

The estimated property value is 822000.0.
565000.0 USD at the lower end to USD 1196000.0 at the high end.
At 95% confidence the valuation range is
