In [60]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [61]:
#Gather Data
#NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 - confirm the
#environment pins an older release, otherwise this cell fails on a fresh kernel
boston_dataset=load_boston()
data=pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)

#INDUS and AGE are left out of the feature set used by the model
features=data.drop(columns=['INDUS', 'AGE'])
print(features.head())

#The model works on log prices; they are wrapped in a one-column DataFrame
#so that the target has the same container type as the features
log_prices=np.log(boston_dataset.target)
target=pd.DataFrame(log_prices, columns=['PRICE'])

      CRIM    ZN  CHAS    NOX     RM     DIS  RAD    TAX  PTRATIO       B  \
0  0.00632  18.0   0.0  0.538  6.575  4.0900  1.0  296.0     15.3  396.90   
1  0.02731   0.0   0.0  0.469  6.421  4.9671  2.0  242.0     17.8  396.90   
2  0.02729   0.0   0.0  0.469  7.185  4.9671  2.0  242.0     17.8  392.83   
3  0.03237   0.0   0.0  0.458  6.998  6.0622  3.0  222.0     18.7  394.63   
4  0.06905   0.0   0.0  0.458  7.147  6.0622  3.0  222.0     18.7  396.90   

   LSTAT  
0   4.98  
1   9.14  
2   4.03  
3   2.94  
4   5.33  


In [62]:
#Creating a row in order to predict
#np.zeros guarantees that every feature we do not set explicitly starts at 0;
#np.ndarray(shape=...) only allocates memory and leaves its contents uninitialised
property_stats=np.zeros(shape=(1,features.shape[1]))

#We create indices for ease; they are looked up by column name so they always
#match the actual column order of `features`
CRIME_IDX=features.columns.get_loc('CRIM')
ZN_IDX=features.columns.get_loc('ZN')
CHAS_IDX=features.columns.get_loc('CHAS')
RM_IDX=features.columns.get_loc('RM')
PTRATIO_IDX=features.columns.get_loc('PTRATIO')

#We create a datapoint; every feature that we do not specify ourselves keeps the value zero
property_stats[0][CRIME_IDX]=features['CRIM'].mean()
property_stats[0][ZN_IDX]=features['ZN'].mean()
property_stats[0][CHAS_IDX]=features['CHAS'].mean()

print(property_stats)
#Instead of specifying the value for each variable one by one, we can set them all at once.

[[ 3.61352356 11.36363636  0.06916996  0.          0.          0.
   0.          0.          0.          0.          0.        ]]


In [63]:
#Adding the mean for each variable to the property_stats datapoint
#features.mean() is a pandas Series, so we take its values as a NumPy array and reshape them
#into a single row; -1 lets NumPy infer the number of columns instead of hard-coding it
property_stats=features.mean().values.reshape(1,-1)
print(property_stats)

[[3.61352356e+00 1.13636364e+01 6.91699605e-02 5.54695059e-01
  6.28463439e+00 3.79504269e+00 9.54940711e+00 4.08237154e+02
  1.84555336e+01 3.56674032e+02 1.26530632e+01]]


In [64]:
#Fit a linear regression of the log price on the selected features
regr=LinearRegression()
regr.fit(features,target)
in_sample_pred=regr.predict(features)

#In-sample root mean squared error; computed once and reused below and by the estimate functions
RMSE=np.sqrt(mean_squared_error(target,in_sample_pred))

print('MSE:',mean_squared_error(target,in_sample_pred))
print('Root MSE',RMSE)

MSE: 0.03516080084618688
Root MSE 0.18751213519713034


In [71]:
def get_log_estimate(nr_rooms,students_per_classroom, next_river=False,  high_confidence=True):
    """Predict the log price of a property together with a prediction interval.

    The prediction starts from the module-level ``property_stats`` row and
    overrides the room count, the PTRATIO value and the CHAS flag on a private
    copy, so the shared row is not modified by calling this function.

    Parameters
    ----------
    nr_rooms : number
        Value written to the RM feature; must be at least 1.
    students_per_classroom : number
        Value written to the PTRATIO feature; must be at least 1.
    next_river : bool, default False
        CHAS is set to 1 when truthy and to 0 otherwise.
    high_confidence : bool, default True
        True gives bounds of +/- 2*RMSE labelled 95, False gives +/- RMSE labelled 68.

    Returns
    -------
    tuple or None
        ``(log_estimate, upper_bound, lower_bound, interval)`` where the first
        three are whatever ``regr.predict`` returns (plus/minus the spread) and
        ``interval`` is 95 or 68. If either numeric argument is below 1 a
        message is printed and None is returned.
    """
    if students_per_classroom<1 or nr_rooms<1:
        print('This is unrealistic')
        return

    # Work on a copy: writing into the global row would leave it in a state that
    # depends on the last call and no longer matches what earlier cells displayed.
    stats=property_stats.copy()
    stats[0][PTRATIO_IDX]=students_per_classroom
    stats[0][RM_IDX]=nr_rooms
    stats[0][CHAS_IDX]=1 if next_river else 0

    log_estimate=regr.predict(stats)

    # The half-width of the interval is expressed in multiples of the in-sample RMSE
    if high_confidence:
        spread=2*RMSE
        interval=95
    else:
        spread=RMSE
        interval=68

    upper_bound=log_estimate+spread
    lower_bound=log_estimate-spread
    return log_estimate, upper_bound,lower_bound,interval

In [72]:
#Example call: 2 rooms, 20 students per classroom, not next to the river, default 95 interval
get_log_estimate(2,20,next_river=False)

(array([[2.58093695]]), array([[2.95596122]]), array([[2.20591268]]), 95)

In [73]:
#Scale factor used to express model prices in today's terms (accounting for inflation)
#Present-day reference median price; presumably in thousands of dollars like the dataset
#target, since convertion() multiplies by 1000 - TODO confirm
Current_boston_median_price=583.3
#target holds log prices, so the median is exponentiated to get back to price units
prior_median_price=np.exp(np.median(target))
#Ratio between the present-day median and the dataset median
scale=Current_boston_median_price/prior_median_price


def convertion(x):
    """Turn a log price into a scaled amount rounded to the nearest thousand.

    The log value is exponentiated, multiplied by 1000 and by the module-level
    ``scale`` factor, then rounded to three places left of the decimal point.
    Works element-wise on arrays as well as on scalars.
    """
    unscaled=1000*np.exp(x)
    scaled=unscaled*scale
    return np.round(scaled, decimals=-3)



In [79]:
def get_dollar_estimate(nr_rooms,students_per_classroom, next_river=False,  high_confidence=True):
    """Estimate the price of a property in Boston, in today's dollars.

    Calls ``get_log_estimate`` with the same arguments and passes the log
    estimate and both bounds through ``convertion``.

    Parameters
    ----------
    nr_rooms : number
        Number of rooms in the property.
    students_per_classroom : number
        Students per classroom for the area.
    next_river : bool, default False
        Whether the property is next to the river.
    high_confidence : bool, default True
        Selects the wider (95) or narrower (68) interval.

    Returns
    -------
    tuple or None
        ``(estimate, upper_bound, lower_bound, interval)``. None is returned
        when ``get_log_estimate`` rejects the inputs and returns None.
    """
    log_result=get_log_estimate(nr_rooms,students_per_classroom,next_river,high_confidence)

    # get_log_estimate returns None for unrealistic inputs; unpacking it directly
    # would raise an unrelated TypeError, so pass the "no estimate" result on.
    if log_result is None:
        return None

    log_mean, log_upper, log_lower, interval = log_result

    # Each converted value is indexed with [0][0] to pull the single number out of the array
    mean_in_dollars=convertion(log_mean)[0][0]
    upper_in_dollars=convertion(log_upper)[0][0]
    lower_in_dollars=convertion(log_lower)[0][0]
    return mean_in_dollars, upper_in_dollars, lower_in_dollars, interval

In [80]:
#Example: 2 rooms, 30 students per classroom, next to the river, default interval
mean, upper, lower, inter = get_dollar_estimate(2,30,True)

print('Estimated Property Value:',mean,'$')
print('Upper bound:',upper,'$')
print('Lower bound:',lower,'$')
print('CI:',inter,'%')

Estimared Property Value: 278000.0 $
Upper bound: 404000.0 $
Lower bound 191000.0 $
CI: 95 %


In [70]:
#Display the scale factor (Current_boston_median_price / prior_median_price)
scale

27.51415094339622

(708000.0, 1030000.0, 486000.0, 95)