### Imports

In [3]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split as tts
import matplotlib.pyplot as plt
import seaborn as sb
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

### Dataset 

In [4]:
# Raw Boston housing table hosted by CMU StatLib (fetched over the network on every run).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# skiprows=22 skips the lines ahead of the numeric table; fields are whitespace-separated.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Records are stitched from pairs of rows: every even row supplies the leading values,
# the first two entries of the following odd row complete the 13 predictors, and the
# odd row's third entry is the price.
raw_values = raw_df.values
boston_values = np.hstack([raw_values[::2, :], raw_values[1::2, :2]])
prices = raw_values[1::2, 2]

feature_names = [
    "CRIM",     # per capita crime rate by town
    "ZN",       # proportion of residential land zoned for lots over 25,000 sq.ft.
    "INDUS",    # proportion of non-retail business acres per town
    "CHAS",     # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    "NOX",      # nitric oxides concentration (parts per 10 million)
    "RM",       # average number of rooms per dwelling
    "AGE",      # proportion of owner-occupied units built prior to 1940
    "DIS",      # weighted distances to five Boston employment centres
    "RAD",      # index of accessibility to radial highways
    "TAX",      # full-value property-tax rate per $10,000
    "PTRATIO",  # pupil-teacher ratio by town
    "B",        # 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents by town
    "LSTAT"     # % lower status of the population
]

# Full table with the price appended; INDUS and AGE are left out of the model.
data = (
    pd.DataFrame(data=boston_values, columns=feature_names)
    .assign(Price=prices)
    .drop(columns=["INDUS", "AGE"])
)

# The regression is fitted on log(Price), kept as a single-column DataFrame.
log_prices = np.log(data.Price)
target = pd.DataFrame(data=log_prices, columns=["Price"])
features = data.drop(columns=["Price"])

# Placeholder for the single-row "typical property"; it is overwritten with the
# feature means in the next cell. Zero-filled (rather than np.ndarray(...), which
# leaves the memory uninitialised) and sized from the feature table instead of a
# hard-coded 11.
property_stats = np.zeros((1, features.shape[1]))

#### Baseline property statistics (feature means)


In [5]:
# Column positions of the two features a user can override, looked up by name so
# they stay correct if the set of dropped columns changes (currently RM -> 4, CHAS -> 2).
RM_IDX = features.columns.get_loc("RM")
CHAS_IDX = features.columns.get_loc("CHAS")

# Baseline "average" property: a single row holding the mean of every feature column.
# reshape(1, -1) keeps the row shape without hard-coding the number of features.
property_stats = features.mean().values.reshape(1, -1)
property_stats


array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

#### Fitting the model with scikit-learn


In [6]:
# Linear regression of log(Price) on the feature table. fit() returns the
# estimator itself, so construction and fitting are chained.
regr = LR().fit(features, target)

# In-sample predictions: the model is evaluated on the same rows it was fitted on.
fitted_vals = regr.predict(features)

#### Computing MSE & RMSE

In [7]:
# Error of the fitted values against the training target. The target is
# log(Price), so both figures are in log-price units.
MSE = mse(target, fitted_vals)
RMSE = np.sqrt(MSE)

for _label, _value in (("RMSE: ", RMSE), ("MSE: ", MSE)):
    print(_label, _value)

RMSE:  0.18751213519713034
MSE:  0.03516080084618688


### Estimator (main Function)

In [8]:
# Present-day median house price in Boston, apparently in thousands of dollars
# (the name suggests the figure came from Zillow; confirm the source and date).
villow_median_price = 583.3

# Median price at the time of the dataset, in the same units. TODO confirm it
# matches the median of the training prices.
training_median = 21.199

# Scaling factor that lifts a dataset-era price to today's price level.
sc = villow_median_price / training_median

def get_log_estimate(rooms_num, next_2_river=False, high_confidence=True):
    """
    Estimate the log price of a property in Boston.

    The property is described by the module-level baseline row ``property_stats``
    with the room count and the river flag replaced by the arguments. The baseline
    itself is left untouched.

    Keyword arguments:
    rooms_num -- number of rooms; must lie between 1 and 8 inclusive
    next_2_river -- True if the property is next to the river, False otherwise
    high_confidence -- True for a 95% prediction interval (+/- 2 RMSE),
                       False for a 68% interval (+/- 1 RMSE)

    Returns:
    A tuple ``(log_estimate, upper_bound, lower_bound, interval)`` where the first
    three values are log prices and ``interval`` is 95 or 68. Returns None, after
    printing a message, when ``rooms_num`` is outside the accepted range.
    """
    if rooms_num < 1 or rooms_num > 8:
        print("Please enter a more realistic number ")
        return None

    # Work on a copy: writing into the shared baseline array would leave it holding
    # the last query's values instead of the feature means.
    stats = property_stats.copy()
    stats[0, RM_IDX] = rooms_num
    stats[0, CHAS_IDX] = 1 if next_2_river else 0

    # The model was fitted on a DataFrame, so predict on one with the same columns.
    property_stats_df = pd.DataFrame(data=stats, columns=features.columns)
    log_estimate = regr.predict(property_stats_df).item()

    # Interval half-width: two RMSEs for the 95% band, one RMSE for the 68% band.
    if high_confidence:
        spread, interval = 2 * RMSE, 95
    else:
        spread, interval = RMSE, 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread

    return log_estimate, upper_bound, lower_bound, interval

## Guessr

In [19]:
###################################################################################

#  Estimate the price of a property in Boston 
    
# Keyword arguments:
# rooms_num -- Number of rooms
# next_2_river -- True if the property is next to the river, False otherwise
# high_confidence -- True for a 95% prediction interval, False for a 68% interval

###################################################################################

# Inputs for the estimate below; edit these and re-run the cell.
Num_of_rooms = 3        # number of rooms in the property
next_2_river = True     # whether the property is next to the river
high_confidence = True  # True -> 95% interval, False -> 68% interval

def get_actual_price():
    """
    Turn the log-price estimate for the module-level inputs into today's dollars.

    Reads ``Num_of_rooms``, ``next_2_river`` and ``high_confidence`` from the
    module, asks ``get_log_estimate`` for the log estimate and its bounds, then
    exponentiates each value, applies the scaling factor ``sc``, multiplies by
    1000 and rounds to the nearest thousand.

    Returns:
    ``(price, upper_range, lower_range, interval)``, or None when
    ``get_log_estimate`` rejects the input.
    """
    log_result = get_log_estimate(
        Num_of_rooms,
        next_2_river=next_2_river,
        high_confidence=high_confidence,
    )
    if log_result is None:
        return None

    log_price, log_upper, log_lower, confidence = log_result

    # Undo the log transform, scale to today's prices, then express in dollars.
    scaled_price = (np.e ** log_price) * sc
    point_estimate = np.around(scaled_price * 1000, -3)

    # Same conversion for the two bounds (upper first, then lower).
    high_end, low_end = (
        np.around(np.e ** bound * 1000 * sc, -3)
        for bound in (log_upper, log_lower)
    )

    return point_estimate, high_end, low_end, confidence

# Run the estimate for the inputs chosen above and report it.
result = get_actual_price()

# get_actual_price() yields either None (rejected input) or a 4-tuple.
if result is None:
    print("Could not estimate price due to invalid input.")
else:
    price, ur, lr, intv = result
    print(f"Estimated price: ${price:,.0f}")
    print(f"Estimated price range: ${lr:,.0f} - ${ur:,.0f} with confidence of {intv}%")

Estimated price: $468,000
Estimated price range: $322,000 - $682,000 with confidence of 95%
