In [43]:
import pandas as pd #for data anlysis
import numpy as np #for linear algebra and scientific approach
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
#for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
# %matplotlib inline

In [44]:
df = pd.read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv")
full_df = pd.read_csv("Melbourne_housing_FULL.csv")

In [45]:
df = pd.concat([df, pd.get_dummies(df["Type"]), pd.get_dummies(df["Method"]),
pd.get_dummies(df["Regionname"])], axis=1)
df = df.drop(["Suburb", "Address", "SellerG", "CouncilArea", "Type", "Method", "Regionname"], 1)
df['Date'] = [pd.Timestamp(x).timestamp() for x in df["Date"]]
df = df.dropna()
df.head()

Unnamed: 0,Rooms,Price,Date,Postcode,Propertycount,Distance,h,t,u,PI,...,VB,W,Eastern Metropolitan,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,3,1490000.0,1483488000.0,3067,4019,3.0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,3,1220000.0,1483488000.0,3067,4019,3.0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,1420000.0,1483488000.0,3067,4019,3.0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,3,1515000.0,1483488000.0,3040,1543,7.5,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2,670000.0,1483488000.0,3042,3464,10.4,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [46]:
X = df.drop("Price", 1)
Y = df["Price"]

In [47]:
features = df.drop("Price", axis=1)

log_prices = np.log(df["Price"])
target = pd.DataFrame(log_prices, columns=['Price'])

In [48]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48433 entries, 0 to 63020
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Rooms                       48433 non-null  int64  
 1   Date                        48433 non-null  float64
 2   Postcode                    48433 non-null  int64  
 3   Propertycount               48433 non-null  int64  
 4   Distance                    48433 non-null  float64
 5   h                           48433 non-null  uint8  
 6   t                           48433 non-null  uint8  
 7   u                           48433 non-null  uint8  
 8   PI                          48433 non-null  uint8  
 9   PN                          48433 non-null  uint8  
 10  S                           48433 non-null  uint8  
 11  SA                          48433 non-null  uint8  
 12  SN                          48433 non-null  uint8  
 13  SP                          484

In [49]:
RoomsId=0
hId=6
tId=7
uId=8
property_stats = features.mean().values.reshape(1, 25)

In [50]:
regr = LinearRegression().fit(features, target)
fitted_vals = regr.predict(features)

# Challenge: calculate the MSE and RMSE using sklearn
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)

In [51]:
def get_log_estimate(nr_rooms=3,
                    house=False,
                     unit=False,
                     tower=False,high_confidence=True):
    
    # Configure property
    property_stats[0][RoomsId] = nr_rooms
    
    
    if house:
        property_stats[0][hId] = 1
        property_stats[0][uId] = 0
        property_stats[0][tId] = 0
    elif unit:
        property_stats[0][hId] = 0
        property_stats[0][uId] = 1
        property_stats[0][tId] = 0
    elif tower:
        property_stats[0][hId] = 0
        property_stats[0][uId] = 0
        property_stats[0][tId] = 1
    else:
        property_stats[0][hId] = 0
        property_stats[0][uId] = 0
        property_stats[0][tId] = 1
    
    # Make prediction
    log_estimate = regr.predict(property_stats)[0][0]
    
    # Calc Range 
    if high_confidence:
        upper_bound = log_estimate + 2*RMSE
        lower_bound = log_estimate - 2*RMSE
        interval = 95
    else:
        upper_bound = log_estimate + RMSE
        lower_bound = log_estimate - RMSE
        interval = 68
    
    return log_estimate, upper_bound, lower_bound, interval

In [52]:
def get_dollar_estimate(rm=3,house=False,unit=False,tower=False, large_range=True):
    
    
    
    

    log_est, upper, lower, conf = get_log_estimate(rm,house,unit,tower, 
                                                   high_confidence=large_range)

    # Convert to today's dollars
    dollar_est = np.e**log_est
    dollar_hi = np.e**upper
    dollar_low = np.e**lower

    # Round the dollar values to nearest thousand
    rounded_est = np.around(dollar_est, -3)
    rounded_hi = np.around(dollar_hi, -3)
    rounded_low = np.around(dollar_low, -3)

    print(f'The estimated property value is {rounded_est}.')
    print(f'At {conf}% confidence the valuation range is')
    print(f'USD {rounded_low} at the lower end to USD {rounded_hi} at the high end.')

In [53]:
"""
To get the estimate price of house in Melbourne, in dollars
Specify the following:
rm- number of rooms
house=Ture, for Type of property as villa
unit=Ture, for Type of property as unit, duplex;
Tower=Ture, for Type of property as townhouse

"""
get_dollar_estimate(rm=4, unit=True)

The estimated property value is 1089000.0.
At 95% confidence the valuation range is
USD 601000.0 at the lower end to USD 1972000.0 at the high end.
