In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", None)

from sklearn.model_selection import train_test_split


In [2]:
# load data
home_data = pd.read_csv('melb_data.csv')

In [3]:
home_data.shape

(13580, 21)

In [4]:
home_data.sample(n = 5, random_state = 5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
9599,Keilor East,68 Nyah St,3,h,1001000.0,S,Nelson,17/06/2017,11.7,3033.0,3.0,1.0,2.0,605.0,,,Moonee Valley,-37.73137,144.86882,Western Metropolitan,5629.0
7330,Oakleigh,1173 North Rd,3,h,1035000.0,S,Buxton,12/11/2016,14.0,3166.0,3.0,1.0,2.0,650.0,,,Monash,-37.91,145.0914,Southern Metropolitan,3224.0
2885,Glenroy,4 Lyons St,3,h,600000.0,S,Barry,8/10/2016,13.0,3046.0,3.0,1.0,2.0,702.0,,,Moreland,-37.6999,144.9387,Northern Metropolitan,8870.0
7677,Camberwell,20 Crellin Gr,2,h,1900000.0,S,Noel,13/05/2017,7.8,3124.0,2.0,1.0,3.0,633.0,134.3,1960.0,Boroondara,-37.8427,145.0824,Southern Metropolitan,8920.0
979,Box Hill,89 Thames St,4,h,2770000.0,S,Lindellas,12/11/2016,13.1,3128.0,4.0,1.0,2.0,715.0,,1950.0,Whitehorse,-37.8135,145.1218,Eastern Metropolitan,4605.0


In [5]:
home_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [6]:
# selecting target variable
y = home_data.Price

In [7]:
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
5         941000.0
6        1876000.0
7        1636000.0
8         300000.0
9        1097000.0
10        700000.0
11       1350000.0
12        750000.0
13       1172500.0
14        441000.0
15       1310000.0
16       1200000.0
17       1176500.0
18        955000.0
19        890000.0
20       1330000.0
21        900000.0
22       1090000.0
23        500000.0
24       1100000.0
25       1315000.0
26        426000.0
27       1447500.0
28        457000.0
29       1135000.0
           ...    
13550     595000.0
13551     682000.0
13552     640000.0
13553    2720000.0
13554    1715000.0
13555    3100000.0
13556     350000.0
13557    1753000.0
13558    1745000.0
13559    1190000.0
13560    1400000.0
13561    1288000.0
13562    1450000.0
13563    1271000.0
13564     540000.0
13565    1263500.0
13566    1250000.0
13567    1316000.0
13568     951000.0
13569    1323000.0
13570     970000.0
13571    133

In [11]:
# selecting features
melbourn_data = home_data.drop(["Price"], axis = 1)
X = melbourn_data.select_dtypes(exclude = "object")

In [12]:
# split the data into Training and validation data for both Features and target variable
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.80, test_size = 0.20, random_state = 1)

In [15]:
# Define a function to measuring quality for each approach
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_val, y_train, y_val):
    model = RandomForestRegressor(n_estimators = 10)
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    return (mean_absolute_error(y_val, val_preds))

In [16]:
# handling missing data:Approach ---- drop columns
# get columns name with missing values
cols_with_missing_val = [col for col in X_train.columns
                        if X_train[col].isnull().any()]

# Drop columns with missing data in training and validation
reduced_X_train = X_train.drop(cols_with_missing_val, axis = 1)
reduced_X_val = X_val.drop(cols_with_missing_val, axis = 1)

# score from approach 1 
print("MAE score from dropping missing columns: ", )
print(score_dataset(reduced_X_train, reduced_X_val, y_train, y_val))

MAE score from dropping missing columns: 
187342.39714887674


In [19]:
# handling missing data: Approach 2 ----Imputation
from sklearn.impute import SimpleImputer

#imputations
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_val = pd.DataFrame(my_imputer.transform(X_val))

#Imputaton removes columns name, put them back
imputed_X_train.columns = X_train.columns
imputed_X_val.columns = X_val.columns

#Score from approach 2
print("MAE score from dropping missing columns: ")
print(score_dataset(imputed_X_train, imputed_X_val, y_train, y_val))

MAE score from dropping missing columns: 
175239.33087874326
