In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the limit).
# The option name is written out in full: set_option pattern-matches keys, so
# an abbreviated key only works while it happens to match exactly one option.
pd.set_option("display.max_columns", None)

In [2]:
# Load the dataset from a CSV file; the path is relative, so the file is
# looked up in the current working directory.
home_data = pd.read_csv("melb_data.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
home_data.shape

(13580, 21)

In [4]:
# Prediction target: the "Price" column of the loaded frame.
y = home_data["Price"]

In [5]:
# Feature matrix: every column except the target.
X = home_data.drop(columns=["Price"])

In [6]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train_full, X_val_full, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [19]:
# Categorical features: text columns with fewer than 10 distinct values in the
# training split, which keeps the one-hot encoding small. Text columns with 10
# or more distinct values end up in neither list and are therefore dropped.
# The dtype test accepts the classic "object" dtype as well as pandas' string
# dtypes; comparing against "object" alone would silently select nothing when
# text columns are loaded with a string dtype.
categorical_cols = [cname for cname in X_train_full.columns
                    if X_train_full[cname].nunique() < 10 and
                    (pd.api.types.is_object_dtype(X_train_full[cname]) or
                     pd.api.types.is_string_dtype(X_train_full[cname]))]

# Numerical features: columns stored as 64-bit integers or floats.
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the selected columns; .copy() gives independent frames so later
# changes cannot write back into the full splits.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_val = X_val_full[my_cols].copy()

In [20]:
# Preview the training features after column selection.
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
1041,h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,2.0,366.0,156.0,1920.0,-37.9038,145.0001,10579.0
1989,h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,0.0,238.0,131.0,1900.0,-37.7539,144.9612,11204.0
10157,h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,1.0,439.0,,,-37.77047,144.97005,11918.0
1711,u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,2.0,0.0,100.0,1973.0,-37.8863,145.066,7822.0
11565,h,S,Western Metropolitan,4,11.0,3018.0,4.0,2.0,4.0,615.0,,,-37.87057,144.83623,5301.0


In [25]:
#pipelines 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: fill missing entries with the imputer's "constant" strategy.
# No fill_value is passed, so the imputer's default constant is used.
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown="ignore" keeps categories that were not seen during
# fitting from raising an error at prediction time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own preprocessing. Numeric columns come
# first, so they are also first in the transformed feature matrix.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])



In [26]:
# define model 
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed keeps results repeatable.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

In [27]:
#create and evaluate 
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the regressor into a single estimator, so the
# preprocessing steps are fitted on the training data passed to fit().
my_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

# Fit on the training split (preprocessing + model in one call).
my_pipeline.fit(X_train, y_train)

# Predict on the held-out validation split.
preds = my_pipeline.predict(X_val)

# Report the mean absolute error between actual and predicted values.
print("MAE : ", mean_absolute_error(y_val, preds))



MAE :  156312.91707447925
