In [2]:
# Using a pipeline produces cleaner, less bug-prone code.
# A pipeline bundles the preprocessing and modelling steps together.
# First, load the data and split it into training and test sets.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data from melb_data.csv (path is relative to the notebook's working directory).
data = pd.read_csv('melb_data.csv')

# Predictor columns fed to the model; Price is the prediction target.
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]
y = data.Price

# Hold out a test set using train_test_split's default proportions.
# random_state is fixed so the split -- and every number computed from it --
# is the same after Restart Kernel -> Run All.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

In [5]:
# We want a modeling process that uses an imputer to fill in missing values, followed by a RandomForestRegressor
# to make predictions. These can be bundled together with the make_pipeline function as shown below.

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# SimpleImputer is its replacement and likewise defaults to mean imputation.
# The steps run in the order given: missing values are filled first, then the
# forest is fitted / asked for predictions on the imputed features.
# random_state pins the forest's internal randomness so results repeat on re-run.
my_pipeline = make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=0))

In [7]:
# Now fit and predict using this pipeline as a fused whole.

In [8]:
# Train every pipeline step on the training split, then predict for the held-out rows.
# fit() hands back the fitted pipeline, so predict() can be chained directly onto it.
predictions = my_pipeline.fit(train_X, train_y).predict(test_X)

In [17]:
import numpy as np

In [19]:
# Show the pipeline's predictions for the test rows; a long array is printed
# in abbreviated form (first and last few values separated by "...").
print(predictions)

[ 969100. 1199100.  590050. ... 1466600.  987350.  419300.]
