In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV from the working directory into a DataFrame.
data_path = "melb_data.csv"
df = pd.read_csv(data_path)
# Bare last expression: the notebook displays the (rows, columns) tuple.
df.shape

(13580, 21)

In [3]:
# Target: the "Price" column.
y = df["Price"]

# Model inputs. The names (including the "Lattitude"/"Longtitude" spellings)
# must match the DataFrame's column labels exactly.
X = df[
    [
        "Type", "Method", "Distance", "Rooms", "Regionname", "Date",
        "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea",
        "YearBuilt", "Lattitude", "Longtitude", "Propertycount",
    ]
]

# Hold out a test set with the default split proportions. random_state is
# fixed so the same rows land in train/test on every run; without it the
# split (and every metric computed from it) changes on each execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Step 1: Define Preprocessing Steps

In [4]:
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import mean_absolute_error 

In [5]:
# Numerical features: fill missing values with a constant. No fill_value is
# given, so SimpleImputer's default for the "constant" strategy applies.
num_trans = SimpleImputer(strategy="constant")

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown="ignore" lets transform() accept
# categories that were not present when the encoder was fitted.
cat_trans = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Split the training columns by dtype: int64/float64 are treated as
# numerical, object as categorical. Columns of any other dtype end up in
# neither list.
num_cols = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype in ["int64", "float64"]
]
cat_cols = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype == "object"
]

# Bundle both branches so each transformer only sees its own columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_trans, num_cols),
        ("cat", cat_trans, cat_cols),
    ]
)

## Step 2: Define The Model 

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Random forest regressor with all hyperparameters left at their defaults
# except random_state, which is fixed to seed the estimator.
model = RandomForestRegressor(random_state=1)

## Step 3: Create and Evaluate Pipeline 

In [8]:
# Chain preprocessing and the model so both are fitted together on the
# training data and applied consistently at prediction time.
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# fit() returns the fitted pipeline, so the held-out predictions can be
# taken directly from it.
prediction = my_pipeline.fit(X_train, y_train).predict(X_test)

In [9]:
# Pair each test row's index label with its predicted value.
output = pd.DataFrame({"Id": X_test.index, "SalePrice": prediction})

# Write the predictions to disk. DataFrame.to_csv returns None when given a
# path, so its result must not be assigned back to `output` -- doing so would
# replace the DataFrame with None for any later cell that uses it.
output.to_csv("predicted_melb.csv", index=False)

In [10]:
from sklearn.metrics import mean_absolute_error 

In [11]:
# Mean absolute error of the pipeline's predictions on the held-out test set.
mae = mean_absolute_error(y_test, prediction)
print('MAE:', mae)

MAE: 161319.8357820324
