In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv("melb_data.csv")
df.head

<bound method NDFrame.head of               Suburb           Address  Rooms Type      Price Method  \
0         Abbotsford      85 Turner St      2    h  1480000.0      S   
1         Abbotsford   25 Bloomburg St      2    h  1035000.0      S   
2         Abbotsford      5 Charles St      3    h  1465000.0     SP   
3         Abbotsford  40 Federation La      3    h   850000.0     PI   
4         Abbotsford       55a Park St      4    h  1600000.0     VB   
...              ...               ...    ...  ...        ...    ...   
13575  Wheelers Hill      12 Strada Cr      4    h  1245000.0      S   
13576   Williamstown     77 Merrett Dr      3    h  1031000.0     SP   
13577   Williamstown       83 Power St      3    h  1170000.0      S   
13578   Williamstown      96 Verdon St      4    h  2500000.0     PI   
13579     Yarraville        6 Agnes St      4    h  1285000.0     SP   

        SellerG        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  \
0        Biggin   3/12/2

In [3]:
y = df["Price"]
X = df[["Type", "Method", "Distance", "Rooms", "Regionname", "Date", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea", "YearBuilt","Lattitude", "Longtitude","Propertycount"]]

X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2)

## STEP 1: Define Preprocessing Steps 

In [5]:
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 

In [6]:
# impute missing value for numerical data 
num_trans = SimpleImputer(strategy="constant")

# preprocessing for categorical data (imput missing values and use OneHotEncoding)
cat_trans = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")),
                           ("onehot", OneHotEncoder(handle_unknown="ignore"))])


num_cols = [col for col in X_train.columns if X_train[col].dtype in ["int64", "float64"] ]
cat_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Bundle pre_processing (i.e for numerical and categorical data)
preprocessor = ColumnTransformer(transformers=[("num", num_trans, num_cols), ("cat", cat_trans, cat_cols)])

## Step 2: Define The Model 

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
model = RandomForestRegressor(random_state=1)

## Step 3: Create and Evaluate Pipeline 

In [9]:
my_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

my_pipeline.fit(X_train, y_train)
prediction = my_pipeline.predict(X_test)

In [14]:
output = pd.DataFrame({"Id" : X_test.index, "SalePrice": prediction})
output = output.to_csv("predicted_melb.csv", index=False)