In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("./melb_data.csv")

y = data.Price
X = data.drop(["Price"], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

low_cardinality_cols = [col for col in X_train_full.columns if X_train_full[col].nunique() < 10 and X_train_full[col].dtype == "objeect"]

numerical_columns = [col for col in X_train_full if X_train_full[col].dtype in ["int64","float64"]]

selected_cols = low_cardinality_cols + numerical_columns
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()

In [10]:
X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
1041,3,11.2,3186.0,3.0,1.0,2.0,366.0,156.0,1920.0,-37.9038,145.0001,10579.0
1989,3,7.8,3058.0,3.0,1.0,0.0,238.0,131.0,1900.0,-37.7539,144.9612,11204.0
10157,3,5.2,3056.0,3.0,1.0,1.0,439.0,,,-37.77047,144.97005,11918.0
1711,2,11.4,3163.0,2.0,1.0,2.0,0.0,100.0,1973.0,-37.8863,145.066,7822.0
11565,4,11.0,3018.0,4.0,2.0,4.0,615.0,,,-37.87057,144.83623,5301.0


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Preprocessing the numerical data

numerical_transformer = SimpleImputer(strategy="constant")

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_columns),
        ("cat", categorical_transformer, low_cardinality_cols)
    ]
)

In [16]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

In [17]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling

my_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

my_pipeline.fit(X_train, y_train)

predictions = my_pipeline.predict(X_valid)

score = mean_absolute_error(y_valid, predictions)
print(f"MAE:\t{score}")

MAE:	168051.93318798655
