In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the housing CSV. index_col=0 turns the first CSV column into the row index
# instead of a feature column (the X_train.head() preview later in the notebook
# shows "Suburb" as the index name).
data = pd.read_csv("./melb_data.csv", index_col=0)

# Display (n_rows, n_columns) as the cell output.
data.shape

(13580, 20)

In [5]:
# Separate the prediction target ("Price") from the feature columns.
y = data["Price"]
X = data.drop(columns=["Price"])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Simplest missing-value strategy: discard every column that has a gap in the training split.
# The list is derived from the training split only and applied to both splits so they keep
# the same columns. drop() returns new frames here rather than mutating the objects handed
# back by train_test_split in place.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Categorical (object-dtype) columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
]

# Numeric columns (int64 / float64 only).
numerical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ["int64", "float64"]
]

# Keep only the selected columns; .copy() gives independent frames for the later encoding steps.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()


In [6]:
# Preview the first rows of the selected training features.
X_train.head()


Unnamed: 0_level_0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
St Kilda,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
Williamstown,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
Sunshine,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
Glenroy,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
Sunshine North,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [7]:
# Collect the names of the categorical (object-dtype) columns.
# `s` is a boolean mask indexed by column name: True where the dtype is object.
s = X_train.dtypes.eq("object")
object_cols = s.index[s].tolist()
print(object_cols)

['Type', 'Method', 'Regionname']


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features the fitted model predicts on, and the true targets those
        predictions are compared against.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    (lower is better).
    """
    # Fixed hyper-parameters and seed so scores are comparable across preprocessing approaches.
    model = RandomForestRegressor(n_estimators=100, random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    return mean_absolute_error(y_valid, predictions)

In [11]:
# Approach 1: discard the categorical (object-dtype) columns and model on what remains.

drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=["object"]) for frame in (X_train, X_valid)
)

drop_score = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print(f"Score from dropping: \t{drop_score}")

Score from dropping: 	174632.25689207987


In [12]:
# Approach 2: ordinal encoding — every category in a column is replaced by a numeric code.

from sklearn.preprocessing import OrdinalEncoder

# Work on copies so X_train / X_valid stay unmodified for the other approaches.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# The category -> code mapping is learned from the training split only.
# A validation category that never appeared in training is encoded as -1 instead of
# raising, in line with the tolerant handle_unknown setting the one-hot encoder uses
# elsewhere in this notebook.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

label_X_train[object_cols] = ordinal_encoder.fit_transform(label_X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(label_X_valid[object_cols])

print(f"Score from ordinal encoding:\t{score_dataset(label_X_train, label_X_valid, y_train, y_valid)}")

Score from ordinal encoding:	165256.28786135072


In [18]:
# Approach 3: one-hot encoding — one indicator column per category.

from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a validation category unseen in training produces all-zero
# indicators instead of an error. sparse_output=False: return a dense array so the
# result can be wrapped in a DataFrame.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the category vocabulary on the training split only, then reuse it for validation.
OH_train_values = OH_encoder.fit_transform(X_train[object_cols])
OH_valid_values = OH_encoder.transform(X_valid[object_cols])

# Label the indicator columns with the encoder's own feature names rather than
# positional integers, and restore the original row index at construction time so
# the frames line up with the numeric columns in the concat below.
OH_feature_names = OH_encoder.get_feature_names_out()
OH_cols_train = pd.DataFrame(OH_train_values, columns=OH_feature_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_valid_values, columns=OH_feature_names, index=X_valid.index)

# Remove the raw categorical columns; the indicator columns replace them.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Numeric features first, indicators after — same column order for both splits.
# All column labels are already strings, so no string cast of the labels is needed.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print(f"Score from OH approach:\t{score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)}")

Score from OH approach:	166500.44717161093
