In [None]:
# importing libraries
import pandas as pd
import numpy as np


In [None]:
# Read the Melbourne housing snapshot CSV into a DataFrame and preview its first rows
mel_data = pd.read_csv(filepath_or_buffer='../input/melbourne-housing-snapshot/melb_data.csv')
mel_data.head(n=5)

In [None]:
# Split the frame into the prediction target (Price) and the remaining predictor columns
y = mel_data['Price']
X = mel_data.drop(columns=['Price'])

In [None]:
# Preview the first rows of the predictor frame (Price has been removed)
X.head()

In [None]:
# Count the missing entries in each column of the full dataset
missing_values = mel_data.isna().sum(axis=0)
missing_values

In [None]:
# Total number of cells (rows * columns) in the dataset.
# np.prod is used because the np.product alias was deprecated in NumPy 1.25
# and removed in NumPy 2.0.
total_values_in_the_data = np.prod(mel_data.shape)
total_values_in_the_data

In [None]:
# Add up the entries of missing_values to get one overall missing-value count
total_missing_values = missing_values.sum()
total_missing_values

In [None]:
# Missing values as a percentage of all values in the dataset
total_percent = (total_missing_values / total_values_in_the_data) * 100
total_percent

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Drop every column that has at least one missing value in the training split.
# The list is derived from the training data only and applied to both splits,
# so both frames lose exactly the same columns.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]

# Reassign instead of dropping in place: the split frames are derived from X,
# and mutating them in place can trigger pandas' SettingWithCopyWarning.
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_test_full = X_test_full.drop(columns=cols_with_missing)

In [None]:
# Show which columns were identified as containing missing values
cols_with_missing

In [None]:
# Categorical (object-dtype) columns with fewer than 10 distinct values in the
# training split; the threshold is convenient but arbitrary
low_cardinality_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[cname].nunique() < 10
]

# Columns stored as 64-bit integers or floats
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns, copying so later edits
# do not touch the *_full frames
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

In [None]:
# Show the selected low-cardinality categorical columns
low_cardinality_cols

In [None]:
# Show the selected numerical columns
numerical_cols

In [None]:
# Preview the training frame after column selection
X_train.head()

In [None]:
# Preview the original, unfiltered dataset again for comparison with X_train
mel_data.head()

In [None]:
# Collect the names of the object-dtype (categorical) columns in the training frame
s = X_train.dtypes.eq('object')
object_cols = [name for name, is_object in s.items() if is_object]

print("Categorical Variables")
print(object_cols)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# function for comparing different approaches
def score_dataset(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and score it on the test split.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``predict``.
        y_train: Training target values.
        y_test: True target values for the test split.

    Returns:
        The mean absolute error between ``y_test`` and the model's predictions.
    """
    # Fixed random_state so repeated calls on the same data give the same score
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))

**Score from Approach 1 (Drop Categorical Variables)**

In [None]:
# Approach 1: discard the object-dtype columns from both splits and score what is left
drop_X_train, drop_X_test = (
    frame.select_dtypes(exclude=["object"]) for frame in (X_train, X_test)
)

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_test, y_train, y_test))

**Score from Approach 2 (Ordinal Encoding)**

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_test = X_test.copy()

# Apply the ordinal encoder to each column with categorical data.
# The encoder is fitted on the training split only and then reused to transform
# the test split, so a given category maps to the same integer in both splits.
# Re-fitting on the test split would build an independent mapping that the
# trained model has never seen.
# NOTE: with the encoder's default settings, a category that appears only in
# the test split makes transform() raise instead of being silently mis-encoded.
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(label_X_train[object_cols])
label_X_test[object_cols] = ordinal_encoder.transform(label_X_test[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_test, y_train, y_test))

**Score from Approach 3 (One-Hot Encoding)**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply the one-hot encoder to each column with categorical data.
# The encoder is fitted on the training split only and then reused to transform
# the test split, so both splits get the same one-hot columns in the same order.
# Categories that appear only in the test split are encoded as all zeros
# (handle_unknown='ignore'). Re-fitting on the test split could produce a
# different number or order of columns than the model was trained on.
# The encoder is left at its default sparse output and densified with
# .toarray(): the keyword requesting dense output was renamed from `sparse` to
# `sparse_output` across scikit-learn releases, so this form avoids both names.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]).toarray())
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]).toarray())

# One-hot encoding removed the index; put it back so the concat below aligns rows
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The one-hot columns are labelled with integers; convert every column label to
# a string so the frame does not mix string and integer labels
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_test, y_train, y_test))