In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [17]:
# Load the Melbourne housing data; the path is relative to the notebook's working directory.
file_path = '../../Datasets/melb_data.csv'
melb_data = pd.read_csv(file_path)

# Prediction target: the sale price column.
melb_target = melb_data.Price

# Non-object-dtype columns that contain at least one missing value.
cols_with_missing_values = [
    col for col in melb_data.columns
    if melb_data[col].isnull().any() and melb_data[col].dtype != 'object'
]

print("Data Shape: ", melb_data.shape)
# Only the last expression of a cell is rendered automatically, so the
# per-column missing-value counts have to be printed explicitly to be visible.
print(melb_data[cols_with_missing_values].isnull().sum())
melb_data.columns

Data Shape:  (13580, 21)


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

- Drop the `BuildingArea` and `YearBuilt` columns
- Drop the rows where the `Car` value is missing

In [18]:
# errors='ignore' keeps this cell idempotent: melb_data is reassigned in place,
# so on a second run the two columns are already gone and a plain drop()
# would raise KeyError.
melb_data = melb_data.drop(columns=['BuildingArea', 'YearBuilt'], errors='ignore')
melb_data = melb_data.dropna(subset=['Car'])

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    melb_data.drop(columns=['Price']),
    melb_data.Price,
    test_size=0.2,
    random_state=0,
)

# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    cname for cname in X_train.columns
    if X_train[cname].nunique() < 10 and X_train[cname].dtype == 'object'
]

# Columns stored as int64 or float64.
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# Keep only the selected columns; copy so later edits do not touch the split frames.
all_cols = low_cardinality_cols + numerical_cols
X_train = X_train[all_cols].copy()
X_valid = X_valid[all_cols].copy()

low_cardinality_cols

['Type', 'Method', 'Regionname']

In [19]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
12745,h,S,Southern Metropolitan,3,13.8,3188.0,3.0,1.0,1.0,464.0,-37.93753,145.0102,5454.0
1235,u,VB,Southern Metropolitan,1,10.7,3187.0,1.0,1.0,1.0,0.0,-37.9164,145.0228,6938.0
5426,h,SP,Northern Metropolitan,2,2.6,3121.0,2.0,1.0,0.0,156.0,-37.8209,145.0052,14949.0
839,h,S,Southern Metropolitan,3,13.9,3165.0,3.0,1.0,2.0,584.0,-37.9183,145.071,10969.0
8451,h,S,Eastern Metropolitan,4,13.8,3107.0,4.0,2.0,2.0,650.0,-37.7686,145.1023,5420.0


In [28]:
# Boolean mask over the training columns: True where the dtype is object.
s = (X_train.dtypes == 'object')

# Show the mask, its labels and its raw values to illustrate how it is built.
print(s, "\n")
print(s.index, "\n")
print(s.values, "\n")

# Index the mask with itself to keep only the True entries, then take their labels.
object_cols = s[s].index.tolist()

object_cols

Type              True
Method            True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Car              False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool 

Index(['Type', 'Method', 'Regionname', 'Rooms', 'Distance', 'Postcode',
       'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'Lattitude', 'Longtitude',
       'Propertycount'],
      dtype='object') 

[ True  True  True False False False False False False False False False
 False] 



['Type', 'Method', 'Regionname']

In [29]:
from sklearn.preprocessing import OrdinalEncoder

# Map categories that appear only in the validation split to -1 instead of
# raising in transform(); this mirrors handle_unknown='ignore' on the
# one-hot encoder used later in the notebook.
OE = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Work on copies so the original feature frames stay available for other encodings.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Fit on the training split only, then reuse the learned mapping for validation.
label_X_train[object_cols] = OE.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = OE.transform(X_valid[object_cols])

def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed passed to ``RandomForestRegressor`` so repeated calls are comparable.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [30]:
# Validation error of the random forest trained on ordinal-encoded features.
ordinal_mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print("MAE (Ordinal Encoding):", ordinal_mae)

MAE (Ordinal Encoding): 170438.40639018035


In [38]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows;
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Encode the categorical columns. Passing the original index keeps the rows
# aligned for the concat below, and get_feature_names_out() gives readable
# string column names instead of the positional labels 0, 1, 2, ...
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]),
    index=X_train.index,
    columns=OH_encoder.get_feature_names_out(),
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]),
    index=X_valid.index,
    columns=OH_encoder.get_feature_names_out(),
)

# Remaining (non-categorical) columns; columns= already selects the axis.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Non-categorical features followed by the one-hot columns. Every column name
# is already a string, so no cast of the column labels is needed before fitting.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [39]:
# Validation error of the random forest trained on one-hot-encoded features.
one_hot_mae = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
print("MAE (One-Hot Encoding):", one_hot_mae)

MAE (One-Hot Encoding): 170467.87348167325
