In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
# Relative path to the Melbourne housing CSV.
file_path = '../../Datasets/melb_data.csv'

melb_data = pd.read_csv(file_path)

# Target variable: the sale price.
y = melb_data.Price

# Every column except the target is a candidate predictor.
predictors = melb_data.drop(columns=['Price'])

# Keep numeric predictors only. include='number' is stricter than
# exclude=['object']: it also leaves out datetime, categorical, boolean and
# dedicated string dtypes, which an object-only exclusion would let through.
X = predictors.select_dtypes(include='number')

# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [4]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``fit``.
    X_valid, y_valid
        Features passed to ``predict`` and the target the predictions are
        compared against.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions.
    """
    # Fixed n_estimators and random_state keep the scores comparable across approaches.
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

#### Approach 1: Drop Columns with Missing Values

In [5]:
# Names of the training columns that contain at least one missing value.
cols_with_missing_values = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so they keep the same feature set.
reduced_X_train = X_train.drop(columns=cols_with_missing_values)
reduced_X_valid = X_valid.drop(columns=cols_with_missing_values)

print(
    "MAE (Drop columns with missing values):",
    score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid),
)

MAE (Drop columns with missing values): 175703.48185157913


#### Approach 2: Imputation

In [6]:
# SimpleImputer with default settings fills each missing value with its column mean.
imputer = SimpleImputer()

# fit           -> calculate the imputation values
# transform     -> impute the missing values
# fit_transform -> fit + transform
#
# The imputer returns a bare array, so the original index and column labels
# are passed back explicitly. Keeping the index leaves the imputed frame
# label-aligned with y_train instead of resetting it to 0..n-1.
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# Only transform the validation data: the fill values were already learned
# from the training split.
imputed_X_valid = pd.DataFrame(
    imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [7]:
# Score the mean-imputed feature set with the shared random-forest helper.
print(
    "MAE (Imputation):",
    score_dataset(
        X_train=imputed_X_train,
        X_valid=imputed_X_valid,
        y_train=y_train,
        y_valid=y_valid,
    ),
)

MAE (Imputation): 169237.0268668034


#### Approach 3: Flag Missingness

In [10]:
# Boolean mask: True wherever a training value is missing in one of the affected columns.
X_train.loc[:, cols_with_missing_values].isna()

Unnamed: 0,Car,BuildingArea,YearBuilt
12167,False,True,False
6524,False,True,True
8413,False,True,True
2919,False,True,False
6043,False,False,False
...,...,...,...
13123,False,True,True
3264,False,False,False
9845,False,False,False
10799,False,True,True


In [11]:
# Work on copies so the original splits are left untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column that had gaps in the training split, add a boolean
# "<col>_was_missing" flag recording which rows were originally empty.
for col in cols_with_missing_values:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Impute after flagging, otherwise the missingness information is already gone.
# The imputer returns a bare array, so the original index and column labels
# are passed back explicitly to keep the frames label-aligned with
# y_train / y_valid.
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)

# NOTE: despite its name, imputed_X_train_valid holds the imputed *validation*
# features. The name is kept because the scoring cell refers to it.
imputed_X_train_valid = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

In [12]:
# Score the feature set that carries both imputed values and missingness flags.
print(
    "MAE (Imputation + Missing Indicator):",
    score_dataset(
        X_train=imputed_X_train_plus,
        X_valid=imputed_X_train_valid,
        y_train=y_train,
        y_valid=y_valid,
    ),
)

MAE (Imputation + Missing Indicator): 169795.45249719475
