Import the libraries required for reading the data and building the models

In [12]:
#import essential and basic libraries
import numpy as np
import pandas as pd
#import the imputer used to fill in missing values
from sklearn.impute import SimpleImputer
#import library to split data into train and test
from sklearn.model_selection import train_test_split
#import model type
from sklearn.ensemble import RandomForestRegressor
#import the MAE, a scoring system for models
from sklearn.metrics import mean_absolute_error

Read CSV file

In [13]:
# Load the raw dataset from the CSV file into a DataFrame
melb_csv_path = './data/melb_data.csv'
data = pd.read_csv(melb_csv_path)

Identify the target (the value to predict) and drop it from the predictors

In [14]:
# Target: the 'Price' column is the value the models learn to predict
y = data['Price']
# Predictors: every remaining column once 'Price' has been removed
melb_predictors = data.drop(columns=['Price'])

Only selecting numerical variables and then splitting data into train and validation groups

In [15]:
# Keep only the non-object (i.e. non-string) predictor columns
X = melb_predictors.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible from run to run
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

Scoring system for models

In [16]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    A RandomForestRegressor with 10 trees and random_state=0 is trained on
    (X_train, y_train) and used to predict X_valid.

    Returns:
        The mean absolute error between y_valid and the predictions
        (lower is better).
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    # fit() returns the fitted estimator, so prediction can be chained onto it
    predictions = forest.fit(X_train, y_train).predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

Removing columns with missing values

In [20]:
# Flag every training column that contains at least one missing value
has_missing = X_train.isnull().any()
cols_with_missing = list(X_train.columns[has_missing.to_numpy()])

# Remove the flagged columns from both splits. The list is derived from the
# training data only, so both splits end up with the same set of columns.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

Imputation (fill each missing value with a substitute value, usually the column mean)

In [18]:
# Imputation: the fill values are learned from the training split only
# (fit_transform), then the same fitted imputer is applied to the
# validation split (transform).
my_imputer = SimpleImputer()

# The imputer hands back bare arrays, so rebuild the DataFrames with the
# original column names AND the original row index. Keeping the index keeps
# the rows label-aligned with y_train / y_valid for any later pandas operation
# that aligns on index (e.g. concat or join).
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns, index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns, index=X_valid.index)

# Score the imputed data so the MAE displayed under this cell is reproducible
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))


MAE from Approach 2 (Imputation):
178166.46269899711


Obtain the Mean Absolute Error (MAE) for the model trained with the missing-value columns dropped; the lower, the better

In [22]:
# MAE of the model trained on the data with the missing-value columns dropped
reduced_mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(reduced_mae)

183550.22137772635


Prepare categorical data for analysis

In [23]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# X_train / X_valid were built with select_dtypes(exclude=['object']), so they
# no longer contain any categorical column to choose from. Re-split the full
# predictor table with the same sizes and random_state as the earlier split so
# the resulting rows still correspond to y_train / y_valid.
X_train_full, X_valid_full = train_test_split(
    melb_predictors, train_size=0.8, test_size=0.2, random_state=0)

# Categorical (object-dtype) columns with fewer than 10 distinct values
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].dtype == "object"
                        and X_train_full[cname].nunique() < 10]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only; .copy() so later edits never touch the full frames
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()