In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# Loading the datasets and showing it
mel_data = pd.read_csv("melb_data.csv")
mel_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
#Checking the missing values in the dataset
missing_values = mel_data.isnull().sum()
missing_values

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [4]:
# calculating the total data present in the dataset
total_dataset = np.product(mel_data.shape)
total_dataset

285180

In [5]:
# getting only the missing values
total_missing_values = missing_values.sum()
total_missing_values

13256

In [6]:
# calculating the total percentage of missing values
total_percent = ( total_missing_values / total_dataset ) * 100
total_percent

4.648292306613367

In [7]:
y = mel_data.Price
y.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [8]:
#droping price column and removing the objects from the dataset
mel_predictors = mel_data.drop(['Price'], axis=1)
X = mel_predictors.select_dtypes(exclude = ['object'])
X

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.79960,144.99840,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.80790,144.99340,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.80930,144.99440,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.79690,144.99690,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.80720,144.99410,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,-37.90562,145.16761,7392.0
13576,3,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,-37.85927,144.87904,6380.0
13577,3,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,-37.85274,144.88738,6380.0
13578,4,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,-37.85908,144.89299,6380.0


In [9]:
#divide the data into training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)

In [25]:
#calculating the mean_absolute_error and multiple functions with respect to different Machine Learning Models

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Simple Linear Regression
def score_dataset_simple(X_train, X_test, y_train, y_test):
  model = LinearRegression()
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  return mean_absolute_error(y_test, preds)

# Support Vector Regression
def score_dataset_random(X_train, X_test, y_train, y_test):
  model = RandomForestRegressor()
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  return mean_absolute_error(y_test, preds)


**Approach 1 Dropping Columns with Missing Values**

In [26]:
#checking the columns which has a missing values
cols_with_missing = [cols for cols in X_train.columns if X_train[cols].isnull().any()]
print("the columns which has a missing values is ", cols_with_missing)
#Drop columns in training and testing data
reduced_X_train = X_train.drop(cols_with_missing, axis =1 )
reduced_X_test = X_test.drop(cols_with_missing, axis =1 )

print(" MAE from Approach 1 (Drop columns with missing values): ")
# Simple Linear Regression
print("\nSimple Linear Regression ",score_dataset_simple(reduced_X_train, reduced_X_test, y_train, y_test))

# Random Forest Regression
print("\nRandom Forest Regression ",score_dataset_random(reduced_X_train, reduced_X_test, y_train, y_test))


the columns which has a missing values is  ['Car', 'BuildingArea', 'YearBuilt']
 MAE from Approach 1 (Drop columns with missing values): 

Simple Linear Regression  307246.8033415164

Random Forest Regression  175460.3583219078


**Approach 2 using SimpleImputer with Missing Values**

In [50]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
# Imputation
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(imputer.fit_transform(X_test))

# #Imputation removed column; put them back
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

print("MAE from Approach 2 (Imputation):")
# Simple Linear Regression
print("\nSimple Linear Regression ",score_dataset_simple(imputed_X_train, imputed_X_test, y_train, y_test))

# Random Forest Regression
print("\nRandom Forest Regression ",score_dataset_random(imputed_X_train, imputed_X_test, y_train, y_test))


MAE from Approach 2 (Imputation):

Simple Linear Regression  295662.84217331914

Random Forest Regression  168821.17688337193


**Approach 3 using SimpleImputer Extension with Missing Values**

In [52]:
# Make a copy to avoid orginal data (when imputing)
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
  X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
  X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()

# Imputation
imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(imputer.fit_transform(X_train_plus))
imputed_X_test_plus = pd.DataFrame(imputer.transform(X_test_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_test_plus.columns = X_test_plus.columns

print("MAE from Approach 2 (Imputation):")
# Simple Linear Regression
print("\nSimple Linear Regression ",score_dataset_simple(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

# Random Forest Regression
print("\nRandom Forest Regression ",score_dataset_random(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))


MAE from Approach 2 (Imputation):

Simple Linear Regression  295735.9172186967

Random Forest Regression  168807.5560293499


In [53]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


In [54]:
# Shape of testing data (num_rows, num_columns)
print(X_test.shape)

# Number of missing values in each column of testing data
missing_val_count_by_column = (X_test.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(2716, 12)
Car               13
BuildingArea    1294
YearBuilt       1068
dtype: int64
