# *Kaggle: Intermediate Machine Learning Techniques*

In [41]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
%matplotlib inline

In [3]:
df = pd.read_csv("melb_data.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [6]:
df.shape

(13580, 21)

## *Preparing the features & target variable*

In [33]:
# Target variable

y = df['Price']
y.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [34]:
# Select numerical predictors
x_preds = df.drop(['Price'], axis=1)
x = x_preds.select_dtypes(exclude=['object'])
x.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [35]:
x_train , x_valid, y_train, y_valid = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=0)

## *Missing data dealing approaches* 

- *Dropping the data/columns*
- *Imputation (Method 2)*
- *Enhanced Imputation (Method 3)*

#### *We define a function to measure the performance of all the approaches*

In [36]:
def evaluate_approach(x_train, x_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(x_train, y_train)   # Train the model
    preds = model.predict(x_valid) # Predict for test set
    return mean_absolute_error(y_valid, preds)

## *Missing Values-Approach 1: Simple drop of values*

In [37]:
cols_with_missing = [col for col in x_train.columns
                    if x_train[col].isnull().any()]
cols_with_missing

['Car', 'BuildingArea', 'YearBuilt']

In [38]:
# Drop the columns 
refined_x_train = x_train.drop(cols_with_missing, axis= 1)
refined_x_valid = x_valid.drop(cols_with_missing, axis= 1)
refined_x_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [40]:
# Check score 
print("Approach 1: MAE Score: ")
print(evaluate_approach(refined_x_train, refined_x_valid, y_train, y_valid))

Approach 1: MAE Score: 
183550.22137772635


## *Missing Values-Approach 2: Imputation (Method 1)*

In [44]:
# Imputation
my_imputer = SimpleImputer(strategy='mean')
imputed_x_train = pd.DataFrame(my_imputer.fit_transform(x_train))
imputed_x_valid = pd.DataFrame(my_imputer.transform(x_valid))

# Reenter the column names
imputed_x_train.columns = x_train.columns
imputed_x_valid.columns = x_valid.columns

imputed_x_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [51]:
imputed_x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10864 entries, 0 to 10863
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          10864 non-null  float64
 1   Distance       10864 non-null  float64
 2   Postcode       10864 non-null  float64
 3   Bedroom2       10864 non-null  float64
 4   Bathroom       10864 non-null  float64
 5   Car            10864 non-null  float64
 6   Landsize       10864 non-null  float64
 7   BuildingArea   10864 non-null  float64
 8   YearBuilt      10864 non-null  float64
 9   Lattitude      10864 non-null  float64
 10  Longtitude     10864 non-null  float64
 11  Propertycount  10864 non-null  float64
dtypes: float64(12)
memory usage: 1018.6 KB


In [50]:
imputed_x_train.shape

(10864, 12)

#### *As we can see; all the NULL values have been removed*

In [52]:
# Get Approach score
print("Approach 2: MAE Score: ")
print(evaluate_approach(imputed_x_train, imputed_x_valid, y_train, y_valid))

Approach 2: MAE Score: 
178166.46269899711


### *Approach 2 has lower MAE than Approach 1 ---> Approach 2 performed better!*

## *Missing Values-Approach 3: Imputation (Extension Method 2)*

In [53]:
# Make copies of dataset
x_train_plus = x_train.copy()
x_valid_plus = x_valid.copy()

# Make columns for the missing values
for col in cols_with_missing:
    x_train_plus[col, '_was_missing'] = x_train_plus[col].isnull()
    x_valid_plus[col, '_was_missing'] = x_valid_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_x_train_plus = pd.DataFrame(my_imputer.fit_transform(x_train_plus))
imputed_x_valid_plus = pd.DataFrame(my_imputer.transform(x_valid_plus))

# Reset the column names
imputed_x_train_plus.columns = x_train_plus.columns
imputed_x_valid_plus.columns = x_valid_plus.columns

# Get Approach score
print("Approach 3: MAE Score: ")
print(evaluate_approach(imputed_x_train_plus, imputed_x_valid_plus, y_train, y_valid))

Approach 3: MAE Score: 
178927.503183954


In [54]:
imputed_x_train_plus.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,"(Car, _was_missing)","(BuildingArea, _was_missing)","(YearBuilt, _was_missing)"
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0,0.0,1.0,0.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0,0.0,1.0,1.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0,0.0,1.0,1.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0,0.0,1.0,0.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,0.0,0.0,0.0


### *Approach 3 performed slightly worse than Approach 2 (This can be the case sometimes)*

### *Imputation ---> Performs better than dropping values (Information Lost)*