In [1]:
import pandas as pd
pd.set_option("display.max_columns", None)

In [2]:
home_data = pd.read_csv("melb_data.csv")

In [3]:
home_data.sample(n=5, random_state = 5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
9599,Keilor East,68 Nyah St,3,h,1001000.0,S,Nelson,17/06/2017,11.7,3033.0,3.0,1.0,2.0,605.0,,,Moonee Valley,-37.73137,144.86882,Western Metropolitan,5629.0
7330,Oakleigh,1173 North Rd,3,h,1035000.0,S,Buxton,12/11/2016,14.0,3166.0,3.0,1.0,2.0,650.0,,,Monash,-37.91,145.0914,Southern Metropolitan,3224.0
2885,Glenroy,4 Lyons St,3,h,600000.0,S,Barry,8/10/2016,13.0,3046.0,3.0,1.0,2.0,702.0,,,Moreland,-37.6999,144.9387,Northern Metropolitan,8870.0
7677,Camberwell,20 Crellin Gr,2,h,1900000.0,S,Noel,13/05/2017,7.8,3124.0,2.0,1.0,3.0,633.0,134.3,1960.0,Boroondara,-37.8427,145.0824,Southern Metropolitan,8920.0
979,Box Hill,89 Thames St,4,h,2770000.0,S,Lindellas,12/11/2016,13.1,3128.0,4.0,1.0,2.0,715.0,,1950.0,Whitehorse,-37.8135,145.1218,Eastern Metropolitan,4605.0


In [4]:
home_data.columns


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# selecting target variable
y = home_data.Price

In [7]:
# selecting feature variable
X = home_data.drop("Price", axis=1)

In [8]:
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [9]:
from sklearn.model_selection import train_test_split

X_train_full, X_val_full, y_train_full, y_val_full = train_test_split(X,y, 
                                                                      train_size = .8, 
                                                                      test_size = .2, 
                                                                      random_state = 0)

In [15]:
#cardinality means number of unique values in columns
#selecting categorical columns with relatively low cardinality
low_cards_cols = [cname for cname in X_train_full.columns 
                  if X_train_full[cname].nunique() < 10
                 and X_train_full[cname].dtypes == "object"]
# selecting numerical columns
numerical_cols = [cname for cname in X_train_full.columns
                 if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = low_cards_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_val = X_val_full[my_cols].copy()

In [16]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [18]:
# get a list of categorical columns
s = (X_train.dtypes == "object")
object_cols = list(s[s].index)

print(object_cols)


['Type', 'Method', 'Regionname']


In [20]:
from sklearn.preprocessing import LabelEncoder

# make a copy avoiding changing original data
label_X_train = X_train.copy()
label_X_val = X_val.copy()

# applying label encoder to each column with categorical variable
label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_val[col] = label_encoder.transform(X_val[col])

In [21]:
label_X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,2,1,5,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,0,2,6,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,0,1,6,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,2,3,2,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,0,1,6,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [22]:
label_X_train.tail()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
13123,0,3,2,3,5.2,3056.0,3.0,1.0,2.0,212.0,,,-37.77695,144.95785,11918.0
3264,0,1,0,3,10.5,3081.0,3.0,1.0,1.0,748.0,101.0,1950.0,-37.7416,145.0481,2947.0
9845,0,0,2,4,6.7,3058.0,4.0,2.0,2.0,441.0,255.0,2002.0,-37.73572,144.97256,11204.0
10799,0,1,2,3,12.0,3073.0,3.0,1.0,1.0,606.0,,,-37.72057,145.02615,21650.0
2732,0,3,6,4,6.4,3011.0,4.0,2.0,1.0,319.0,130.0,1915.0,-37.7943,144.8875,7570.0


In [24]:
from sklearn.impute import SimpleImputer
X_train_plus = label_X_train.copy()
X_val_plus = label_X_val.copy()


# imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_val = pd.DataFrame(my_imputer.transform(X_val_plus))

# imputation removes columns names; put them back
imputed_X_train.columns = X_train_plus.columns
imputed_X_val.columns = X_val_plus.columns

In [25]:
# define a function to check the quality for each approach
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def score_dataset(X_trian, X_val, y_train, y_val):
    model = RandomForestRegressor(n_estimators=100, random_state = 5)
    model.fit(X_train, y_train)
    preds_val = model.predict(X_val)
    return mean_absolute_error(y_val, preds_val)
    
    

In [35]:
#print MAE score for simpleImpute and labelEncoding
print("MAE score: ")
print(score_dataset(imputed_X_train, imputed_X_val, y_train_full, y_val_full))

MAE score: 


ValueError: could not convert string to float: 'u'

In [29]:
y_train_full.head()

12167    481000.0
6524     895000.0
8413     651500.0
2919     482500.0
6043     591000.0
Name: Price, dtype: float64

In [30]:
imputed_X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2.0,1.0,5.0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,0.0,2.0,6.0,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,0.0,1.0,6.0,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,2.0,3.0,2.0,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,0.0,1.0,6.0,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [31]:
imputed_X_train.tail()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
10859,0.0,3.0,2.0,3.0,5.2,3056.0,3.0,1.0,2.0,212.0,153.764119,1964.839866,-37.77695,144.95785,11918.0
10860,0.0,1.0,0.0,3.0,10.5,3081.0,3.0,1.0,1.0,748.0,101.0,1950.0,-37.7416,145.0481,2947.0
10861,0.0,0.0,2.0,4.0,6.7,3058.0,4.0,2.0,2.0,441.0,255.0,2002.0,-37.73572,144.97256,11204.0
10862,0.0,1.0,2.0,3.0,12.0,3073.0,3.0,1.0,1.0,606.0,153.764119,1964.839866,-37.72057,145.02615,21650.0
10863,0.0,3.0,6.0,4.0,6.4,3011.0,4.0,2.0,1.0,319.0,130.0,1915.0,-37.7943,144.8875,7570.0


In [32]:
imputed_X_val.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,0.0,3.0,6.0,4.0,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,144.8985,6380.0
1,0.0,1.0,6.0,2.0,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,144.8896,2417.0
2,0.0,3.0,6.0,3.0,10.5,3020.0,3.0,1.0,1.0,581.0,153.764119,1964.839866,-37.7674,144.82421,4217.0
3,1.0,0.0,5.0,3.0,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,145.0071,7717.0
4,0.0,1.0,2.0,3.0,8.5,3044.0,3.0,2.0,2.0,480.0,153.764119,1964.839866,-37.72523,144.94567,7485.0


In [33]:
imputed_X_val.tail()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
2711,2.0,0.0,6.0,2.0,6.4,3011.0,2.0,1.0,1.0,47.0,35.0,2013.0,-37.8014,144.8959,7570.0
2712,0.0,3.0,6.0,4.0,8.0,3016.0,4.0,2.0,4.0,551.0,153.764119,1964.839866,-37.8579,144.8786,6380.0
2713,0.0,1.0,0.0,3.0,10.8,3105.0,3.0,1.0,1.0,757.0,153.764119,1964.839866,-37.78094,145.10131,4480.0
2714,0.0,1.0,6.0,4.0,6.2,3039.0,4.0,1.0,3.0,478.0,152.0,1925.0,-37.76421,144.90571,6232.0
2715,0.0,3.0,2.0,2.0,1.6,3066.0,2.0,1.0,2.0,159.0,86.0,1880.0,-37.7962,144.9887,4553.0
