In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the Melbourne housing CSV into a DataFrame and preview the first rows.
home_data = pd.read_csv(filepath_or_buffer="../Intro_ML/Data/melb_data.csv")
home_data.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
# Number of columns in the Melbourne frame (second entry of its shape).
home_data.shape[1]

21

In [4]:
# Boolean mask over the dtypes Series: True where a column has object dtype.
s = home_data.dtypes.eq('object')
# Keep only the True entries and show their labels, i.e. the object-typed columns.
s.loc[s].index

Index(['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea',
       'Regionname'],
      dtype='object')

In [5]:
# Separate the prediction target (Price) from the remaining columns.
y = home_data['Price']
X_full = home_data.drop(columns=['Price'])

In [6]:
# Hold out a validation split of the Melbourne data. A fixed random_state makes
# the shuffle reproducible, so re-running the notebook on a fresh kernel selects
# the same rows every time.
X_train, X_val, y_train, y_val = train_test_split(X_full, y, random_state=0)

In [7]:
## Approach 1: drop categorical variables

## exercise

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
# Load the exercise data; the Id column becomes the row index of both frames.
X = pd.read_csv('../Intro_ML/Data/train.csv', index_col='Id')
X_test = pd.read_csv('../Intro_ML/Data/test.csv', index_col='Id')

# Rows without a SalePrice cannot be used for training, so discard them,
# then split the target off from the predictors.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

In [10]:
# Names of the training columns that contain at least one missing value.
cols_with_missing = X.columns[X.isnull().any().to_numpy()].tolist()

# Remove those columns from both the training and the test frame.
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

In [11]:
# Seeded 80/20 split of the housing data into training and validation parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [13]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to score the fitted model.
    n_estimators : int, default 100
        Forwarded to ``RandomForestRegressor``.
    random_state : int, default 0
        Forwarded to ``RandomForestRegressor`` so repeated calls are reproducible.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    # scikit-learn documents the argument order as (y_true, y_pred).
    return mean_absolute_error(y_valid, preds)

In [14]:
# Approach 1: keep only the columns whose dtype is not object.
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)

In [15]:
# Report the validation MAE obtained after dropping the object-typed columns.
print("MAE from approach 1: drop the categorical variables: ")
approach_1_mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print(approach_1_mae)

MAE from approach 1: drop the categorical variables: 
17837.82570776256


In [16]:
# Compare the distinct Condition2 values found in each split.
condition2_train_values = X_train['Condition2'].unique()
condition2_valid_values = X_valid['Condition2'].unique()
print('Condition 2: ', condition2_train_values)
print('Condition 2 in validation data: ', condition2_valid_values)

Condition 2:  ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']
Condition 2 in validation data:  ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']


In [17]:
# Columns stored with object dtype are treated as the categorical ones.
objects_cols = [name for name, kind in X_train.dtypes.items() if kind == 'object']

# A column is kept for label encoding only when the training and validation
# splits contain exactly the same set of values.
good_label_cols = list(
    filter(lambda name: set(X_train[name]) == set(X_valid[name]), objects_cols)
)

# Every other categorical column is marked for removal.
bad_label_cols = set(objects_cols) - set(good_label_cols)

print("categorical columns that will be label encoded: ", good_label_cols)
print("\nCategorical columns that will be dropped from the dataset: ", bad_label_cols)

categorical columns that will be label encoded:  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'HouseStyle', 'ExterQual', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset:  {'Condition1', 'SaleType', 'Foundation', 'RoofMatl', 'LandSlope', 'ExterCond', 'Heating', 'Exterior2nd', 'Exterior1st', 'Condition2', 'RoofStyle', 'Neighborhood', 'HeatingQC', 'Functional', 'Utilities'}


In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
#help(LabelEncoder)

In [20]:
# Approach 2: start from copies of the splits without the rejected columns.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# One encoder object is reused: it is re-fitted on each training column and the
# resulting mapping is then applied to the matching validation column.
le = LabelEncoder()
for col_name in good_label_cols:
    label_X_train[col_name] = le.fit_transform(X_train[col_name])
    label_X_valid[col_name] = le.transform(X_valid[col_name])

In [21]:
print("MAE from approach 2: use numerical value: ")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from approach 2: use numerical value: 
17575.291883561644


In [22]:
# Map every categorical column to its number of distinct training values.
d = {col_name: X_train[col_name].nunique() for col_name in objects_cols}
object_nunique = list(d.values())

# Show the (column, count) pairs ordered by ascending count.
sorted(d.items(), key=lambda pair: pair[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

In [23]:
# How many categorical columns have more than 10 distinct training values.
high_cardinality_numcols = sum(1 for n_unique in d.values() if n_unique > 10)
# Number of distinct training values in the Neighborhood column.
num_cols_neighborhood = d['Neighborhood']

## Columns with lower cardinality will be one-hot encoded;
## columns with higher cardinality will be dropped.

In [25]:
# Categorical columns with fewer than 10 distinct training values get one-hot encoded.
low_cardinality_cols = [col for col in objects_cols if X_train[col].nunique() < 10]

# The remaining categorical columns are the ones to drop. They are listed in
# column order instead of via a set difference, because set iteration order for
# strings changes between interpreter sessions and made the printed list
# non-reproducible.
high_cardinality_cols = [col for col in objects_cols if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded: ', low_cardinality_cols)
print('Categorical columns that will be dropped: ', high_cardinality_cols)

Categorical columns that will be one-hot encoded:  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']
Categorical columns that will be dropped:  ['Exterior2nd', 'Neighborhood', 'Exterior1st']


In [27]:
#X_train.head()

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [30]:
#help(OneHotEncoder)

In [33]:
# Approach 3: one-hot encode the low-cardinality categorical columns.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later
# releases removed the old name, so try the new keyword first and fall back to
# the old one on versions that do not know it yet.
try:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, then apply the same encoding to validation.
# The encoder returns plain arrays, so the original row index is re-attached to
# keep the encoded rows aligned with the numeric columns during concat.
OH_cat_train = pd.DataFrame(
    oh_encoder.fit_transform(X_train[low_cardinality_cols]), index=X_train.index
)
OH_cat_valid = pd.DataFrame(
    oh_encoder.transform(X_valid[low_cardinality_cols]), index=X_valid.index
)

## get back the numerical columns (every categorical column is removed,
## including the ones that were not one-hot encoded)
OH_num_train = X_train.drop(objects_cols, axis=1)
OH_num_valid = X_valid.drop(objects_cols, axis=1)

## add those numerical columns back to OH
OH_X_train = pd.concat([OH_num_train, OH_cat_train], axis=1)
OH_X_valid = pd.concat([OH_num_valid, OH_cat_valid], axis=1)

# The encoded columns carry integer labels while the numeric ones carry string
# names; newer scikit-learn refuses to fit on mixed label types, so make every
# column label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

OH_X_train.head()

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,112,113,114,115,116,117,118,119,120,121
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,11694,9,5,2007,2007,48,0,1774,1822,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
871,20,6600,5,5,1962,1962,0,0,894,894,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
93,30,13360,5,7,1921,2006,713,0,163,876,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
818,20,13265,8,5,2002,2002,1218,0,350,1568,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
303,20,13704,7,5,2001,2002,0,0,1541,1541,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
# Report the validation MAE obtained with the one-hot encoded features.
print("MAE from approach 3: one-hot encoding: ")
approach_3_mae = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
print(approach_3_mae)

MAE from approach 3: one-hot encoding: 
17525.345719178084
