In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
data=pd.read_csv("melb_data.csv")

In [2]:
y=data.Price
X=data.drop('Price',axis=1)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

In [3]:
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [4]:
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

In [5]:
numerical_cols = [cname for cname in X_train_full
                  .columns if X_train_full[cname]
                  .dtype in ['int64', 'float64']]

In [6]:
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [7]:
s = (X_train.dtypes == 'object')
print(s)
object_cols = list(s[s].index)

print("\nCategorical variables:")
print(object_cols)

Type              True
Method            True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool

Categorical variables:
['Type', 'Method', 'Regionname']


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# 1.

In [9]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
175703.48185157913


# 2.

In [10]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
165936.40548390493


# 3.

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.fit_transform(X_valid[object_cols]))
OH_cols_valid
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
166089.4893009678


### Practice

# 1.

In [12]:
df_full=pd.read_csv("train.csv",index_col='Id')
print(df_full.shape)
df_test_full=pd.read_csv('test.csv',index_col='Id')
print(df_test_full.shape)

(1460, 80)
(1459, 79)


In [13]:
df_full.dropna(axis=0,subset=['SalePrice'],inplace=True)
print(df_full.shape)
x=df_full.drop('SalePrice',axis=1)
print(x.shape)
y=df_full['SalePrice']
print(y.shape)

(1460, 80)
(1460, 79)
(1460,)


In [14]:
cols_have_missing=[col for col in x.columns if x[col].isna().any()]
x.drop(cols_have_missing,axis=1,inplace=True)
print(x.shape)
df_test_full.drop(cols_have_missing,axis=1,inplace=True)
print(df_test_full.shape)

(1460, 60)
(1459, 60)


In [15]:
x_numeric=x.select_dtypes(exclude=['object'])
x_numeric

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,8450,7,5,2003,2003,706,0,150,856,...,548,0,61,0,0,0,0,0,2,2008
2,20,9600,6,8,1976,1976,978,0,284,1262,...,460,298,0,0,0,0,0,0,5,2007
3,60,11250,7,5,2001,2002,486,0,434,920,...,608,0,42,0,0,0,0,0,9,2008
4,70,9550,7,5,1915,1970,216,0,540,756,...,642,0,35,272,0,0,0,0,2,2006
5,60,14260,8,5,2000,2000,655,0,490,1145,...,836,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,7917,6,5,1999,2000,0,0,953,953,...,460,0,40,0,0,0,0,0,8,2007
1457,20,13175,6,6,1978,1988,790,163,589,1542,...,500,349,0,0,0,0,0,0,2,2010
1458,70,9042,7,9,1941,2006,275,0,877,1152,...,252,0,60,0,0,0,0,2500,5,2010
1459,20,9717,5,6,1950,1996,49,1029,0,1078,...,240,366,0,112,0,0,0,0,4,2010


In [16]:
x_train,x_test,y_train,y_test=train_test_split(x_numeric,y,test_size=0.2,random_state=0)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1168, 33)
(1168,)
(292, 33)
(292,)


In [17]:
x_test.shape

(292, 33)

In [18]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(x_train, y_train)
preds = model.predict(x_test)
mean_absolute_error(y_test,preds)

17837.82570776256

# 2.

In [19]:
df_full=pd.read_csv("train.csv",index_col='Id')
df_test_full=pd.read_csv('test.csv',index_col='Id')


df_full.dropna(axis=0,subset=['SalePrice'],inplace=True)
x=df_full.drop('SalePrice',axis=1)
y=df_full['SalePrice']



cols_have_missing=[col for col in x.columns if x[col].isna().any()]
x.drop(cols_have_missing,axis=1,inplace=True)
df_test_full.drop(cols_have_missing,axis=1,inplace=True)


x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [20]:
cat_columns=[col for col in x_train.columns if x_train[col].dtype=='object']
good_columns=[col  for col in cat_columns if set(x_test[col]).issubset(set(x_train[col]))]

In [21]:
bad_columns=list(set(cat_columns)-set(good_columns))
bad_columns

['Condition2', 'RoofMatl', 'Functional']

In [22]:
from sklearn.preprocessing import OrdinalEncoder

encoded_x_train=x_train.drop(bad_columns,axis=1)
encoded_x_test=x_test.drop(bad_columns,axis=1)

In [23]:
ordinal_encoder=OrdinalEncoder()
encoded_x_test[good_columns]=ordinal_encoder.fit_transform(encoded_x_test[good_columns])
encoded_x_train[good_columns]=ordinal_encoder.fit_transform(encoded_x_train[good_columns])

In [24]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(encoded_x_train, y_train)
preds = model.predict(encoded_x_test)
mean_absolute_error(y_test,preds)

17274.866860730595

# 3.

In [25]:
df_full=pd.read_csv("train.csv",index_col='Id')
df_test_full=pd.read_csv('test.csv',index_col='Id')


df_full.dropna(axis=0,subset=['SalePrice'],inplace=True)
x=df_full.drop('SalePrice',axis=1)
y=df_full['SalePrice']



cols_have_missing=[col for col in x.columns if x[col].isna().any()]
x.drop(cols_have_missing,axis=1,inplace=True)
df_test_full.drop(cols_have_missing,axis=1,inplace=True)


In [26]:
high_cardinality_columns=[col for col in cat_columns if x_train[col].nunique() > 10]
low_cardinality_columns=[col for col in cat_columns if x_train[col].nunique() < 10]

In [27]:
x_numeric=x.select_dtypes(exclude=['object'])
print(x_numeric.shape)

(1460, 33)


In [28]:
x_low=pd.get_dummies(x[low_cardinality_columns],drop_first=True)
print(x_low.shape)

(1460, 102)


In [29]:
x_new= pd.concat([x_numeric, x_low], axis=1)
x_new.shape
x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2,random_state=0)

In [30]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(x_train, y_train)
preds = model.predict(x_test)
mean_absolute_error(y_test,preds)

17480.32462328767