In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
# Load the training set from CSV into a DataFrame.
# NOTE(review): no index column is specified, so if the file carries a
# row-identifier column (e.g. an 'Id' field) it stays an ordinary numeric
# column and will be picked up as a model feature by the dtype-based column
# selection further down -- TODO confirm against the CSV and consider
# passing index_col.
data = pd.read_csv('../Intro_ML/Data/train.csv')

In [26]:
# Separate the prediction target from the feature columns.
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split (keeps the one-hot encoding small).
categorical_cols = [
    col for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[col].nunique() < 10
]

# Numerical features: every int64 / float64 column.
numerical_cols = [
    col for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [28]:
# Restrict both splits to the selected columns (categorical first, then
# numerical). Copies are taken so later edits never touch the full frames.
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [29]:
# Preview the first rows of the reduced training frame as a sanity check.
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [31]:
# Numerical columns: fill missing values with a constant. No fill_value is
# given, so SimpleImputer's default for the 'constant' strategy applies
# (presumably 0 for numeric data -- see the SimpleImputer docs).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
# Random forest regressor with 100 trees; random_state is fixed so repeated
# runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [34]:
from sklearn.metrics import mean_absolute_error

In [35]:
# Bundle preprocessing and the model into one estimator: the imputers and
# encoder are fitted on the training split only and then re-applied unchanged
# when predicting on the validation split.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
my_pipeline.fit(X_train, y_train)

# Predict sale prices for the held-out validation rows.
pred = my_pipeline.predict(X_valid)

# mean_absolute_error is documented as (y_true, y_pred). MAE is symmetric, so
# the value is the same either way, but keeping the documented order avoids
# wrong results if this metric is later swapped for an asymmetric one.
score = mean_absolute_error(y_valid, pred)
print('MAE:', score)

MAE: 17740.290308219177


## Exercise:

In [38]:
# Exploratory: print the SimpleImputer docstring to review its parameters and
# the available imputation strategies.
help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute:

class SimpleImputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, optional (default="mean")
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value alon

In [18]:
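# NOTE(review): disabled submission export. `X_test` is not defined in any
# cell visible here, and `pred` holds the validation-set predictions from the
# pipeline cell, not test-set predictions. Load the test data and predict on
# it with my_pipeline before re-enabling these lines.
#output = pd.DataFrame({'Id':X_test.index,
#                     'SalePrice': pred})
#output.to_csv('submission.csv', index=False)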