In [12]:
import pandas as pd
data_file_path = './test.csv'
data = pd.read_csv(data_file_path)
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [13]:
submission_file_path = './sample_submission.csv'
y = pd.read_csv(submission_file_path).SalePrice
X = data

In [14]:
from sklearn.model_selection import train_test_split
X_train_full, X_val_full, y_train, y_val = train_test_split(X,y)

In [15]:
numeric_col = [cname for cname in X_train_full.columns if X_train_full[cname].dtypes in ['int64', 'float64']]
category_col = [cname for cname in X_train_full.columns if X_train_full[cname].nunique()<10 and X_train_full[cname].dtypes == 'object']
full_col = numeric_col + category_col

In [16]:
X_train = X_train_full[full_col].copy()
X_val = X_val_full[full_col].copy()

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

numeric_transformer = SimpleImputer(strategy='constant')
category_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

processor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_col),
        ['cat', numeric_transformer, numeric_col]
    ]
)

In [18]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)

In [20]:
from sklearn.metrics import mean_absolute_error

my_pipline = Pipeline(steps=[
    ('processor', processor),
    ('model',model)
])
my_pipline.fit(X_train, y_train)
pred = my_pipline.predict(X_val)
print('MAE', mean_absolute_error(y_val, pred))

MAE 1438.0969035019048
