In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labelled training data and the unlabelled test data, indexed by "Id".
X_test_full = pd.read_csv("./test.csv", index_col="Id")
X_full = pd.read_csv("./train.csv", index_col="Id")

# Rows with no SalePrice cannot be used for supervised training: drop them,
# then peel the target column off so X_full contains predictors only.
X_full = X_full.dropna(axis=0, subset=["SalePrice"])
y = X_full["SalePrice"]
X_full = X_full.drop(columns=["SalePrice"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Categorical features: object-dtype columns with fewer than 10 distinct values
# in the training split (keeps the later one-hot encoding narrow).
categorical_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype == "object" and X_train_full[col].nunique() < 10
]

# Numerical features: every int64 / float64 column in the training split.
numerical_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype in ["int64", "float64"]
]

# Restrict every split to the selected columns. Each result is an independent
# copy so later edits never write back into the *_full frames.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
# Fix: .copy must be *called*; without the parentheses X_test was bound to the
# method object rather than to a DataFrame.
X_test = X_test_full[my_cols].copy()

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Numerical columns: fill missing entries using the imputer's "constant" strategy.
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill missing entries with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets the encoder cope with
# categories that were not present when it was fitted.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Bundle preprocessing and the model so both are fitted and applied together.
bundle = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# Fit the preprocessing steps and the model in one go on the training split.
bundle.fit(X_train, y_train)

# Score on the held-out validation split.
predictions = bundle.predict(X_valid)
# mean_absolute_error is documented as (y_true, y_pred); pass the ground truth
# first. MAE is symmetric, so the reported value is unchanged, but the call now
# matches the API and stays correct if sample weights are added later.
print(f"MAE is:\t{mean_absolute_error(y_valid, predictions)}")

MAE is:	17614.81993150685
