In [16]:
import numpy as np
import pandas as pd

In [17]:
# Read the training and test tables, using each file's 'Id' column as the row index.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('train.csv', 'test.csv')
)

In [18]:
# Show the first rows of the training frame as a quick check that the load worked.
X_full.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


**Remove rows with missing target, separate target from predictors**

In [19]:
# Rows without a SalePrice cannot be used for supervised training, so discard them.
X_full = X_full.dropna(subset=['SalePrice'])

# Keep the target on its own and leave only predictor columns in X_full.
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

**Split into training and validation sets**

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=7,
)

**Select low-cardinality categorical columns and numerical columns**

In [22]:
# Categorical features: text columns with fewer than 10 distinct values in the
# training split, so that one-hot encoding stays small.
#
# A column counts as text when its dtype is the classic 'object' dtype OR pandas'
# dedicated string dtype. read_csv yields the latter when `future.infer_string`
# is enabled (the default from pandas 3); checking only for 'object' would then
# leave this list empty and silently drop every categorical feature.
categorical_cols = [
    col
    for col in X_train_full.columns
    if (
        X_train_full[col].dtype == 'object'
        or isinstance(X_train_full[col].dtype, pd.StringDtype)
    )
    and X_train_full[col].nunique() < 10
]

In [23]:
# Numerical features: every column of the training split stored as int64 or float64.
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [24]:
# Features fed to the model: the selected categorical columns followed by the numerical ones.
my_cols = [*categorical_cols, *numerical_cols]

# Restrict each split to those features; .copy() gives independent frames so that
# later edits cannot touch the *_full originals.
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

# Train

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

**Preprocessing for numerical data**

In [26]:
# Numeric columns: fill missing values using the 'mean' imputation strategy.
numerical_transformer = SimpleImputer(strategy='mean')

**Preprocessing for categorical data**

In [28]:
# Categorical columns are processed in two stages:
#   1. 'imputer' - fill missing entries with the most frequent value of the column;
#   2. 'onehot'  - one-hot encode; handle_unknown='ignore' keeps categories that
#                  were not seen during fit from raising at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

**Bundle**

In [31]:
# Route each group of columns to its own transformer:
# numerical_cols -> numerical_transformer, categorical_cols -> categorical_transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

**Define model**

In [32]:
# Random forest with 100 estimators; random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=7)

**Bundle preprocessing and model code in a pipeline**

In [33]:
# Chain preprocessing and the regressor so a single fit/predict call runs both,
# applying the same preprocessing to whatever frame is passed in.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

**Fit the model, predict on the validation set, compute MAE**

In [34]:
# Fit the whole pipeline (preprocessing steps + model) on the training split only.
# As the cell's last expression, the fitted pipeline is also displayed below.
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'Bed...
                                              

In [35]:
# Predict on the held-out validation rows; the pipeline preprocesses X_valid itself.
preds = clf.predict(X_valid)

In [36]:
# Mean absolute error between the true validation targets and the predictions.
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

MAE: 16822.586952054797


# Test

In [37]:
# Predict for the rows loaded from test.csv, using the pipeline fitted on the training split.
preds_test = clf.predict(X_test)

In [40]:
# Pair every test-row Id (the frame's index) with its predicted SalePrice.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)

# index=False: Id is already a regular column, so the positional index is not written.
output.to_csv('submission_4.csv', index=False)