## Read data

In [2]:
import pandas as pd

# Load the training and test tables, using the 'Id' column as the row index.
X = pd.read_csv('./input/train.csv', index_col='Id')
X_test = pd.read_csv('./input/test.csv', index_col='Id')

# Keep only the rows that have a target value, then split the frame into the
# target series `y` and the predictor frame `X`.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

## Split data into training and validation sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## Create preprocessor

#### Get low cardinality and numeric cols

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# A column's "cardinality" is its number of distinct values.
# Categorical (object-dtype) columns with fewer than 10 distinct values are
# kept; the threshold is a convenient but arbitrary cut-off.
low_cardinality_cols = [
    col
    for col in X_train.columns
    if X_train[col].dtype == "object" and X_train[col].nunique() < 10
]

# Numeric predictors: columns stored as int64 or float64.
numeric_cols = [
    col
    for col in X_train.columns
    if X_train[col].dtype in ('int64', 'float64')
]

#### Preprocessing data

In [7]:
# Imported in this cell because Pipeline is used here, before the notebook's
# later `from sklearn.pipeline import Pipeline` cell runs; without it a fresh
# kernel (Restart & Run All) raises NameError on this cell.
from sklearn.pipeline import Pipeline

# Numeric columns: replace missing values with a constant. No fill_value is
# given, so SimpleImputer falls back to its default constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent category, then
# one-hot encode. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#### Create preprocessor

In [8]:
# Route each column group through its own transformer. Only the columns named
# in numeric_cols and low_cardinality_cols are handed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, low_cardinality_cols),
    ],
)

## Choose model

In [12]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor: many trees with a small learning rate,
# row/column subsampling and L1/L2 regularisation.
# `verbosity=0` replaces the deprecated `silent=True` flag (both suppress
# XGBoost's training messages).
model = XGBRegressor(
    verbosity=0,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.44,   # fraction of columns sampled per tree
    subsample=0.86,          # fraction of rows sampled per boosting round
    n_estimators=5000,
    reg_alpha=0.1,           # L1 regularisation
    max_depth=5,
    gamma=10,                # minimum loss reduction required to split
    n_jobs=4,
    reg_lambda=1.01,         # L2 regularisation
)

## Create pipeline

In [13]:
from sklearn.pipeline import Pipeline

# Chain the preprocessor and the regressor so that fitting and predicting
# always apply the same preprocessing before the model.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

## Evaluate the model using cross-validation

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Five shuffled folds; the fixed random_state keeps the fold assignment stable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer returns negated MAE, so flip the sign to get plain MAE per fold.
scores = -cross_val_score(
    clf,
    X_train,
    y_train,
    cv=kfold,
    scoring='neg_mean_absolute_error',
)

print(scores)
print("Average MAE score:", scores.mean())

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


[17488.54243456 13609.44537927 16576.95092147 14001.16102803
 15421.12568737]
Average MAE score: 15419.445090141366


## Predict on test data and save

#### Fit model

In [25]:
# Fit preprocessing and model together on the training split.
# NOTE(review): X_valid / y_valid are not used in the cells visible here;
# confirm whether a hold-out evaluation was intended.
clf.fit(X_train, y=y_train)

  if getattr(data, 'base', None) is not None and \


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCo

#### Predict

In [27]:
# Run the fitted pipeline (preprocessing + model) on the competition test set.
preds_test = clf.predict(X=X_test)

#### Save predictions

In [29]:
# Imported locally so this cell works on its own.
import os

# Write the predictions as two columns, Id and SalePrice, the format used for
# competition scoring.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})

# to_csv does not create missing directories, so make sure ./output exists
# first (no-op when it is already there).
os.makedirs('./output', exist_ok=True)
output.to_csv('./output/submission.csv', index=False)