## Read data

In [33]:
import pandas as pd

# Load the training and test tables, using the 'Id' column as the row index
X = pd.read_csv('./input/train.csv', index_col='Id')
X_test = pd.read_csv('./input/test.csv', index_col='Id')

# Rows without a SalePrice cannot be used for supervised training
X = X.dropna(axis=0, subset=['SalePrice'])

# Split off the target; X keeps only the predictor columns
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

## Split data into training and validation sets

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable between runs
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## Create preprocessor

#### Get low cardinality and numeric cols

In [35]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [
    col
    for col, col_dtype in X_train.dtypes.items()
    if col_dtype == "object" and X_train[col].nunique() < 10
]

# Numeric columns: those stored as 64-bit integers or 64-bit floats
numeric_cols = [
    col
    for col, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

#### Preprocessing data

In [36]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Numeric features: replace missing entries with a constant
# (no fill_value is given, so SimpleImputer's default for the strategy applies)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not present during fit
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

#### Create preprocessor

In [37]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own transformer: numeric columns go to
# the numeric imputer, low-cardinality categorical columns to the
# impute + one-hot pipeline
column_transformers = [
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, low_cardinality_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

## Choose model

[LightGBM parameter list](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [47]:
# import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Hyperparameter values for the LightGBM regressor.
# NOTE(review): only the entries explicitly forwarded to LGBMRegressor below take
# effect. 'num_leaves', 'learning_rate', 'colsample_bytree', 'reg_alpha',
# 'reg_lambda', 'num_class' and 'metric' are defined here but never passed on,
# so the model uses LightGBM's defaults for them — confirm this is intended
# before wiring them in (the recorded scores were produced without them).
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'regression_l1',
          'nthread': 3,  # worker thread count; forwarded below as n_jobs
          'num_leaves': 16,
          'learning_rate': 0.005,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 0.75,
          'subsample_freq': 1,
          'colsample_bytree': 0.66,
          'reg_alpha': 1.2,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'rmse'}

# Build the regressor from the dict so each value has a single source of truth.
model = LGBMRegressor(
    boosting_type=params['boosting_type'],
    objective=params['objective'],
    n_jobs=params['nthread'],  # sklearn-style name for the thread count
    # `silent=True` is no longer an LGBMRegressor argument in current LightGBM
    # releases; `verbose=-1` is the supported way to suppress training logs.
    verbose=-1,
    max_depth=params['max_depth'],
    max_bin=params['max_bin'],
    subsample_for_bin=params['subsample_for_bin'],
    subsample=params['subsample'],
    subsample_freq=params['subsample_freq'],
    min_split_gain=params['min_split_gain'],
    min_child_weight=params['min_child_weight'],
    min_child_samples=params['min_child_samples'],
    scale_pos_weight=params['scale_pos_weight'],
)

## Create pipeline

In [48]:
# Chain the preprocessor and the regressor so they are fitted and applied as one estimator
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

## Train model using cross-validation

In [49]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 5 shuffled folds; the fixed random_state keeps fold membership the same on every run
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# The 'neg_mean_absolute_error' scorer returns negated MAE, so flip the sign
# back to get one positive MAE value per fold
neg_mae_per_fold = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=kfold,
    scoring='neg_mean_absolute_error',
)
scores = -1 * neg_mae_per_fold

print(scores)
print("Average MAE score:", scores.mean())

[18318.07713282 14320.24845288 17001.70807777 14263.9312193
 17314.78918327]
Average MAE score: 16243.750813208087


## Tuning the model

#### Get pipeline parameter list

In [50]:
# Show every parameter name the pipeline exposes; nested steps appear in the
# `<step>__<param>` form that a parameter grid must use to address them
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'model', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__add_indicator', 'preprocessor__num__copy', 'preprocessor__num__fill_value', 'preprocessor__num__missing_values', 'preprocessor__num__strategy', 'preprocessor__num__verbose', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__imputer', 'preprocessor__cat__onehot', 'preprocessor__cat__imputer__add_indicator', 'preprocessor__cat__imputer__copy', 'preprocessor__cat__imputer__fill_value', 'preprocessor__cat__imputer__missing_values', 'preprocessor__cat__imputer__strategy', 'preprocessor__cat__imputer__verbose', 'preprocessor__cat__onehot__categorical_features', 'preprocessor__cat__onehot__categories', 'preprocessor__cat__onehot__drop', 

#### Search best hyperparameters using [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

[Parameter tuning guide](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values per pipeline parameter; the `model__` prefix addresses the
# 'model' step of the pipeline.
# NOTE: this is an exhaustive grid of 4 * 4 * 6 * 5 * 5 * 3 * 3 = 21,600
# combinations, i.e. 108,000 model fits with 5-fold CV (some with 5000 trees).
# Trim the lists before running this cell.
test_params = {
    'model__learning_rate': [0.1, 0.05, 0.01, 0.005],
    'model__n_estimators': [10, 100, 1000, 5000],
    'model__num_leaves': [2, 4, 8, 16, 32, 64],
    'model__boosting_type' : ['gbdt'],
    'model__objective' : ['regression_l1'],
    'model__random_state' : [501], # Updated from 'seed'
    'model__colsample_bytree' : [0.1, 0.2, 0.4, 0.8, 1],
    'model__subsample' : [0.1, 0.2, 0.4, 0.8, 1],
    'model__reg_alpha' : [0.5, 1, 1.5],
    'model__reg_lambda' : [0.5, 1, 1.5],
}

# Score candidates with MAE on the same shuffled folds (`kfold`) used in the
# cross-validation cell, so the search result is directly comparable to the
# baseline MAE. Without `scoring`, GridSearchCV falls back to the estimator's
# default score (R^2 for regressors); without `cv`, it uses unshuffled folds.
my_model = GridSearchCV(
    estimator=pipeline,
    param_grid=test_params,
    scoring='neg_mean_absolute_error',
    cv=kfold,
)
my_model.fit(X_train, y_train)

print(my_model.best_params_)
# best_score_ holds the negated MAE; flip the sign to report a positive error
print("Best CV MAE:", -my_model.best_score_)



## Predict on test data and save

#### Fit model

In [51]:
# Fit `pipeline` as configured above (not the GridSearchCV object `my_model`)
# on the training split only; X_valid / y_valid are not used in this step
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCo

#### Predict

In [52]:
# Predict on test data: X_test goes through the same fitted preprocessing
# steps before reaching the model, because both live in `pipeline`
preds_test = pipeline.predict(X_test)

#### Save predictions

In [53]:
from pathlib import Path

# Save predictions in format used for competition scoring:
# one row per test-set Id with its predicted SalePrice
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing
submission_path = Path('./output/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(submission_path, index=False)