<h1>
    <p style="text-align:center; font-size:180%"> House Prices - Advanced Regression Techniques <br><br> 🏡</p> 
</h1>

<h2>
    <p style="text-align:center; font-family:Verdana; letter-spacing:0.5px; font-size:120%"> Predict sales prices and practice feature engineering, RFs, and gradient boosting 
    </p>
</h2> 
    

<center>
    <img src="https://storage.googleapis.com/kaggle-competitions/kaggle/5407/media/housesbanner.png"> 
</center>

<br><br>

**With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home. Resources:**

* `train.csv` - the training set
* `test.csv` - the test set
* `data_description.txt` - full description of each column, originally prepared by Dean De Cock but lightly edited to match the column names used here
* `sample_submission.csv` - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms

<div style="display:fill;
            border-radius:10px;
            background-color:#246be3;
            font-family:Verdana;
            letter-spacing:1px;
            border: 2px solid #002a6e; 
            text-align:center;
            color:white; 
            font-size:120%">

<h2>Libraries and Settings<br> 📓</h2> </div>


In [None]:
# Data manipulation
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

# Maths
import numpy as np

# Patching sklearn
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()


# Model Building 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold

import optuna
from optuna.integration import LightGBMPruningCallback

import shap
from catboost import Pool
from catboost import CatBoostRegressor



# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pprint

# Lgbm
from lightgbm import LGBMRegressor
import lightgbm as lgb


# Settings: "ticks" theme, wide default figures, no top/right axis spines
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks",
              rc={'figure.figsize': (26, 8), **custom_params})

<div style="display:fill;
            border-radius:10px;
            background-color:#246be3;
            font-family:Verdana;
            letter-spacing:1px;
            border: 2px solid #002a6e; 
            text-align:center;
            color:white; 
            font-size:120%">

<h2>Data Loading and EDA <br> 🤓</h2> </div>


In [None]:
# Data loading
# Both competition files are read with the 'Id' column as the row index.
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', index_col = 'Id')
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col = 'Id')

# Drop Not relevant columns
def drop_nrv(df):
    """Drop the columns of ``df`` in which more than 50% of the rows are NaN.

    The share of missing values is measured against the number of rows of
    ``df`` itself, and every column is checked (not only the ten columns
    shown in the returned summary).

    NOTE(review): the decision is taken independently for each frame passed
    in, so the train and test sets can end up with different columns when a
    feature sits close to the 50% threshold in one of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The columns are removed from this object in place.

    Returns
    -------
    na_df : pandas.DataFrame
        The ten columns with the most NaN, with the columns 'Number of NaN'
        and 'Perc' (percentage formatted as a string ending in '%').
    df : pandas.DataFrame
        The cleaned frame, cast to float64 where the cast is possible.
    """
    na_counts = pd.DataFrame(df.isna().sum(),
                             columns = ['Number of NaN'])
    na_counts = na_counts.sort_values('Number of NaN',
                                      ascending = False)

    # Percentage relative to the frame being cleaned, not to a global frame
    na_counts['Perc'] = round(na_counts['Number of NaN']/len(df.index)*100, 2)

    # Drop columns that have more than 50% of NaN
    to_drop = list(na_counts[na_counts['Perc'] > 50.00].index)

    # Only the ten worst columns are reported back to the caller
    na_df = na_counts.head(10).copy()
    na_df['Perc'] = na_df['Perc'].astype(str) + '%'

    df.drop(to_drop,
            inplace = True, axis = 1)

    print(to_drop, ' columns have been removed (> 50% NaN)\n', sep = '')

    return na_df, df.astype('float64', errors = 'ignore')

separator = '-' * 80

# Clean the train set and show its ten columns with the most NaN
print('TRAIN DATA\n')
missing_val_train, train_data = drop_nrv(train_data)
print(f'{missing_val_train}\n\n{separator}\n')

# Same treatment for the test set
print('TEST DATA\n')
missing_val_test, test_data = drop_nrv(test_data)
print(f'{missing_val_test}\n\n{separator}\n')

# Shapes after the drop
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape
print(f'Train Data rows: {train_rows} \nTrain Data Columns: {train_cols}\n')
print(f'Test Data rows: {test_rows} \nTest Data Columns: {test_cols}')

In [None]:
def obj_int_identifier(train_data, test_data):
    """Split the columns of both sets into continuous and discrete features.

    A column is discrete when it has at most 25 distinct values in
    ``train_data`` and continuous otherwise. The same split is then applied
    to the columns that exist in ``test_data``. A short summary of the counts
    is printed for both sets.

    Returns
    -------
    tuple of pandas.Index
        (train continuous, train discrete, test continuous, test discrete).
    """
    unique_counts = pd.DataFrame(train_data.nunique(),
                                 columns=['Unique Values']).sort_values(by='Unique Values')
    is_discrete = unique_counts['Unique Values'] <= 25
    cat = unique_counts[is_discrete].index
    cont = unique_counts[~is_discrete].index

    def _split(frame):
        # Keep only the names that are columns of `frame`
        return frame.filter(cont).columns, frame.filter(cat).columns

    train_data_cont_var, train_data_disc_var = _split(train_data)
    test_data_cont_var, test_data_disc_var = _split(test_data)

    n_train_cont, n_train_disc = len(train_data_cont_var), len(train_data_disc_var)
    print(f'Train data total columns: {n_train_cont + n_train_disc}'
          f'\nContinuous Features: {n_train_cont}'
          f'\nDiscrete Features: {n_train_disc}\n')

    n_test_cont, n_test_disc = len(test_data_cont_var), len(test_data_disc_var)
    print(f'Test data total columns: {n_test_cont + n_test_disc}'
          f'\nContinuous Features: {n_test_cont}'
          f'\nDiscrete Features: {n_test_disc}')

    return train_data_cont_var, train_data_disc_var, test_data_cont_var, test_data_disc_var

# Continuous / discrete column names of both sets; the split itself is decided on the train set's unique-value counts
train_data_cont_var, train_data_disc_var, test_data_cont_var, test_data_disc_var = obj_int_identifier(train_data, test_data)

In [None]:
# Align Features
def discrepancies_check(train, test, train_features=None, test_features=None):
    """List the discrete features whose sets of values differ between train and test.

    Parameters
    ----------
    train, test : pandas.DataFrame
        The two sets to compare.
    train_features, test_features : iterable of str, optional
        Discrete feature names to inspect in each set. When omitted, the
        notebook-level ``train_data_disc_var`` / ``test_data_disc_var`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per feature with a discrepancy, with the columns
        'Discrepancies' (values present in only one of the sets) and
        'Where' ('Train', 'Test' or 'Both': which set holds the extra values).
        Features that are not listed for both sets are skipped.
    """
    if train_features is None:
        train_features = train_data_disc_var
    if test_features is None:
        test_features = test_data_disc_var

    def _levels(frame, features):
        # Every NaN is mapped onto the single np.nan object: two distinct float
        # NaN objects are never equal, so they would show up as a discrepancy.
        return {
            name: {np.nan if pd.isna(value) else value
                   for value in frame[name].unique().tolist()}
            for name in features
        }

    levels_train = _levels(train, train_features)
    levels_test = _levels(test, test_features)

    if levels_train.keys() == levels_test.keys():
        print('Train and Test set have the same discrete features.\nResults:')
    else:
        print('Pay attention, different discrete features in Train and Test!\n')

    differences = {}
    for name, in_train in levels_train.items():
        if name not in levels_test:
            # Nothing to compare against; the mismatch was reported above
            continue

        only_train = in_train - levels_test[name]
        only_test = levels_test[name] - in_train

        if only_train and only_test:
            where = 'Both'
        elif only_train:
            where = 'Train'
        elif only_test:
            where = 'Test'
        else:
            continue

        differences[name] = (list(only_train | only_test), where)

    final_df = pd.DataFrame(
        {'Discrepancies': [values for values, _ in differences.values()],
         'Where': [where for _, where in differences.values()]},
        index = list(differences))

    return final_df
    
# Display the discrete features whose values differ between the two sets
discrepancies_check(train_data, test_data)

<div class="alert alert-block alert-info"> 📌 Ex.1: in Train Utilities has NoSeWa, while in Test Utilities has nan.</div>

In [None]:
# Distinct values of Utilities in each set
utilities_train = train_data['Utilities'].unique()
utilities_test = test_data['Utilities'].unique()
print(f'Utilities in Train data: {utilities_train}\nUtilities in Test data: {utilities_test}')

<div class="alert alert-block alert-info"> 📌 Ex.2: in Train Electrical has Mix and nan, while in Test these observations are not present.</div>

In [None]:
# Distinct values of Electrical in each set (the labels now name the column actually printed)
print('Electrical in Train data: ', train_data['Electrical'].unique(), '\n',
      'Electrical in Test data: ', test_data['Electrical'].unique(), sep = '')

# 1.1 Train Data Visualization

In [None]:
# One histogram per continuous train feature, laid out on a 5x4 grid of panels
train_data[train_data_cont_var].hist(figsize=(26, 20), layout=(5, 4))

plt.tight_layout()
plt.show()

In [None]:
# Count plot of every discrete train feature on a 10x6 grid
fig, ax = plt.subplots(nrows=10, ncols=6, figsize=(26, 40))

for panel, variable in zip(ax.flatten(), train_data_disc_var):
    sns.countplot(x=train_data[variable], ax=panel)
    panel.figure.tight_layout()
    # Tilt the category labels so they stay readable
    plt.setp(panel.get_xticklabels(), rotation=45)

# Remove the last three panels of the bottom row
for unused in ax[9][3:]:
    fig.delaxes(unused)

In [None]:
# Pairwise scatter plots of the continuous train features
sns.pairplot(train_data[train_data_cont_var])  
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the continuous train features, annotated with one decimal
sns.heatmap(train_data[train_data_cont_var].corr(), annot=True, annot_kws={"size": 15},fmt='.1f')
plt.show()

# 1.2 Test Data Visualization

In [None]:
# One histogram per continuous test feature, laid out on a 5x4 grid of panels
test_data[test_data_cont_var].hist(figsize=(26, 20), layout=(5, 4))

plt.tight_layout()
plt.show()

In [None]:
# Count plot of every discrete TEST feature (this cell previously re-plotted the train set)
fig, ax = plt.subplots(10, 6, figsize=(26, 40))
panels = ax.flatten()

for variable, subplot in zip(test_data_disc_var, panels):
    sns.countplot(x = test_data[variable], ax=subplot)
    subplot.figure.tight_layout()
    for label in subplot.get_xticklabels():
        label.set_rotation(45)

# Remove whatever panels were left empty, instead of hard-coding their positions
for unused in panels[len(test_data_disc_var):]:
    fig.delaxes(unused)

In [None]:
# Pairwise scatter plots of the continuous test features
sns.pairplot(test_data[test_data_cont_var])  
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the continuous test features, annotated with one decimal
sns.heatmap(test_data[test_data_cont_var].corr(), annot=True, annot_kws={"size": 15},fmt='.1f')
plt.show()

# 1.3 Exploring High Correlations with `SalePrice`

In [None]:
fig, ax = plt.subplots(3, 2, figsize=(26, 20))

# (feature, position in the grid, plotting function)
panels = [
    ('OverallQual', (0, 0), sns.boxplot),
    ('OverallCond', (1, 0), sns.boxplot),
    ('YearBuilt', (0, 1), sns.scatterplot),
    ('GrLivArea', (1, 1), sns.scatterplot),
    ('TotRmsAbvGrd', (2, 0), sns.boxplot),
    ('TotalBsmtSF', (2, 1), sns.scatterplot),
]

for feature, position, plot in panels:
    # Scatter plots are coloured by overall quality; box plots take no hue
    extra = {'hue': train_data['OverallQual']} if plot is sns.scatterplot else {}
    panel = plot(x=train_data[feature],
                 y=train_data['SalePrice'],
                 ax=ax[position],
                 **extra)
    panel.set_title(f'{feature} and SalePrice', fontsize=20)

fig.show()

<div style="display:fill;
            border-radius:10px;
            background-color:#246be3;
            font-family:Verdana;
            letter-spacing:1px;
            border: 2px solid #002a6e; 
            text-align:center;
            color:white; 
            font-size:120%">

<h2>Feature Engineering<br> 🧑‍🏭</h2> </div>

# 2.1 Imputing NaN

## train_data

In [None]:
# Discrete features: an explicit 'missing' level keeps track of where values were absent
train_data[train_data_disc_var] = train_data[train_data_disc_var].fillna('missing')

# Summary of the continuous features with the most NaN, taken before filling them
miss_cont = (
    train_data[train_data_cont_var]
    .isna()
    .sum()
    .to_frame('missing')
    .sort_values('missing', ascending=False)
    .head(3)
)

# Continuous features: the column mean fills the gaps ...
for column in ('LotFrontage', 'MasVnrArea'):
    train_data[column] = train_data[column].fillna(train_data[column].mean())

# ... except GarageYrBlt, where 0 marks the rows that had no value
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(0)

miss_cont

## test_data

In [None]:
# For the discrete features it is better to keep track of where the missing values are
test_data[test_data_disc_var] = test_data[test_data_disc_var].fillna('missing')

# Summary of the continuous features with the most NaN, taken before filling them
miss_cont = pd.DataFrame(test_data[test_data_cont_var].isna().sum(), columns = ['missing']).sort_values('missing', ascending = False).head(8)

# GarageYrBlt: 0 is used in order to track where the missing values are
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(0)

# Every other continuous feature is filled with ITS OWN mean
# (five of them used to be filled with the mean of MasVnrArea).
mean_imputed = ['LotFrontage', 'MasVnrArea', 'GarageArea', 'BsmtUnfSF',
                'BsmtFinSF2', 'TotalBsmtSF', 'BsmtFinSF1']
for column in mean_imputed:
    test_data[column] = test_data[column].fillna(test_data[column].mean())

miss_cont

In [None]:
# Total number of NaN left in each set after imputation
for label, frame in (('Train', train_data), ('Test', test_data)):
    print(f'Residual NanN in {label}:', frame.isna().sum().sum())

In [None]:
# Sanity check: print the number of rows and columns of both sets at this point
print(f'Train Data rows: {train_data.shape[0]} \nTrain Data Columns: {train_data.shape[1]}\n')
print(f'Test Data rows: {test_data.shape[0]} \nTest Data Columns: {test_data.shape[1]}')

# 2.2 Transforming Skewed Distributions

## Train Data

In [None]:
# Features (and the target) whose distribution is compared before and after log1p
log_t_train = [  'ScreenPorch', 'LotFrontage', 'EnclosedPorch', 'BsmtFinSF2', 'OpenPorchSF',
                 'WoodDeckSF', 'MasVnrArea', '2ndFlrSF', 'SalePrice', '1stFlrSF',
                 'GrLivArea', 'LotArea']

# One row per feature: original distribution on the left, log1p on the right
fig, ax = plt.subplots(12, 2, figsize=(26, 80))

train_cont = train_data[train_data_cont_var]
train_cont_log = np.log1p(train_cont)

for i, col in enumerate(log_t_train):
    original_skew = round(train_cont[col].skew(), 4)
    sns.histplot(data=train_cont,
                 x=col,
                 kde=True,
                 ax=ax[i, 0]).set_title(f'Original {col}, skew: {original_skew}',
                                        fontsize=20)

    log_skew = round(train_cont_log[col].skew(), 4)
    sns.histplot(data=train_cont_log,
                 x=col,
                 kde=True,
                 ax=ax[i, 1]).set_title(f'Log transformed {col}, skew: {log_skew}',
                                        fontsize=20)
    

In [None]:
# Replace the listed columns (SalePrice included) by log1p of themselves.
# The assignment overwrites the columns, so running this cell again applies the transform a second time.
train_data[log_t_train] = np.log1p(train_data[log_t_train])

## Test Data

In [None]:
# Same features as for the train set, without the target
log_t_test = [  'ScreenPorch', 'LotFrontage', 'EnclosedPorch', 'BsmtFinSF2', 'OpenPorchSF',
                 'WoodDeckSF', 'MasVnrArea', '2ndFlrSF', '1stFlrSF',
                 'GrLivArea', 'LotArea']

# One row per feature: original distribution on the left, log1p on the right
fig, ax = plt.subplots(11, 2, figsize=(26, 80))

test_cont = test_data[test_data_cont_var]
test_cont_log = np.log1p(test_cont)

for i, col in enumerate(log_t_test):
    original_skew = round(test_cont[col].skew(), 4)
    sns.histplot(data=test_cont,
                 x=col,
                 kde=True,
                 ax=ax[i, 0]).set_title(f'Original {col}, skew: {original_skew}',
                                        fontsize=20)

    log_skew = round(test_cont_log[col].skew(), 4)
    sns.histplot(data=test_cont_log,
                 x=col,
                 kde=True,
                 ax=ax[i, 1]).set_title(f'Log transformed {col}, skew: {log_skew}',
                                        fontsize=20)

In [None]:
# Replace the listed test columns by log1p of themselves.
# The assignment overwrites the columns, so running this cell again applies the transform a second time.
test_data[log_t_test] = np.log1p(test_data[log_t_test])

In [None]:
# Sanity check after the log transform: shapes of both sets ...
print(f'Train Data rows: {train_data.shape[0]} \nTrain Data Columns: {train_data.shape[1]}\n')
print(f'Test Data rows: {test_data.shape[0]} \nTest Data Columns: {test_data.shape[1]}')

# ... and the total number of NaN left in each of them
print('\nResidual NanN in Train:',train_data.isna().sum().sum())
print('Residual NanN in Test:',test_data.isna().sum().sum())

# 2.3 Variable encoding

In [None]:
# This function labels columns and returns the modified data frame
def ptp(col, df):
    """Label-encode the columns listed in ``col`` and return the mapping used.

    Every listed column is cast to string and replaced, in ``df`` itself, by
    the integer codes of a LabelEncoder fitted on that column.

    Parameters
    ----------
    col : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame
        Frame holding the columns; it is modified in place.

    Returns
    -------
    ptp_corr : dict
        ``{column: {label: code}}`` for every encoded column.
    df : pandas.DataFrame
        The same frame, with the listed columns encoded.
    """
    le = preprocessing.LabelEncoder()
    ptp_corr = dict()

    for name in col:
        # to_numpy() replaces the deprecated Series.ravel(); the cast to str
        # is still done on the array, so missing values become the text 'nan'.
        values = df[name].to_numpy().astype('str')
        le.fit(values)

        # classes_ holds the sorted labels and the code of a label is its
        # position there, so no per-label transform() call is needed.
        ptp_corr[name] = {str(label): code for code, label in enumerate(le.classes_)}
        df[name] = le.transform(values)

    return ptp_corr, df

# Train and test are encoded TOGETHER: fitting one encoder per set gives the
# same category a different code whenever the two sets do not contain exactly
# the same values, which misaligns the dummy columns created afterwards.
n_train = len(train_data)
combined = pd.concat([train_data, test_data])
map_p2p, combined = ptp(train_data_disc_var, combined)

# Split back by position and keep each set's own columns (no SalePrice in test)
train_data_lab_enc = combined.iloc[:n_train][train_data.columns].copy()
test_data_lab_enc = combined.iloc[n_train:][test_data.columns].copy()

# Both names are kept for compatibility; they now refer to the single shared mapping
map_p2p1 = map_p2p2 = map_p2p

#Check if something went wrong with columns until now
print(f'Train Data rows: {train_data_lab_enc.shape[0]} \nTrain Data Columns: {train_data_lab_enc.shape[1]}\n')
print(f'Test Data rows: {test_data_lab_enc.shape[0]} \nTest Data Columns: {test_data_lab_enc.shape[1]}')

In [None]:
# Features Encoding
def var_encoding(train, test, columns=None):
    """One-hot encode the discrete features of train and test on a common set of dummies.

    The two sets are stacked before calling ``pd.get_dummies`` so that both
    receive exactly the same dummy columns, then split back by position.
    Neither input frame is modified.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Train set (with 'SalePrice') and test set.
    columns : list-like of str, optional
        Columns to turn into dummies. When omitted, the notebook-level
        ``train_data_disc_var`` is used.

    Returns
    -------
    new_train, new_test : pandas.DataFrame
        The encoded sets; 'SalePrice' is removed from ``new_test``.
    """
    dummy = train_data_disc_var if columns is None else columns
    n_train = len(train)

    # Solving the problem of discrepancies between Train and test
    comprehensive = pd.get_dummies(pd.concat([train, test]), columns=dummy)

    # The first n_train rows are the train rows; no helper column is needed
    new_train = comprehensive.iloc[:n_train].copy()
    new_test = comprehensive.iloc[n_train:].drop('SalePrice', axis = 1)

    if set(new_train.columns) - set(dummy) == set(new_train.columns):
        print('Original columns dropped from train_data!')
    else:
        print('Some original columns are still present in train_data, please check!')

    if set(new_test.columns) - set(dummy) == set(new_test.columns):
        print('Original columns dropped from test_data!\n')
    else:
        print('Some original columns are still present in test_data, please check!\n')

    return new_train, new_test

# One-hot encode the label-encoded sets on a common set of dummy columns
new_train, new_test = var_encoding(train_data_lab_enc, test_data_lab_enc)

# Sanity check: print the number of rows and columns of both encoded sets
print(f'Train Data rows: {new_train.shape[0]} \nTrain Data Columns: {new_train.shape[1]}\n')
print(f'Test Data rows: {new_test.shape[0]} \nTest Data Columns: {new_test.shape[1]}')

<div style="display:fill;
            border-radius:10px;
            background-color:#246be3;
            font-family:Verdana;
            letter-spacing:1px;
            border: 2px solid #002a6e; 
            text-align:center;
            color:white; 
            font-size:120%">

<h2>Model training<br> 🧑‍🏫</h2> </div>

In [None]:
# Move 'Id' out of the index, keep it aside and take it out of the frame
new_train = new_train.reset_index()
idx = new_train.pop('Id')

# Target and feature matrix
y = new_train['SalePrice']
X = new_train.drop(columns='SalePrice')


In [None]:
def scaling_feat(train_set, test_set):
    """Standardise both sets with a StandardScaler fitted on ``train_set`` only.

    The shapes are printed before and after scaling.

    Returns
    -------
    train_set, test_set : pandas.DataFrame
        Scaled copies that keep the index and the column names of the inputs.
    """
    print(f'Dimensions before scaling: \ntrain_set: {train_set.shape} \ntest_set: {test_set.shape}')

    # The statistics come from the train set and are reused for the test set
    scaler = StandardScaler().fit(train_set)

    def _scaled(frame):
        # transform() returns a bare array; restore the labels
        return pd.DataFrame(scaler.transform(frame),
                            index=frame.index,
                            columns=frame.columns)

    train_set, test_set = _scaled(train_set), _scaled(test_set)

    print(f'\nDimensions after scaling: \ntrain_set: {train_set.shape} \ntest_set: {test_set.shape}')

    return train_set, test_set

# Standardise the train features and apply the same fitted scaler to the test features
train_set, test_set = scaling_feat(X, new_test)

***
# 3.1 Catboost

In [None]:
# Hold out 10% of the scaled training rows for validation (fixed seed)
train_x, validation_x, train_y, validation_y = train_test_split(train_set, 
                                                                y, 
                                                                test_size=0.1,
                                                                random_state=1505)

# NOTE(review): train_set is a DataFrame, so both splits presumably already carry
# these column names and the two assignments look redundant; confirm before removing.
train_x.columns = train_set.columns
validation_x.columns = train_set.columns

In [None]:
# Performing a grid search to find the best combination of parameters
grid = {
    'iterations': [5000, 10000],
    'learning_rate': [0.1, 0.05, 0.025],
    'depth': [2, 3, 6],
    'l2_leaf_reg': [0.1, 0.25, 0.5],
}

final_model = CatBoostRegressor(logging_level='Silent',
                                od_type='Iter',
                                od_wait=100)

# 5-fold cross-validation, scored on the negative RMSE
gscv = GridSearchCV(estimator=final_model,
                    param_grid=grid,
                    scoring='neg_root_mean_squared_error',
                    cv=5)
gscv.fit(train_x, train_y)

# Best estimator, its score and its parameters, in that order
for attribute in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(gscv, attribute))

{'depth': 3, 'iterations': 5000, 'l2_leaf_reg': 0.25, 'learning_rate': 0.025}

In [None]:
# Cat-Boost Regressor Validation
params = gscv.best_params_

cat = CatBoostRegressor(**params,
                        random_seed = 1505)

cat_model = cat.fit(train_x,
                    train_y,
                    plot = False,
                    verbose = False)

catf_pred = cat_model.predict(validation_x)

# Root of the MSE: `squared = True` returned the MSE itself, not the RMSE the
# variable name announces. Taking the square root explicitly also avoids the
# `squared` keyword, which newer scikit-learn releases no longer accept.
catf_RMSE_score = np.sqrt(mean_squared_error(validation_y, catf_pred))
catf_RMSE_score

In [None]:
# CatBoost Pool objects for the train and validation features
# (val_p is not used further in this cell)
train_p = Pool(train_x)
val_p = Pool(validation_x)

# SHAP values of the fitted CatBoost model, computed on the train Pool
explainer = shap.TreeExplainer(cat_model) # insert your model
shap_values = explainer.shap_values(train_p) # insert your train Pool object

# Summary plot of the SHAP values against the train features
shap.initjs()
shap.summary_plot(shap_values, train_x)

In [None]:
# Test CSV Submission
# The model predicts log1p(SalePrice); expm1 brings the predictions back to prices
test_pred = np.expm1(cat_model.predict(test_set))

submission = pd.DataFrame({'Id': test_set.index,
                           'SalePrice': test_pred})
submission.to_csv('submission.csv', index=False)

submission.head()

# 3.2 LGBMRegressor alternative model with Optuna 

```{python}
def objective(trial, data = X, target = y):
    """Optuna objective: RMSE of an LGBMRegressor on a 20% hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyper-parameters are sampled.
    data, target :
        Features and target to split; they default to the notebook's X and y.

    Returns
    -------
    float
        Root mean squared error on the held-out part (to be minimised).
    """
    train_x, test_x, train_y, test_y = train_test_split(data, 
                                                        target, 
                                                        test_size=0.2,
                                                        random_state=1505)
   
    param = {
        'metric': 'rmse', 
        'random_state': 1505,
        'n_estimators': 20000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Stored in the study under the name 'min_data_per_groups'; code that
        # reads study.best_params has to map that key back to cat_smooth.
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    
    est = lgb.early_stopping(1000, first_metric_only=False, verbose=True)
    logm = lgb.log_evaluation(period=10000, show_stdv=True)
    
    model = LGBMRegressor(**param)  
    model.fit(train_x, 
              train_y,
              eval_set = [(test_x,test_y)],
              callbacks = [est, logm])
    
    preds = model.predict(test_x)
    # Explicit square root instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(test_y, preds))
    
    return rmse
```

```{python}
# Only errors are logged by Optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Minimise the value returned by `objective` over 50 trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
```

```{python}
# Best objective value of the study, followed by one line per tuned parameter
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")
```

```{python}
# study.best_params only holds the sampled values; add back the settings that
# are fixed inside `objective` (seed, number of trees, metric)
params = study.best_params   
params['random_state'] = 1505
params['n_estimators'] = 20000 
params['metric'] = 'rmse'
```

```{python}
# `objective` samples cat_smooth under the trial name 'min_data_per_groups';
# move the value to the key that is passed on to LGBMRegressor
params['cat_smooth'] = params.pop('min_data_per_groups')
```

```{python}
# Test predictions, averaged over the folds
preds = np.zeros(new_test.shape[0])

kf = KFold(n_splits=10,
           random_state=1505,
           shuffle=True)

rmse = []  # list contains rmse for each fold

est = lgb.early_stopping(500, first_metric_only=False, verbose=True)
logm = lgb.log_evaluation(period=5000, show_stdv=True)

for fold, (trn_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[test_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[test_idx]

    model = LGBMRegressor(**params)
    model.fit(X_tr,
              y_tr,
              eval_set=[(X_val, y_val)],
              callbacks=[est, logm])

    # Every fold contributes 1/n_splits of the final test prediction
    preds += model.predict(new_test) / kf.n_splits

    # Explicit square root instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(fold, rmse[-1])

```

```{python}
# Save predictions to file
# SalePrice was transformed with log1p, so its inverse is expm1 (np.exp is off by one),
# as already done for the CatBoost submission.
output = pd.DataFrame({'Id': test_set.index,
                       'SalePrice': np.expm1(preds)})

# Check format
output.head()
```

```{python}
# Write the LightGBM predictions to submission.csv, without the row index
output.to_csv('submission.csv', index=False)
```