## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt

---

## Importing data

In [2]:
y_train_path = os.path.join('data', 'preprocessed_y_train.csv')
X_train_path = os.path.join('data', 'preprocessed_X_train.csv')
X_test_path = os.path.join('data', 'preprocessed_X_test.csv')

y_train = pd.read_csv(y_train_path, index_col = 'Id')
X_train = pd.read_csv(X_train_path, index_col = 'Id')
X_test = pd.read_csv(X_test_path, index_col = 'Id')

In [3]:
y_train.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1,208500
2,181500
3,223500
4,140000
5,250000


In [4]:
X_train.head()

Unnamed: 0_level_0,LotFrontage,LotArea,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Condition1_RRAn,Condition1_infrequent_sklearn,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,HouseStyle_infrequent_sklearn
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.208034,-0.207142,0.5,0.0,0.0,0.651479,-0.5172,1.050994,0.878668,0.510015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.409895,-0.091886,0.5,0.0,0.0,-0.071836,2.179628,0.156734,-0.429577,-0.572835,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.084449,0.07348,0.5,0.333333,0.0,0.651479,-0.5172,0.984752,0.830215,0.322174,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.414011,-0.096897,0.5,0.333333,0.0,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.574676,0.375148,0.5,0.333333,0.0,1.374795,-0.5172,0.951632,0.733308,1.360826,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
X_test.head()

Unnamed: 0_level_0,LotFrontage,LotArea,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Condition1_RRAn,Condition1_infrequent_sklearn,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,HouseStyle_infrequent_sklearn
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,0.409895,0.110763,0.5,0.0,0.0,-0.795151,0.381743,-0.340077,-1.15638,-0.572835,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1462,0.45109,0.37585,0.5,0.333333,0.0,-0.071836,0.381743,-0.43944,-1.30174,0.023838,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1463,0.162723,0.332053,0.5,0.333333,0.0,-0.795151,-0.5172,0.852269,0.6364,-0.572835,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1464,0.327504,-0.054002,0.5,0.333333,0.0,-0.071836,0.381743,0.88539,0.6364,-0.46234,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1465,-1.11433,-0.552407,0.5,0.333333,0.0,1.374795,-0.5172,0.686666,0.345679,-0.572835,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


---

## Importing models

In [6]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

## 1) XGBoost

### Standard parameters

In [7]:
std_params = {'random_state': 0,
              'n_estimators': 10000,
               'max_depth': 6,
               'early_stopping_rounds': 10,
               'learning_rate': 0.05,
               'reg_lambda': 1,
               'reg_alpha': 1,
               'predictor': 'gpu_predictor'}

### Creating and fitting model

In [8]:
params = {'random_state': 0,
          'n_estimators': 10000,
          'max_depth': 6,
          'early_stopping_rounds': 10,
          'learning_rate': 0.05,
          'reg_lambda': 1,
          'reg_alpha': 1,
          'predictor': 'gpu_predictor'}

def create_xgb_model(params):
    xgb_model = XGBRegressor(random_state = params['random_state'],
                       n_estimators = params['n_estimators'], 
                       max_depth = params['max_depth'], 
                       learning_rate = params['learning_rate'],
                       early_stopping_rounds = params['early_stopping_rounds'],
                       reg_lambda = params['reg_lambda'], 
                       reg_alpha = params['reg_alpha'],
                       predictor = params['predictor'])
    return xgb_model

### Feature Selection

In [9]:
select_k_best = SelectKBest(mutual_info_regression, k=150)
X_train = pd.DataFrame(select_k_best.fit_transform(X_train, y_train), index=X_train.index, columns=select_k_best.get_feature_names_out())
X_test = pd.DataFrame(select_k_best.transform(X_test), index=X_test.index, columns=select_k_best.get_feature_names_out())

  y = column_or_1d(y, warn=True)


### Using KFold cross-validation method to analyze generatization capability

In [10]:
import warnings
warnings.filterwarnings('ignore')

eval_set = [(X_train, y_train)]

indexes = np.array(X_train.index)
rand = np.random.RandomState(params['random_state'])
rand.shuffle(indexes)
X_shuffled = X_train.loc[indexes]
y_shuffled = y_train.loc[indexes]

folds = 5
kf = KFold(n_splits = folds)
kfold_scores = []

for (train_indexes, val_indexes) in kf.split(X_train):
    X_train_kfold = X_shuffled.iloc[train_indexes].copy()
    y_train_kfold = y_shuffled.iloc[train_indexes].copy()
    X_val_kfold = X_shuffled.iloc[val_indexes].copy()
    y_val_kfold = y_shuffled.iloc[val_indexes].copy()
    
    model = create_xgb_model(params)
    model.fit(X_train_kfold, y_train_kfold, eval_metric = 'rmse', eval_set = eval_set, verbose = 0)
    y_pred = model.predict(X_val_kfold)
    kfold_scores.append(np.sqrt(mean_squared_error(np.log(y_val_kfold), np.log(y_pred))))
    
print(f'KFold final score: {sum(kfold_scores)/len(kfold_scores)}')

KFold final score: 0.12862414411914438


Important:
**Kfold final score is an estimate of the competition score.** However, it's certainly overestimated or underestimated.

---

## Exporting result to submission

In [11]:
xgb_model = create_xgb_model(params)
xgb_model.fit(X_train, y_train, eval_metric = 'rmse', eval_set = params['eval_set'])
y_pred = xgb_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index, columns = ['SalePrice'])

KeyError: 'eval_set'

In [None]:
y_pred.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,123645.34375
1462,155524.75
1463,183568.515625
1464,194779.125
1465,186486.21875


In [None]:
y_pred_path = os.path.join('data', 'xgb_predictions.csv')

y_pred.to_csv(y_pred_path, index = True)