# XGBoost regression on a single feature

## Imports

### Py Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pickle

### Datasets import

In [2]:
# Both cleaned CSVs are read the same way: the first column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./sources/clean_train.csv', './sources/clean_test.csv')
)

### Load variables

In [3]:
# NOTE: pickle.load can execute arbitrary code; only use it on a trusted,
# locally produced columns.pkl.
with open('./columns.pkl', 'rb') as col:
    all_col = pickle.load(col)

# The pickled mapping is read under the 'num_col' and 'cat_col' keys.
num_col, cat_col = all_col['num_col'], all_col['cat_col']

## Work on the dataset

### Prepare features and target

In [4]:
# Features are every column except the target; drop() returns a new frame,
# so `train` itself is left untouched.
X_train_full = train.drop(columns='SalePrice')
# Target as a pandas Series (it is re-derived as a float column vector in a later cell).
Y_train_full = train['SalePrice']

In [5]:
# Sanity check: dimensions of the full training frame (target column included).
train.shape

(1457, 221)

In [6]:
# The feature frame should have exactly one column fewer than `train` (SalePrice dropped).
X_train_full.shape

(1457, 220)

In [7]:
# The target is still a 1-D Series at this point.
Y_train_full.shape

(1457,)

In [8]:
# Helper that derives the single 'Surface' feature used by this notebook.

def set_one_feature(dataframe):
    """Add a 'Surface' column equal to GrLivArea + TotalBsmtSF.

    The column is written onto ``dataframe`` itself (the input is modified in
    place, no copy is made) and no existing column is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame providing the 'GrLivArea' and 'TotalBsmtSF' columns.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, now carrying 'Surface'.
    """
    living_area = dataframe['GrLivArea']
    basement_area = dataframe['TotalBsmtSF']
    dataframe['Surface'] = living_area + basement_area
    return dataframe

In [9]:
# Rebuild the feature frame from `train` on every run so this cell is idempotent.
# Previously X_train_full was rebound from a DataFrame to an ndarray here, so
# running the cell a second time failed because the array has no columns.
surface_frame = set_one_feature(train.drop('SalePrice', axis=1))

# Keep only 'Surface', as an independent float array shaped (n_samples, 1),
# i.e. the 2-D single-column layout used for the feature matrix.
X_train_full = np.array(surface_frame['Surface'], dtype=float).reshape(-1, 1)

In [10]:
# set_one_feature writes 'Surface' onto `test` in place and returns that same
# frame, so X_test is an alias of `test` here, not a copy.
X_test = set_one_feature(test)
type(X_test)

pandas.core.frame.DataFrame

In [11]:
# Derive the test matrix from `test` rather than from X_test itself, so the cell
# can be re-run: previously X_test was rebound from DataFrame to ndarray and a
# second execution raised AttributeError on `.Surface`.
# set_one_feature simply recomputes the same 'Surface' column on `test`.
X_test = np.array(set_one_feature(test)['Surface'], dtype=float).reshape(-1, 1)

In [12]:
# Target as an independent float array shaped (n_samples, 1); np.array copies,
# so later changes to `train` cannot leak into the target.
Y_train_full = np.array(train['SalePrice'], dtype=float).reshape(-1, 1)

In [13]:
# After the reshape the target is a column vector: (n_samples, 1).
Y_train_full.shape

(1457, 1)

In [14]:
# Single-feature design matrix: (n_samples, 1).
X_train_full.shape

(1457, 1)

## Model building

Split the train set into train and validation sets

In [15]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Find the best suited model

# The estimator to tune is built inside this cell. The original referenced
# `model`, which is only created in a later cell, so a fresh
# "Restart Kernel -> Run All" stopped here with a NameError.
# Only the settings that the grid does not vary are fixed on the base estimator.
search_estimator = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    n_estimators=100,
    n_jobs=1,
    random_state=0,
)

# Exhaustive search over the tree-shape and sampling hyperparameters.
# The former 'kernel': ('linear',) entry was removed: XGBoost has no `kernel`
# parameter (it looks carried over from an SVM grid), so the single-valued
# entry had no influence on the search.
# NOTE(review): no `scoring`/`cv` is passed, so GridSearchCV falls back to its
# defaults (the estimator's own `score` method) — confirm that is the intended criterion.
model_cv = GridSearchCV(
    search_estimator,
    param_grid={
        'min_child_weight': [4, 5],
        'gamma': [i/10.0 for i in range(3, 6)],
        'subsample': [i/10.0 for i in range(6, 11)],
        'colsample_bytree': [i/10.0 for i in range(6, 11)],
        'max_depth': [2, 3, 4],
    },
)

model_cv.fit(X_train, y_train)
model_cv.best_estimator_



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.3,
             importance_type='gain', kernel='linear', learning_rate=0.1,
             max_delta_step=0, max_depth=3, min_child_weight=4, missing=None,
             n_estimators=100, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.8, verbosity=1)

In [16]:
# Initialize a model

# Hyperparameters are spelled out explicitly so the fit does not depend on the
# defaults of whichever XGBoost release is installed.
# Removed from the original call (it was a pasted estimator repr):
#   * kernel='linear'  -> not an XGBoost parameter, it never affected the model;
#   * missing=None, nthread=None, seed=None, silent=None -> None only means
#     "unset"; nthread/seed/silent are deprecated aliases whose replacements
#     (n_jobs, random_state, verbosity) are already set below.
# NOTE(review): the grid-search output recorded in this notebook reports
# max_depth=3, while this cell uses max_depth=4 — confirm this is intentional.
model = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0.3,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=4,
    n_estimators=100,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.8,
    verbosity=1,
)

Train and evaluate the model

In [18]:
# Fit on the training split only; X_val / y_val are kept aside for evaluation.
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0.3,
             importance_type='gain', kernel='linear', learning_rate=0.1,
             max_delta_step=0, max_depth=4, min_child_weight=4, missing=None,
             n_estimators=100, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.8, verbosity=1)

In [19]:
# Predict on both splits so training and validation error can be compared.
y_train_pred = model.predict(X_train)
y_preds = model.predict(X_val)

In [20]:
# RMSE = square root of the mean squared error, computed per split.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_preds))

# Report training error first, then validation error.
for template, score in (("Train RMSE: %f", train_rmse), ("Val RMSE: %f", val_rmse)):
    print(template % score)

Train RMSE: 0.220178
Val RMSE: 0.238701


Make predictions

In [21]:
# Predict on the test set using the single-column 'Surface' matrix built earlier in the notebook.
y_test_pred = model.predict(X_test)